diff --git a/docs/configurationChoices.md b/docs/configurationChoices.md
index 9244fab4..e4d61e51 100644
--- a/docs/configurationChoices.md
+++ b/docs/configurationChoices.md
@@ -45,6 +45,16 @@ components is not currently supported:
   better management of the database and decouples its lifecycle from
   that of the OpenWhisk deployment.
 - The event providers: alarmprovider and kafkaprovider.
+### OpenWhisk Scheduler
+
+By default, the scheduler is enabled. To disable the scheduler, add the following
+to your `mycluster.yaml`:
+
+```yaml
+scheduler:
+  enable: false
+```
+
 ### Using an external database
 
 You may want to use an external CouchDB or Cloudant instance instead
@@ -180,6 +190,8 @@ k8s:
     enabled: false
 ```
 
+Currently, etcd persistence is not supported.
+
 ### Selectively Deploying Event Providers
 
 The default settings of the Helm chart will deploy OpenWhisk's alarm
diff --git a/docs/k8s-custom-build-cluster-scaleup.md b/docs/k8s-custom-build-cluster-scaleup.md
index 36327949..62040b47 100644
--- a/docs/k8s-custom-build-cluster-scaleup.md
+++ b/docs/k8s-custom-build-cluster-scaleup.md
@@ -40,7 +40,7 @@ Modifying the above mentioned parameters, one can easily increase the concurrenc
 In order to further increase the scale-up beyond `Small Scale`, one needs to modify the following additional configurations appropriately (on top of the above mentioned):
 * `invoker:jvmHeapMB`: jvmHeap memory available to each invoker instance. May or may not require increase based on running functions. For more information check `troubleshooting` below.
 * `invoker:containerFactory:_:replicaCount`: number of invoker instances that will be used to handle the incoming workload. By default, there is only one invoker instance which can become overwhelmed if workload goes beyond a certain threshold.
-* `controller:replicaCount`: number of controller instances that will be used to handle the incoming workload. Same as invoker instances.
+* `controller:replicaCount`: number of controller instances that will be used to handle the incoming workload. Same as invoker and scheduler instances.
 * `invoker:options`: Log processing at the invoker can become a bottleneck for the KubernetesContainerFactory. One might try disabling invoker log processing by setting it to `-Dwhisk.spi.LogStoreProvider=org.apache.openwhisk.core.containerpool.logging.LogDriverLogStoreProvider`. In general, one needs to offload log processing from the invoker to a node-level log store provider if one is trying to push a large load through the system.
 
 ## Troubleshooting
diff --git a/docs/k8s-kind.md b/docs/k8s-kind.md
index 769a36c6..875623f8 100644
--- a/docs/k8s-kind.md
+++ b/docs/k8s-kind.md
@@ -94,8 +94,8 @@ OpenWhisk apihost property to be set to localhost:31001
 ## Hints and Tips
 
 If you are working on the core OpenWhisk system and want
-to use a locally built controller or invoker image to test
-your changes, you need to push the image to the docker image
+to use a locally built controller, invoker, or scheduler image
+to test your changes, you need to push the image to the docker image
 repository inside the `kind` cluster.
 
 For example, suppose I had a local change to the controller
diff --git a/helm/openwhisk/templates/_helpers.tpl b/helm/openwhisk/templates/_helpers.tpl
index 847ef8e2..2275e290 100644
--- a/helm/openwhisk/templates/_helpers.tpl
+++ b/helm/openwhisk/templates/_helpers.tpl
@@ -42,6 +42,11 @@ app: {{ template "openwhisk.fullname" . }}
 {{ .Release.Name }}-controller.{{ .Release.Namespace }}.svc.{{ .Values.k8s.domain }}
 {{- end -}}
 
+{{/* hostname for scheduler */}}
+{{- define "openwhisk.scheduler_host" -}}
+{{ .Release.Name }}-scheduler.{{ .Release.Namespace }}.svc.{{ .Values.k8s.domain }}
+{{- end -}}
+
 {{/* hostname for database */}}
 {{- define "openwhisk.db_host" -}}
 {{- if .Values.db.external -}}
@@ -68,6 +73,15 @@ app: {{ template "openwhisk.fullname" . }}
 {{- end -}}
 {{- end -}}
 
+{{/* hostname for etcd */}}
+{{- define "openwhisk.etcd_host" -}}
+{{- if .Values.etcd.external -}}
+{{ .Values.etcd.host }}
+{{- else -}}
+{{ .Release.Name }}-etcd.{{ .Release.Namespace }}.svc.{{ .Values.k8s.domain }}
+{{- end -}}
+{{- end -}}
+
 {{/* client connection string for zookeeper cluster (server1:port,server2:port, ... serverN:port)*/}}
 {{- define "openwhisk.zookeeper_connect" -}}
 {{- if .Values.zookeeper.external -}}
@@ -224,12 +238,20 @@ app: {{ template "openwhisk.fullname" . }}
   value: {{ .Values.whisk.kafka.topics.health.retentionMs | quote }}
 - name: "CONFIG_whisk_kafka_topics_health_segmentBytes"
   value: {{ .Values.whisk.kafka.topics.health.segmentBytes | quote }}
+
 - name: "CONFIG_whisk_kafka_topics_invoker_retentionBytes"
   value: {{ .Values.whisk.kafka.topics.invoker.retentionBytes | quote }}
 - name: "CONFIG_whisk_kafka_topics_invoker_retentionMs"
   value: {{ .Values.whisk.kafka.topics.invoker.retentionMs | quote }}
 - name: "CONFIG_whisk_kafka_topics_invoker_segmentBytes"
   value: {{ .Values.whisk.kafka.topics.invoker.segmentBytes | quote }}
+
+- name: "CONFIG_whisk_kafka_topics_scheduler_retentionBytes"
+  value: {{ .Values.whisk.kafka.topics.scheduler.retentionBytes | quote }}
+- name: "CONFIG_whisk_kafka_topics_scheduler_retentionMs"
+  value: {{ .Values.whisk.kafka.topics.scheduler.retentionMs | quote }}
+- name: "CONFIG_whisk_kafka_topics_scheduler_segmentBytes"
+  value: {{ .Values.whisk.kafka.topics.scheduler.segmentBytes | quote }}
 {{- end -}}
 
 {{/* tlssecretname for ingress */}}
diff --git a/helm/openwhisk/templates/_readiness.tpl b/helm/openwhisk/templates/_readiness.tpl
index da6a0fec..17f4983a 100644
--- a/helm/openwhisk/templates/_readiness.tpl
+++ b/helm/openwhisk/templates/_readiness.tpl
@@ -57,6 +57,17 @@
   command: ["sh", "-c", "result=1; until [ $result -eq 0 ]; do echo 'Checking controller readiness'; wget -T 5 --spider $READINESS_URL; result=$?; sleep 1; done; echo 'Success: controller is ready'"]
 {{- end -}}
 
+{{/* Init container that waits for scheduler to be ready */}}
+{{- define "openwhisk.readiness.waitForScheduler" -}}
+- name: "wait-for-scheduler"
+  image: "{{- .Values.docker.registry.name -}}{{- .Values.busybox.imageName -}}:{{- .Values.busybox.imageTag -}}"
+  imagePullPolicy: "IfNotPresent"
+  env:
+    - name: "READINESS_URL"
+      value: http://{{ include "openwhisk.scheduler_host" . }}:{{ .Values.scheduler.port }}/ping
+  command: ["sh", "-c", "result=1; until [ $result -eq 0 ]; do echo 'Checking scheduler readiness'; wget -T 5 --spider $READINESS_URL; result=$?; sleep 1; done; echo 'Success: scheduler is ready'"]
+{{- end -}}
+
 {{/* Init container that waits for at least 1 healthy invoker */}}
 {{- define "openwhisk.readiness.waitForHealthyInvoker" -}}
 - name: "wait-for-healthy-invoker"
diff --git a/helm/openwhisk/templates/akka_rolebind.yaml b/helm/openwhisk/templates/akka_rolebind.yaml
new file mode 100644
index 00000000..2a6e48eb
--- /dev/null
+++ b/helm/openwhisk/templates/akka_rolebind.yaml
@@ -0,0 +1,44 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This file is needed for Akka Cluster Bootstrapping
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: pod-reader
+rules:
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "watch", "list"]
+
+# Core pods are allowed to get information about other pods
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: {{ .Release.Name }}-read-pods
+  labels:
+{{ include "openwhisk.label_boilerplate" . | indent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: pod-reader
+subjects:
+  - kind: ServiceAccount
+    name: {{ .Release.Name }}-core
+    namespace: {{ .Release.Namespace | quote }}
diff --git a/helm/openwhisk/templates/controller-pod.yaml b/helm/openwhisk/templates/controller-pod.yaml
index 66813700..f6326137 100644
--- a/helm/openwhisk/templates/controller-pod.yaml
+++ b/helm/openwhisk/templates/controller-pod.yaml
@@ -60,6 +60,7 @@ spec:
 {{- if not .Values.controller.lean }}
       # The controller must wait for kafka and/or couchdb to be ready before it starts
 {{ include "openwhisk.readiness.waitForKafka" . | indent 6 }}
+      # Note: we may need to wait for etcd
 {{- end }}
 {{ include "openwhisk.readiness.waitForCouchDB" . | indent 6 }}
 {{- if eq .Values.activationStoreBackend "ElasticSearch" }}
@@ -85,7 +86,7 @@ spec:
         - name: controller
          containerPort: {{ .Values.controller.port }}
         - name: akka-remoting
-          containerPort: 2552
+          containerPort: 25520
         - name: akka-mgmt-http
           containerPort: 19999
 {{- if .Values.controller.lean }}
@@ -114,6 +115,11 @@ spec:
         - name: "TZ"
           value: {{ .Values.docker.timezone | quote }}
 
+        - name: "POD_IP"
+          valueFrom:
+            fieldRef:
+              fieldPath: status.podIP
+
         - name: "CONFIG_whisk_info_date"
           valueFrom:
             configMapKeyRef:
@@ -137,6 +143,15 @@ spec:
         - name: "RUNTIMES_MANIFEST"
           value: {{ template "openwhisk.runtimes_manifest" . }}
 
+        # scheduler settings
+{{ if .Values.scheduler.enable }}
+        - name: "CONFIG_whisk_spi_LoadBalancerProvider"
+          value: "org.apache.openwhisk.core.loadBalancer.FPCPoolBalancer"
+
+        - name: "CONFIG_whisk_spi_EntitlementSpiProvider"
+          value: "org.apache.openwhisk.core.entitlement.FPCEntitlementProvider"
+{{ end }}
+
         # Action limits
 {{ include "openwhisk.limitsEnvVars" . | indent 8 }}
 
@@ -151,11 +166,22 @@ spec:
           value: "{{ include "openwhisk.kafka_connect" . }}"
 {{ include "openwhisk.kafkaConfigEnvVars" . | indent 8 }}
 
+        # etcd properties
+        - name: "CONFIG_whisk_etcd_hosts"
+          value: {{ include "openwhisk.etcd_host" . }}:{{ .Values.etcd.port }}
+
+        - name: "CONFIG_whisk_etcd_lease_timeout"
+          value: {{ .Values.etcd.lease_timeout | quote }}
+
+        - name: "CONFIG_whisk_etcd_pool_threads"
+          value: {{ .Values.etcd.pool_threads | quote }}
+
         # properties for DB connection
 {{ include "openwhisk.dbEnvVars" . | indent 8 }}
 
         - name: "CONTROLLER_INSTANCES"
           value: {{ .Values.controller.replicaCount | quote }}
+
 {{- if gt (int .Values.controller.replicaCount) 1 }}
         - name: "CONFIG_whisk_cluster_useClusterBootstrap"
           value: "true"
@@ -169,7 +195,11 @@ spec:
           value: "name={{ .Release.Name }}-controller"
         - name: "CONFIG_akka_discovery_kubernetesApi_podPortName"
           value: "akka-mgmt-http"
+{{- else }}
+        - name: "CONFIG_akka_cluster_seedNodes_0"
+          value: "akka://controller-actor-system@$(POD_IP):25520"
 {{- end }}
+
 {{- if .Values.metrics.prometheusEnabled }}
         - name: "OPENWHISK_ENCODED_CONFIG"
           value: {{ template "openwhisk.whiskconfig" . }}
diff --git a/helm/openwhisk/templates/etcd-pod.yaml b/helm/openwhisk/templates/etcd-pod.yaml
new file mode 100644
index 00000000..cb905cf6
--- /dev/null
+++ b/helm/openwhisk/templates/etcd-pod.yaml
@@ -0,0 +1,120 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+{{ if not .Values.etcd.external }}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ .Release.Name }}-etcd
+  labels:
+    name: {{ .Release.Name }}-etcd
+{{ include "openwhisk.label_boilerplate" . | indent 4 }}
+spec:
+  replicas: {{ .Values.etcd.replicaCount }}
+  selector:
+    matchLabels:
+      name: {{ .Release.Name }}-etcd
+  {{- if .Values.k8s.persistence.enabled }}
+  strategy:
+    type: "Recreate"
+  {{- end }}
+  template:
+    metadata:
+      labels:
+        name: {{ .Release.Name }}-etcd
+{{ include "openwhisk.label_boilerplate" . | indent 8 }}
+    spec:
+      restartPolicy: {{ .Values.etcd.restartPolicy }}
+
+      {{- if .Values.affinity.enabled }}
+      affinity:
+{{ include "openwhisk.affinity.core" . | indent 8 }}
+{{ include "openwhisk.affinity.selfAntiAffinity" ( printf "%s-etcd" .Release.Name | quote ) | indent 8 }}
+      {{- end }}
+
+      {{- if .Values.toleration.enabled }}
+      tolerations:
+{{ include "openwhisk.toleration.core" . | indent 8 }}
+      {{- end }}
+
+{{- if .Values.k8s.persistence.enabled }}
+      volumes:
+      - name: etcd-data
+        persistentVolumeClaim:
+          claimName: {{ .Release.Name }}-etcd-pvc
+{{- end }}
+
+{{- if .Values.k8s.persistence.enabled }}
+      initContainers:
+      - name: etcd-init
+        image: "{{- .Values.docker.registry.name -}}{{- .Values.busybox.imageName -}}:{{- .Values.busybox.imageTag -}}"
+        command:
+        - chown
+        - -v
+        - -R
+        - 999:999
+        - /data
+        volumeMounts:
+        - mountPath: /data
+          name: etcd-data
+          readOnly: false
+{{- end }}
+{{ include "openwhisk.docker.imagePullSecrets" . | indent 6 }}
+      # TODO: current command will always restart from scratch.
+      containers:
+      - name: etcd
+        image: "{{- .Values.docker.registry.name -}}{{- .Values.etcd.imageName -}}:{{- .Values.etcd.imageTag -}}"
+        command:
+        - /usr/local/bin/etcd
+        - --data-dir=/data
+        - --name
+        - etcd0
+        - --initial-advertise-peer-urls
+        - http://127.0.0.1:2480
+        - --advertise-client-urls
+        - http://0.0.0.0:2379
+        - --listen-peer-urls
+        - http://127.0.0.1:2480
+        - --listen-client-urls
+        - http://0.0.0.0:2379
+        - --initial-cluster
+        - etcd0=http://127.0.0.1:2480
+        - --initial-cluster-state
+        - new
+        - --initial-cluster-token
+        - openwhisk-etcd-token
+        - --quota-backend-bytes
+        - "0"
+        - --snapshot-count
+        - "100000"
+        - --auto-compaction-retention
+        - "1"
+        - --auto-compaction-mode
+        - periodic
+        - --log-level
+        - info
+        imagePullPolicy: {{ .Values.etcd.imagePullPolicy | quote }}
+{{- if .Values.k8s.persistence.enabled }}
+        volumeMounts:
+        - mountPath: /data
+          name: etcd-data
+          readOnly: false
+{{- end }}
+        ports:
+        - name: etcd
+          containerPort: {{ .Values.etcd.port }}
+{{ end }}
diff --git a/helm/openwhisk/templates/etcd-pvc.yaml b/helm/openwhisk/templates/etcd-pvc.yaml
new file mode 100644
index 00000000..860360c1
--- /dev/null
+++ b/helm/openwhisk/templates/etcd-pvc.yaml
@@ -0,0 +1,34 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+{{- if and (not .Values.etcd.external) .Values.k8s.persistence.enabled }}
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ .Release.Name }}-etcd-pvc
+  labels:
+{{ include "openwhisk.label_boilerplate" . | indent 4 }}
+spec:
+{{- if not .Values.k8s.persistence.hasDefaultStorageClass }}
+  storageClassName: {{ .Values.k8s.persistence.explicitStorageClass }}
+{{- end }}
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: {{ .Values.etcd.persistence.size }}
+{{- end }}
diff --git a/helm/openwhisk/templates/etcd-svc.yaml b/helm/openwhisk/templates/etcd-svc.yaml
new file mode 100644
index 00000000..0765c979
--- /dev/null
+++ b/helm/openwhisk/templates/etcd-svc.yaml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+{{ if not .Values.etcd.external }}
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ .Release.Name }}-etcd
+  labels:
+    name: {{ .Release.Name }}-etcd
+{{ include "openwhisk.label_boilerplate" . | indent 4 }}
+spec:
+  selector:
+    name: {{ .Release.Name }}-etcd
+  ports:
+    - port: {{ .Values.etcd.port }}
+      name: etcd
+{{ end }}
diff --git a/helm/openwhisk/templates/install-packages-job.yaml b/helm/openwhisk/templates/install-packages-job.yaml
index fc7cb304..5f747d8c 100644
--- a/helm/openwhisk/templates/install-packages-job.yaml
+++ b/helm/openwhisk/templates/install-packages-job.yaml
@@ -41,6 +41,9 @@ spec:
 {{ include "openwhisk.readiness.waitForController" . | indent 6 }}
 {{- else }}
 {{ include "openwhisk.readiness.waitForHealthyInvoker" . | indent 6 }}
+{{- if .Values.scheduler.enable }}
+{{ include "openwhisk.readiness.waitForScheduler" . | indent 6 }}
+{{ end }}
 {{ end }}
 {{ include "openwhisk.docker.imagePullSecrets" . | indent 6 }}
       containers:
diff --git a/helm/openwhisk/templates/invoker-pod.yaml b/helm/openwhisk/templates/invoker-pod.yaml
index dbb80289..7449625b 100644
--- a/helm/openwhisk/templates/invoker-pod.yaml
+++ b/helm/openwhisk/templates/invoker-pod.yaml
@@ -76,7 +76,7 @@ spec:
       # Pull images for all default runtimes before starting invoker
 {{ include "openwhisk.docker_pull_runtimes" . | indent 6 }}
 {{- end }}
-      # Wait for a controller to be up (which implies kafka, zookeeper, couchdb are all up as well).
+      # Wait for a controller to be up (which implies kafka, zookeeper, couchdb, etcd are all up as well).
 {{ include "openwhisk.readiness.waitForController" . | indent 6 }}
 {{ include "openwhisk.docker.imagePullSecrets" . | indent 6 }}
       containers:
@@ -140,6 +140,21 @@ spec:
           - name: "JAVA_OPTS"
             value: "-Xmx{{- .Values.invoker.jvmHeapMB -}}M {{ .Values.invoker.jvmOptions }}"
 
+{{- if .Values.scheduler.enable }}
+          # Options only needed when new scheduler is enabled
+          - name: "CONFIG_whisk_scheduler_dataManagementService_retryInterval"
+            value: "{{ .Values.scheduler.dataManagementService.retryInterval }}"
+
+          - name: "CONFIG_whisk_spi_InvokerProvider"
+            value: "org.apache.openwhisk.core.invoker.FPCInvokerReactive"
+
+          - name: "CONFIG_whisk_spi_InvokerServerProvider"
+            value: "org.apache.openwhisk.core.invoker.FPCInvokerServer"
+
+          - name: "CONFIG_whisk_invoker_containerCreation_maxPeek"
+            value: "500"
+{{- end }}
+
           # Invoker options
           - name: "INVOKER_OPTS"
             value: "{{ .Values.invoker.options }} {{ include "openwhisk.invoker.add_opts" . }}"
@@ -170,6 +185,16 @@ spec:
             value: "{{ include "openwhisk.kafka_connect" . }}"
 {{ include "openwhisk.kafkaConfigEnvVars" . | indent 10 }}
 
+          # etcd properties
+          - name: "CONFIG_whisk_etcd_hosts"
+            value: {{ include "openwhisk.etcd_host" . }}:{{ .Values.etcd.port }}
+
+          - name: "CONFIG_whisk_etcd_lease_timeout"
+            value: {{ .Values.etcd.lease_timeout | quote }}
+
+          - name: "CONFIG_whisk_etcd_pool_threads"
+            value: {{ .Values.etcd.pool_threads | quote }}
+
           # properties for zookeeper connection
           - name: "ZOOKEEPER_HOSTS"
             value: "{{ include "openwhisk.zookeeper_connect" . }}"
diff --git a/helm/openwhisk/templates/scheduler-pdb.yaml b/helm/openwhisk/templates/scheduler-pdb.yaml
new file mode 100644
index 00000000..56790e1d
--- /dev/null
+++ b/helm/openwhisk/templates/scheduler-pdb.yaml
@@ -0,0 +1,34 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+{{- if .Values.scheduler.enable }}
+{{- if and .Values.pdb.enable (gt (int .Values.scheduler.replicaCount) 1) }}
+---
+apiVersion: policy/v1beta1
+kind: PodDisruptionBudget
+metadata:
+  name: {{ .Release.Name }}-scheduler-pdb
+  labels:
+    name: {{ .Release.Name }}-scheduler-pdb
+{{ include "openwhisk.label_boilerplate" . | indent 4 }}
+spec:
+  selector:
+    matchLabels:
+      name: {{ .Release.Name }}-scheduler
+  maxUnavailable: {{ .Values.pdb.scheduler.maxUnavailable | default 1 }}
+{{- end }}
+{{- end }}
diff --git a/helm/openwhisk/templates/scheduler-pod.yaml b/helm/openwhisk/templates/scheduler-pod.yaml
new file mode 100644
index 00000000..7fbe4dd8
--- /dev/null
+++ b/helm/openwhisk/templates/scheduler-pod.yaml
@@ -0,0 +1,273 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+{{- if .Values.scheduler.enable }}
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: {{ .Release.Name }}-scheduler
+  labels:
+    name: {{ .Release.Name }}-scheduler
+{{ include "openwhisk.label_boilerplate" . | indent 4 }}
+spec:
+  serviceName: {{ .Release.Name }}-scheduler
+  podManagementPolicy: "Parallel"
+  replicas: {{ .Values.scheduler.replicaCount }}
+  selector:
+    matchLabels:
+      name: {{ .Release.Name }}-scheduler
+  template:
+    metadata:
+      labels:
+        name: {{ .Release.Name }}-scheduler
+{{ include "openwhisk.label_boilerplate" . | indent 8 }}
+
+      {{- if .Values.metrics.prometheusEnabled }}
+      annotations:
+        prometheus.io/scrape: 'true'
+        prometheus.io/port: '{{ .Values.scheduler.port }}'
+      {{- end }}
+
+    spec:
+      serviceAccountName: {{ .Release.Name }}-core
+      restartPolicy: {{ .Values.scheduler.restartPolicy }}
+
+      {{- if .Values.affinity.enabled }}
+      affinity:
+{{ include "openwhisk.affinity.core" . | indent 8 }}
+{{ include "openwhisk.affinity.selfAntiAffinity" ( printf "%s-scheduler" .Release.Name ) | indent 8 }}
+      {{- end }}
+
+      {{- if .Values.toleration.enabled }}
+      tolerations:
+{{ include "openwhisk.toleration.core" . | indent 8 }}
+      {{- end }}
+
+      initContainers:
+      # The scheduler must wait for kafka and/or couchdb to be ready before it starts
+{{ include "openwhisk.readiness.waitForController" . | indent 6 }}
+{{ include "openwhisk.docker.imagePullSecrets" . | indent 6 }}
+
+      containers:
+      - name: scheduler
+        imagePullPolicy: {{ .Values.scheduler.imagePullPolicy | quote }}
+        image: "{{- .Values.docker.registry.name -}}{{- .Values.scheduler.imageName -}}:{{- .Values.scheduler.imageTag -}}"
+        command: ["/bin/bash", "-c", "/init.sh `hostname | awk -F '-' '{print $NF}'`"]
+        ports:
+        - name: scheduler
+          containerPort: {{ .Values.scheduler.port }}
+        - name: rpc
+          containerPort: {{ .Values.scheduler.rpcPort }}
+        - name: akka-remoting
+          containerPort: 25520
+        - name: akka-mgmt-http
+          containerPort: 19999
+        livenessProbe:
+          httpGet:
+            path: "/ping"
+            port: {{ .Values.scheduler.port }}
+            scheme: "HTTP"
+          initialDelaySeconds: {{ .Values.probes.scheduler.livenessProbe.initialDelaySeconds }}
+          periodSeconds: {{ .Values.probes.scheduler.livenessProbe.periodSeconds }}
+          timeoutSeconds: {{ .Values.probes.scheduler.livenessProbe.timeoutSeconds }}
+        readinessProbe:
+          httpGet:
+            path: "/ping"
+            port: {{ .Values.scheduler.port }}
+            scheme: "HTTP"
+          initialDelaySeconds: {{ .Values.probes.scheduler.readinessProbe.initialDelaySeconds }}
+          periodSeconds: {{ .Values.probes.scheduler.readinessProbe.periodSeconds }}
+          timeoutSeconds: {{ .Values.probes.scheduler.readinessProbe.timeoutSeconds }}
+        env:
+        - name: "PORT"
+          value: {{ .Values.scheduler.port | quote }}
+
+        - name: "POD_IP"
+          valueFrom:
+            fieldRef:
+              fieldPath: status.podIP
+
+        - name: "WHISK_SCHEDULER_ENDPOINTS_RPCPORT"
+          value: {{ .Values.scheduler.rpcPort | quote }}
+
+        - name: "WHISK_SCHEDULER_ENDPOINTS_AKKAPORT"
+          value: "25520"
+
+        - name: "TZ"
+          value: {{ .Values.docker.timezone | quote }}
+
+        - name: "CONFIG_whisk_info_date"
+          valueFrom:
+            configMapKeyRef:
+              name: {{ .Release.Name }}-whisk.config
+              key: whisk_info_date
+        - name: "CONFIG_whisk_info_buildNo"
+          valueFrom:
+            configMapKeyRef:
+              name: {{ .Release.Name }}-whisk.config
+              key: whisk_info_buildNo
+
+        # Java options
+        - name: "JAVA_OPTS"
+          value: "-Xmx{{- .Values.scheduler.jvmHeapMB -}}M {{ .Values.scheduler.jvmOptions }}"
+
+        # specific scheduler arguments
+        - name: "SCHEDULER_OPTS"
+          value: "{{ .Values.scheduler.options }}"
+
+        # Action limits
+{{ include "openwhisk.limitsEnvVars" . | indent 8 }}
+
+        - name: "CONFIG_whisk_durationChecker_timeWindow"
+          value: {{ .Values.whisk.durationChecker.timeWindow }}
+
+        - name: "CONFIG_whisk_loadbalancer_blackboxFraction"
+          value: {{ .Values.whisk.loadbalancer.blackboxFraction | quote }}
+
+        - name: "CONFIG_whisk_loadbalancer_timeoutFactor"
+          value: {{ .Values.whisk.loadbalancer.timeoutFactor | quote }}
+
+        # Kafka properties
+        - name: "KAFKA_HOSTS"
+          value: "{{ include "openwhisk.kafka_connect" . }}"
+{{ include "openwhisk.kafkaConfigEnvVars" . | indent 8 }}
+
+        # etcd properties
+        - name: "CONFIG_whisk_etcd_hosts"
+          value: {{ include "openwhisk.etcd_host" . }}:{{ .Values.etcd.port }}
+
+        - name: "CONFIG_whisk_etcd_lease_timeout"
+          value: {{ .Values.etcd.lease_timeout | quote }}
+
+        - name: "CONFIG_whisk_etcd_pool_threads"
+          value: {{ .Values.etcd.pool_threads | quote }}
+
+        # properties for zookeeper connection
+        - name: "ZOOKEEPER_HOSTS"
+          value: "{{ include "openwhisk.zookeeper_connect" . }}"
+
+        # action runtimes
+        - name: "RUNTIMES_MANIFEST"
+          value: {{ template "openwhisk.runtimes_manifest" . }}
+
+        # properties for DB connection
+{{ include "openwhisk.dbEnvVars" . | indent 8 }}
+
+        - name: "WHISK_SCHEDULER_ENDPOINTS_HOST"
+          valueFrom:
+            fieldRef:
+              fieldPath: status.podIP
+
+        - name: "CONFIG_whisk_scheduler_dataManagementService_retryInterval"
+          value: {{ .Values.scheduler.dataManagementService.retryInterval | quote }}
+
+        - name: "CONFIG_whisk_scheduler_queueManager_maxSchedulingTime"
+          value: {{ .Values.scheduler.queueManager.maxSchedulingTime | quote }}
+
+        - name: "CONFIG_whisk_scheduler_queueManager_maxRetriesToGetQueue"
+          value: {{ .Values.scheduler.queueManager.maxRetriesToGetQueue | quote }}
+
+        - name: "CONFIG_whisk_scheduler_queue_idleGrace"
+          value: {{ .Values.scheduler.queue.idleGrace | quote }}
+
+        - name: "CONFIG_whisk_scheduler_queue_stopGrace"
+          value: {{ .Values.scheduler.queue.stopGrace | quote }}
+
+        - name: "CONFIG_whisk_scheduler_queue_flushGrace"
+          value: {{ .Values.scheduler.queue.flushGrace | quote }}
+
+        - name: "CONFIG_whisk_scheduler_queue_gracefulShutdownTimeout"
+          value: {{ .Values.scheduler.queue.gracefulShutdownTimeout | quote }}
+
+        - name: "CONFIG_whisk_scheduler_queue_maxRetentionSize"
+          value: {{ .Values.scheduler.queue.maxRetentionSize | quote }}
+
+        - name: "CONFIG_whisk_scheduler_queue_maxRetentionMs"
+          value: {{ .Values.scheduler.queue.maxRetentionMs | quote }}
+
+        - name: "CONFIG_whisk_scheduler_queue_throttlingFraction"
+          value: {{ .Values.scheduler.queue.throttlingFraction | quote }}
+
+        - name: "CONFIG_whisk_scheduler_queue_durationBufferSize"
+          value: {{ .Values.scheduler.queue.durationBufferSize | quote }}
+
+        - name: "CONFIG_whisk_scheduler_protocol"
+          value: {{ .Values.scheduler.protocol | quote }}
+
+        - name: "CONFIG_whisk_scheduler_maxPeek"
+          value: {{ .Values.scheduler.maxPeek | quote }}
+
+        - name: "CONFIG_whisk_scheduler_inProgressJobRetention"
+          value: {{ .Values.scheduler.inProgressJobRetention | quote }}
+
+        - name: "CONFIG_whisk_scheduler_blackboxMultiple"
+          value: {{ .Values.scheduler.blackboxMultiple | quote }}
+
+        - name: "SCHEDULER_INSTANCES"
+          value: {{ .Values.scheduler.replicaCount | quote }}
+
+{{- if gt (int .Values.scheduler.replicaCount) 1 }}
+        - name: "CONFIG_whisk_cluster_useClusterBootstrap"
+          value: "true"
+        - name: "CONFIG_akka_actor_provider"
+          value: "cluster"
+        - name: "CONFIG_akka_management_cluster_bootstrap_contactPointDiscovery_discoveryMethod"
+          value: "kubernetes-api"
+        - name: "CONFIG_akka_coordinatedShutdown_exitJvm"
+          value: "on"
+        - name: "CONFIG_akka_discovery_kubernetesApi_podNamespace"
+          value: {{ .Release.Namespace | quote }}
+        - name: "CONFIG_akka_discovery_kubernetesApi_podLabelSelector"
+          value: "name={{ .Release.Name }}-scheduler"
+{{- else }}
+        - name: "CONFIG_akka_cluster_seedNodes_0"
+          value: "akka://scheduler-actor-system@$(POD_IP):25520"
+{{- end }}
+
+{{- if .Values.metrics.prometheusEnabled }}
+        - name: "OPENWHISK_ENCODED_CONFIG"
+          value: {{ template "openwhisk.whiskconfig" . }}
+{{- end }}
+{{ if or .Values.metrics.kamonEnabled .Values.metrics.prometheusEnabled }}
+        - name: "METRICS_KAMON"
+          value: "true"
+{{ end }}
+{{ if .Values.metrics.kamonTags }}
+        - name: "METRICS_KAMON_TAGS"
+          value: "true"
+{{ end }}
+{{ if .Values.metrics.userMetricsEnabled }}
+        - name: "CONFIG_whisk_userEvents_enabled"
+          value: "true"
+{{ end }}
+        - name: "CONFIG_logback_log_level"
+          value: "{{ .Values.scheduler.loglevel }}"
+{{- if eq .Values.activationStoreBackend "ElasticSearch" }}
+        - name: "CONFIG_whisk_activationStore_elasticsearch_protocol"
+          value: "{{ .Values.elasticsearch.protocol }}"
+        - name: "CONFIG_whisk_activationStore_elasticsearch_hosts"
+          value: {{ template "openwhisk.elasticsearch_connect" . }}
+        - name: "CONFIG_whisk_activationStore_elasticsearch_indexPattern"
+          value: {{ .Values.elasticsearch.indexPattern }}
+        - name: "CONFIG_whisk_activationStore_elasticsearch_username"
+          value: "{{ .Values.elasticsearch.username }}"
+        - name: "CONFIG_whisk_activationStore_elasticsearch_password"
+          value: "{{ .Values.elasticsearch.password }}"
+        - name: "CONFIG_whisk_spi_ActivationStoreProvider"
+          value: "org.apache.openwhisk.core.database.elasticsearch.ElasticSearchActivationStoreProvider"
+{{- end }}
+{{- end }}
diff --git a/helm/openwhisk/templates/scheduler-svc.yaml b/helm/openwhisk/templates/scheduler-svc.yaml
new file mode 100644
index 00000000..7fd6fc72
--- /dev/null
+++ b/helm/openwhisk/templates/scheduler-svc.yaml
@@ -0,0 +1,36 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+{{- if .Values.scheduler.enable }}
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ .Release.Name }}-scheduler
+  labels:
+    name: {{ .Release.Name }}-scheduler
+{{ include "openwhisk.label_boilerplate" . | indent 4 }}
+spec:
+  selector:
+    name: {{ .Release.Name }}-scheduler
+  ports:
+    - port: {{ .Values.scheduler.port }}
+      name: http
+    - port: {{ .Values.scheduler.rpcPort }}
+      name: grpc
+    - port: 25520
+      name: akka
+{{- end }}
diff --git a/helm/openwhisk/values.schema.json b/helm/openwhisk/values.schema.json
index 98207686..cb45d5f2 100644
--- a/helm/openwhisk/values.schema.json
+++ b/helm/openwhisk/values.schema.json
@@ -150,8 +150,7 @@
         "time": { "$ref": "#/definitions/actionLimitSpecString" },
         "memory": { "$ref": "#/definitions/actionLimitSpecString" },
         "concurrency": { "$ref": "#/definitions/actionLimitSpecInteger" },
-        "log": { "$ref": "#/definitions/actionLimitSpecString" },
-        "time": { "$ref": "#/definitions/actionLimitSpecString" }
+        "log": { "$ref": "#/definitions/actionLimitSpecString" }
       }
     },
     "activation": {
@@ -186,7 +185,8 @@
             "completed": { "$ref": "#/definitions/kafkaTopicConfig" },
             "events": { "$ref": "#/definitions/kafkaTopicConfig" },
             "health": { "$ref": "#/definitions/kafkaTopicConfig" },
-            "invoker": { "$ref": "#/definitions/kafkaTopicConfig" }
+            "invoker": { "$ref": "#/definitions/kafkaTopicConfig" },
+            "scheduler": { "$ref": "#/definitions/kafkaTopicConfig" }
           }
         }
       }
@@ -198,6 +198,10 @@
         }
       },
       "runtimes": { "type": "string" },
+      "durationChecker": {
+        "type": "object",
+        "timeWindow": { "type": "string" }
+      },
       "testing": {
         "type": "object",
         "properties": {
@@ -368,6 +372,57 @@
       ]
     },
 
+    "scheduler": {
+      "allOf": [
+        { "$ref": "#/definitions/podspec" },
+        { "properties": { "port": { "type": "integer", "minimum": 0 } }, "required": ["port"] },
+        { "properties": { "rpcPort": { "type": "integer", "minimum": 0 } }, "required": ["rpcPort"] },
+        { "properties": {
+            "options": { "type": "string" },
+            "jvmHeapMB": { "type": "string" },
+            "jvmOptions": { "type": "string" },
+            "logLevel": { "type": "string" },
+            "protocol": { "type": "string" },
+            "maxPeek": { "type": "integer", "minimum": 0 },
+            "inProgressJobRetention": { "type": "string" },
+            "blackboxMultiple": { "type": "integer" },
+            "dataManagementService": { "retryInterval": { "type": "string" } },
+            "queue": {
+              "idleGrace": { "type": "string" },
+              "stopGrace": { "type": "string" },
+              "flushGrace": { "type": "string" },
+              "gracefulShutdownTimeout": { "type": "string" },
+              "maxRetentionSize": { "type": "integer", "minimum": 0 },
+              "maxRetentionMs": { "type": "integer", "minimum": 0 },
+              "throttlingFraction": { "type": "number", "minimum": 0, "maximum": 1.0 },
+              "durationBufferSize": { "type": "integer", "minimum": 0 } },
+            "queueManager": {
+              "maxSchedulingTime": { "type": "string" },
+              "maxRetriesToGetQueue": { "type": "integer", "minimum": 0 } }
+          },
+          "required": ["jvmHeapMB"]
+        }
+      ]
+    },
+
+    "etcd": {
+      "type": "object",
+      "properties": { "external": { "type": "boolean" } },
+      "required": ["external"],
+      "if": { "properties": { "external": { "const": false } } },
+      "then": {
+        "allOf": [
+          { "$ref": "#/definitions/podspec" },
+          { "properties": { "port": { "type": "integer", "minimum": 0 } }, "required": ["port"] },
+          { "properties": { "replicaCount": { "const": 1 } } },
+          { "properties": { "lease_timeout": { "type": "integer" } } },
+          { "properties": { "pool_threads": { "type": "integer", "minimum": 1 } } },
+          { "properties": { "persistence": { "size": { "type": "string" } } } },
+          { "$ref": "#/definitions/optionalExternalHostCheck" }
+        ]
+      }
+    },
+
     "invoker": {
       "allOf": [
         { "$ref": "#/definitions/daemonset" },
@@ -514,7 +569,9 @@
      "type": "object",
      "properties": {
        "zookeeper": { "$ref": "#/definitions/standardPodProbes" },
- "controller": { "$ref": "#/definitions/standardPodProbes" } + "kafka": { "$ref": "#/definitions/standardPodProbes" }, + "controller": { "$ref": "#/definitions/standardPodProbes" }, + "scheduler": { "$ref": "#/definitions/standardPodProbes" } } } }, diff --git a/helm/openwhisk/values.yaml b/helm/openwhisk/values.yaml index 787e6e74..52c4e77d 100644 --- a/helm/openwhisk/values.yaml +++ b/helm/openwhisk/values.yaml @@ -120,9 +120,15 @@ whisk: segmentBytes: "" retentionBytes: "" retentionMs: "" + scheduler: + segmentBytes: "" + retentionBytes: "" + retentionMs: "" containerPool: userMemory: "2048m" runtimes: "runtimes.json" + durationChecker: + timeWindow: "1 d" testing: includeTests: true includeSystemTests: false @@ -151,7 +157,7 @@ k8s: # Images used to run auxillary tasks/jobs utility: imageName: "openwhisk/ow-utils" - imageTag: "c5970a6" + imageTag: "a1639f0" imagePullPolicy: "IfNotPresent" # Docker registry @@ -249,20 +255,69 @@ nginx: # Controller configurations controller: imageName: "openwhisk/controller" - imageTag: "c5970a6" + imageTag: "a1639f0" + imagePullPolicy: "IfNotPresent" + replicaCount: 1 + restartPolicy: "Always" + port: 8080 + options: "" + jvmHeapMB: "1024" + jvmOptions: "" + loglevel: "INFO" + +# Scheduler configurations +scheduler: + enable: true + imageName: "openwhisk/scheduler" + imageTag: "a1639f0" imagePullPolicy: "IfNotPresent" replicaCount: 1 restartPolicy: "Always" port: 8080 + rpcPort: 13001 options: "" jvmHeapMB: "1024" jvmOptions: "" loglevel: "INFO" + protocol: "http" + maxPeek: 128 + # Sometimes the kubernetes client takes a long time for pod creation + inProgressJobRetention: "20 seconds" + blackboxMultiple: 15 + dataManagementService: + retryInterval: "1 second" + queueManager: + maxSchedulingTime: "20 seconds" + maxRetriesToGetQueue: "13" + queue: + idleGrace: "20 seconds" + stopGrace: "20 seconds" + flushGrace: "60 seconds" + gracefulShutdownTimeout: "5 seconds" + maxRetentionSize: 10000 + maxRetentionMs: 60000 + throttlingFraction: 0.9 + durationBufferSize: 10 + +# etcd (used by scheduler and controller if scheduler is enabled) +etcd: + external: false + imageName: "quay.io/coreos/etcd" + imageTag: "v3.4.0" + imagePullPolicy: "IfNotPresent" + # NOTE: setting replicaCount > 1 will not work; need to add etcd cluster configuration + replicaCount: 1 + restartPolicy: "Always" + port: 2379 + lease_timeout: 1 + pool_threads: 10 + persistence: + size: 256Mi # Invoker configurations invoker: imageName: "openwhisk/invoker" - imageTag: "c5970a6" + imageTag: "a1639f0" imagePullPolicy: "IfNotPresent" restartPolicy: "Always" runtimeDeleteTimeout: "30 seconds" @@ -317,7 +372,7 @@ redis: # User-events configuration user_events: imageName: "openwhisk/user-events" - imageTag: "c5970a6" + imageTag: "a1639f0" imagePullPolicy: "IfNotPresent" replicaCount: 1 restartPolicy: "Always" @@ -462,6 +517,15 @@ probes: initialDelaySeconds: 10 periodSeconds: 10 timeoutSeconds: 1 + scheduler: + livenessProbe: + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 1 + readinessProbe: + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 1 # Pod Disruption Budget allows Pods to survive Voluntary and Involuntary Disruptions. 
 # for more information refer - https://kubernetes.io/docs/concepts/workloads/pods/disruptions/
diff --git a/tools/travis/collect-logs.sh b/tools/travis/collect-logs.sh
index 57254973..f6c9d773 100755
--- a/tools/travis/collect-logs.sh
+++ b/tools/travis/collect-logs.sh
@@ -32,6 +32,7 @@ kubectl -n openwhisk logs -lname=ow4travis-couchdb >& logs/couchdb.log
 kubectl -n openwhisk logs -lname=ow4travis-zookeeper >& logs/zookeeper.log
 kubectl -n openwhisk logs -lname=ow4travis-kafka >& logs/kafka.log
 kubectl -n openwhisk logs -lname=ow4travis-controller >& logs/controller.log
+kubectl -n openwhisk logs -lname=ow4travis-scheduler >& logs/scheduler.log
 kubectl -n openwhisk logs -lname=ow4travis-invoker -c docker-pull-runtimes >& logs/invoker-docker-pull.log
 kubectl -n openwhisk logs -lname=ow4travis-invoker -c invoker >& logs/invoker-invoker.log
 kubectl -n openwhisk logs -lname=ow4travis-nginx >& logs/nginx.log
diff --git a/tools/travis/deploy-chart.sh b/tools/travis/deploy-chart.sh
index e4bcc583..cacce71b 100755
--- a/tools/travis/deploy-chart.sh
+++ b/tools/travis/deploy-chart.sh
@@ -216,12 +216,17 @@ if [ "${OW_LEAN_MODE:-false}" == "false" ]; then
   # Wait for the controller to confirm that it has at least one healthy invoker
   verifyHealthyInvoker
 
+  # Wait for scheduler to be up
+  statefulsetHealthCheck "ow4travis-scheduler"
+
   # Verify that the user-metrics components were deployed successfully
   deploymentHealthCheck "ow4travis-user-events"
   # deploymentHealthCheck "ow4travis-prometheus-server"
   deploymentHealthCheck "ow4travis-grafana"
 fi
 
+
+
 # Wait for install-packages job to complete successfully
 jobHealthCheck "ow4travis-install-packages"
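
As a quick way to exercise these changes, a `mycluster.yaml` overlay along the following lines should be enough; it is only a minimal sketch using keys introduced in `values.yaml` above, and the external etcd host is a placeholder, not something defined by this patch.

```yaml
# Keep the new scheduler enabled (the default) and scale it out;
# replicaCount > 1 switches the scheduler StatefulSet to Akka cluster bootstrap.
scheduler:
  enable: true
  replicaCount: 2

# Point the chart at an externally managed etcd instead of the bundled
# single-node Deployment (whose data is recreated on restart; etcd
# persistence is not supported yet).
etcd:
  external: true
  host: "etcd.example.com"   # placeholder endpoint
  port: 2379
```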