From 843319fd589548001d6059e3b111ae7068415ccd Mon Sep 17 00:00:00 2001
From: yanghua
Date: Tue, 7 Feb 2023 18:16:42 +0800
Subject: [PATCH] [Addon #603] Add Apache Spark as an experimental addon

Signed-off-by: yanghua
---
 .../spark-kubernetes-operator/README.md       | 170 ++++++++++++++++++
 .../definitions/spark-application.cue         |  90 ++++++++++
 .../spark-kubernetes-operator/metadata.yaml   |  15 ++
 .../spark-kubernetes-operator/parameter.cue   |  20 +++
 .../spark-kubernetes-operator/template.cue    | 101 +++++++++++
 5 files changed, 396 insertions(+)
 create mode 100644 experimental/addons/spark-kubernetes-operator/README.md
 create mode 100644 experimental/addons/spark-kubernetes-operator/definitions/spark-application.cue
 create mode 100644 experimental/addons/spark-kubernetes-operator/metadata.yaml
 create mode 100644 experimental/addons/spark-kubernetes-operator/parameter.cue
 create mode 100644 experimental/addons/spark-kubernetes-operator/template.cue

diff --git a/experimental/addons/spark-kubernetes-operator/README.md b/experimental/addons/spark-kubernetes-operator/README.md
new file mode 100644
index 00000000..bce23450
--- /dev/null
+++ b/experimental/addons/spark-kubernetes-operator/README.md
@@ -0,0 +1,170 @@

# spark-kubernetes-operator

A Kubernetes operator for Apache Spark (https://github.com/GoogleCloudPlatform/spark-on-k8s-operator). It allows users to manage Spark applications and their lifecycle through native Kubernetes tooling such as `kubectl`.

> Note: the operator is not provided by the Apache Spark project itself, but it is widely used by a large number of companies (https://github.com/GoogleCloudPlatform/spark-on-k8s-operator/blob/master/docs/who-is-using.md).

# Install

```
# The following steps enable fluxcd and spark-kubernetes-operator in a namespace called "spark-operator".

vela addon enable fluxcd
vela addon enable spark-kubernetes-operator
```
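The addon also exposes several installation parameters (defined in `parameter.cue` later in this patch). They can be overridden at enable time as `key=value` pairs; a minimal sketch with illustrative values:

```
vela addon enable spark-kubernetes-operator namespace=my-spark-operator createWebhook=true imageTag=v1beta2-1.3.8-3.1.1
```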
# Uninstall

```
vela addon disable spark-kubernetes-operator
vela addon disable fluxcd
```

# To check the running status of spark-kubernetes-operator

* Firstly, check the running status of spark-kubernetes-operator (and of fluxcd, which is needed to deploy the operator's Helm chart); a direct `kubectl` check is also shown right after the schema table below:

```
vela addon status spark-kubernetes-operator
vela ls -A | grep spark
```

* Secondly, show the component type `spark-application`, so we know how to use it in an application. As a Spark user, you can choose which parameters to set for your Spark application:

```
vela show spark-application
# Specification
+---------------------+--------------------------------------------------------------------------------------------------+-------------------+----------+---------+
| NAME                | DESCRIPTION                                                                                      | TYPE              | REQUIRED | DEFAULT |
+---------------------+--------------------------------------------------------------------------------------------------+-------------------+----------+---------+
| name                | Specify the spark application name.                                                              | string            | true     |         |
| namespace           | Specify the namespace for spark application to install.                                          | string            | true     |         |
| type                | Specify the application language type, e.g. "Scala", "Python", "Java" or "R".                    | string            | true     |         |
| pythonVersion       | Specify the python version.                                                                      | string            | false    |         |
| mode                | Specify the deploy mode, e.g. "cluster", "client" or "in-cluster-client".                        | string            | true     |         |
| image               | Specify the container image for the driver, executor, and init-container.                        | string            | true     |         |
| imagePullPolicy     | Specify the image pull policy for the driver, executor, and init-container.                      | string            | true     |         |
| mainClass           | Specify the fully-qualified main class of the Spark application.                                 | string            | true     |         |
| mainApplicationFile | Specify the path to a bundled JAR, Python, or R file of the application.                         | string            | true     |         |
| sparkVersion        | Specify the version of Spark the application uses.                                               | string            | true     |         |
| driverCores         | Specify the number of CPU cores to request for the driver pod.                                   | int               | true     |         |
| executorCores       | Specify the number of CPU cores to request for the executor pod.                                 | int               | true     |         |
| arguments           | Specify a list of arguments to be passed to the application.                                     | []string          | false    |         |
| sparkConf           | Specify Spark configuration properties, as they would be set with the "--conf" option in         | map[string]string | false    |         |
|                     | spark-submit.                                                                                    |                   |          |         |
| hadoopConf          | Specify Hadoop configuration properties, as they would be set with the "--conf" option in        | map[string]string | false    |         |
|                     | spark-submit. The SparkApplication controller automatically adds the prefix "spark.hadoop." to   |                   |          |         |
|                     | Hadoop configuration properties.                                                                 |                   |          |         |
| sparkConfigMap      | Specify the name of the ConfigMap containing Spark configuration files such as log4j.properties. | string            | false    |         |
|                     | The controller will add the environment variable SPARK_CONF_DIR pointing to the path where the  |                   |          |         |
|                     | ConfigMap is mounted.                                                                            |                   |          |         |
| hadoopConfigMap     | Specify the name of the ConfigMap containing Hadoop configuration files such as core-site.xml.   | string            | false    |         |
|                     | The controller will add the environment variable HADOOP_CONF_DIR pointing to the path where the |                   |          |         |
|                     | ConfigMap is mounted.                                                                            |                   |          |         |
+---------------------+--------------------------------------------------------------------------------------------------+-------------------+----------+---------+
```
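In addition to `vela addon status`, you can check the operator's own workloads directly. A quick sanity check, assuming the default `spark-operator` namespace and that the Helm chart has installed its CRDs:

```
kubectl get pods -n spark-operator
kubectl get crd | grep sparkoperator.k8s.io
```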
# Example of how to run a spark-application component in an application

1. Firstly, copy the following example to "spark-app-v1.yaml":

> The addon will create a namespace named "spark-cluster".

```
apiVersion: core.oam.dev/v1beta1
kind: Application
metadata:
  name: spark-app-v1
  namespace: spark-cluster
spec:
  components:
    - name: my-spark-application-component
      type: spark-application
      properties:
        name: my-spark-app
        namespace: spark-cluster
        type: Scala
        mode: cluster
        image: "gcr.io/spark-operator/spark:v3.1.1"
        imagePullPolicy: Always
        mainClass: org.apache.spark.examples.streaming.JavaQueueStream
        mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.12-3.1.1.jar"
        sparkVersion: "3.1.1"
        driverCores: 1
        executorCores: 1
```

2. Secondly, start the application:

```
vela up -f spark-app-v1.yaml
```

You will see output like this:

```
Applying an application in vela K8s object format...
I0227 16:54:37.069480  361176 apply.go:121] "creating object" name="spark-app-v1" resource="core.oam.dev/v1beta1, Kind=Application"
✅ App has been deployed 🚀🚀🚀
    Port forward: vela port-forward spark-app-v1 -n spark-cluster
             SSH: vela exec spark-app-v1 -n spark-cluster
         Logging: vela logs spark-app-v1 -n spark-cluster
      App status: vela status spark-app-v1 -n spark-cluster
        Endpoint: vela status spark-app-v1 -n spark-cluster --endpoint
Application spark-cluster/spark-app-v1 applied.
```

3. Then, you can use native `kubectl` commands to check the status of the Spark application:

```
$ kubectl get sparkapplications -n spark-cluster
NAME           STATUS    ATTEMPTS   START                  FINISH   AGE
my-spark-app   RUNNING   1          2023-02-27T08:54:40Z            2m33s
```

or get the application details via this command:

```
$ kubectl describe sparkapplication my-spark-app -n spark-cluster
Name:         my-spark-app
Namespace:    spark-cluster
Labels:       app.oam.dev/app-revision-hash=4e5592aea53a5961
              app.oam.dev/appRevision=spark-app-v1-v1
              app.oam.dev/cluster=local
              app.oam.dev/component=my-spark-application-component
              app.oam.dev/name=spark-app-v1
              app.oam.dev/namespace=spark-cluster
              app.oam.dev/resourceType=TRAIT
              app.oam.dev/revision=
              oam.dev/render-hash=640a3298d803274e
              trait.oam.dev/resource=spark
              trait.oam.dev/type=AuxiliaryWorkload
Annotations:  app.oam.dev/last-applied-configuration:
                {"apiVersion":"sparkoperator.k8s.io/v1beta2","kind":"SparkApplication","metadata":{"annotations":{"app.oam.dev/last-applied-time":"2023-02...
              app.oam.dev/last-applied-time: 2023-02-27T16:54:37+08:00
              oam.dev/kubevela-version: v1.7.0
API Version:  sparkoperator.k8s.io/v1beta2
Kind:         SparkApplication
Metadata:
......
```

or get the general information of the KubeVela application via this command:

```
$ kubectl get app spark-app-v1 -n spark-cluster -oyaml
apiVersion: core.oam.dev/v1beta1
kind: Application
metadata:
......
```

4. Show the services of the Spark application via this command:

```
$ kubectl get svc -n spark-cluster
NAME                                       TYPE        CLUSTER-IP    EXTERNAL-IP   PORT(S)                      AGE
my-spark-app-c58a1c869214bfe5-driver-svc   ClusterIP   None                        7078/TCP,7079/TCP,4040/TCP   19m
my-spark-app-ui-svc                        ClusterIP   xx.xx.xx.xx                 4040/TCP                     19m
```
diff --git a/experimental/addons/spark-kubernetes-operator/definitions/spark-application.cue b/experimental/addons/spark-kubernetes-operator/definitions/spark-application.cue
new file mode 100644
index 00000000..44b786b4
--- /dev/null
+++ b/experimental/addons/spark-kubernetes-operator/definitions/spark-application.cue
@@ -0,0 +1,90 @@

"spark-application": {
    annotations: {}
    attributes: workload: type: "autodetects.core.oam.dev"
    description: "Describes a containerized Spark application that can specify its resource spec."
    labels: {}
    type: "component"
}

template: {
    parameter: {
        // +usage=Specify the spark application name
        name: string
        // +usage=Specify the namespace for spark application to install
        namespace: string
        // +usage=Specify the application language type, e.g. "Scala", "Python", "Java" or "R"
        type: string
        // +usage=Specify the python version
        pythonVersion?: string
        // +usage=Specify the deploy mode, e.g. "cluster", "client" or "in-cluster-client"
        mode: string
        // +usage=Specify the container image for the driver, executor, and init-container
        image: string
        // +usage=Specify the image pull policy for the driver, executor, and init-container
        imagePullPolicy: string
        // +usage=Specify the fully-qualified main class of the Spark application
        mainClass: string
        // +usage=Specify the path to a bundled JAR, Python, or R file of the application
        mainApplicationFile: string
        // +usage=Specify the version of Spark the application uses
        sparkVersion: string
        // +usage=Specify the number of CPU cores to request for the driver pod
        driverCores: int
        // +usage=Specify the number of CPU cores to request for the executor pod
        executorCores: int
        // +usage=Specify a list of arguments to be passed to the application
        arguments?: [...string]
        // +usage=Specify Spark configuration properties, as they would be set with the "--conf" option in spark-submit
        sparkConf?: [string]: string
        // +usage=Specify Hadoop configuration properties, as they would be set with the "--conf" option in spark-submit. The SparkApplication controller automatically adds the prefix "spark.hadoop." to Hadoop configuration properties
        hadoopConf?: [string]: string
        // +usage=Specify the name of the ConfigMap containing Spark configuration files such as log4j.properties. The controller will add the environment variable SPARK_CONF_DIR pointing to the path where the ConfigMap is mounted
        sparkConfigMap?: string
        // +usage=Specify the name of the ConfigMap containing Hadoop configuration files such as core-site.xml. The controller will add the environment variable HADOOP_CONF_DIR pointing to the path where the ConfigMap is mounted
        hadoopConfigMap?: string
    }

    output: {
        kind:       "ClusterRoleBinding"
        apiVersion: "rbac.authorization.k8s.io/v1"
        metadata: name: parameter.name
        roleRef: {
            name:     "edit"
            apiGroup: "rbac.authorization.k8s.io"
            kind:     "ClusterRole"
        }
        subjects: [{
            name:      "default"
            kind:      "ServiceAccount"
            namespace: parameter.namespace
        }]
    }

    outputs: {
        "spark": {
            kind:       "SparkApplication"
            apiVersion: "sparkoperator.k8s.io/v1beta2"
            metadata: {
                name:      parameter.name
                namespace: parameter.namespace
            }
            spec: {
                type:                parameter.type
                mode:                parameter.mode
                image:               parameter.image
                imagePullPolicy:     parameter.imagePullPolicy
                mainClass:           parameter.mainClass
                mainApplicationFile: parameter.mainApplicationFile
                sparkVersion:        parameter.sparkVersion
                driver: {
                    cores: parameter.driverCores
                }
                executor: {
                    cores: parameter.executorCores
                }
            }
        }
    }
}

diff --git a/experimental/addons/spark-kubernetes-operator/metadata.yaml b/experimental/addons/spark-kubernetes-operator/metadata.yaml
new file mode 100644
index 00000000..65d95dd6
--- /dev/null
+++ b/experimental/addons/spark-kubernetes-operator/metadata.yaml
@@ -0,0 +1,15 @@

description: A Kubernetes operator for Apache Spark
icon: "https://spark.apache.org/images/spark-logo.png"
invisible: false
name: spark-kubernetes-operator
tags:
  - GoogleCloudPlatform/spark-on-k8s-operator
version: v1beta2-1.3.8-3.1.1
url: https://github.com/GoogleCloudPlatform/spark-on-k8s-operator

dependencies:
  - name: fluxcd

system:
  vela: ">=1.5.0-beta.3"
  kubernetes: ">=1.16"

diff --git a/experimental/addons/spark-kubernetes-operator/parameter.cue b/experimental/addons/spark-kubernetes-operator/parameter.cue
new file mode 100644
index 00000000..17de99c8
--- /dev/null
+++ b/experimental/addons/spark-kubernetes-operator/parameter.cue
@@ -0,0 +1,20 @@

// parameter.cue is used to store addon parameters.
//
// You can use these parameters in template.cue or in resources/ via 'parameter.myparam'.
//
// For example, you can use parameters to allow the user to customize
// container images, ports, and so on.
parameter: {
    // +usage=Deploy to specified clusters. Leave empty to deploy to all clusters.
    clusters?: [...string]
    // +usage=Namespace to deploy to
    namespace: *"spark-operator" | string
    // +usage=Specify whether to create the webhook or not
    createWebhook: *false | bool
    // +usage=Specify the image repository
    imageRepository: *"ghcr.io/googlecloudplatform/spark-operator" | string
    // +usage=Specify the image tag
    imageTag: *"v1beta2-1.3.8-3.1.1" | string
    // +usage=Specify whether to create the ServiceAccount for Spark jobs or not
    createSparkServiceAccount: *false | bool
}
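One note on `createWebhook`: it defaults to `false`, and according to the upstream operator documentation the mutating admission webhook is needed for pod-customization features, including mounting the ConfigMaps referenced by `sparkConfigMap` and `hadoopConfigMap`. If you plan to rely on those fields, enabling the webhook at install time would look like:

```
vela addon enable spark-kubernetes-operator createWebhook=true
```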
diff --git a/experimental/addons/spark-kubernetes-operator/template.cue b/experimental/addons/spark-kubernetes-operator/template.cue
new file mode 100644
index 00000000..55ac84bd
--- /dev/null
+++ b/experimental/addons/spark-kubernetes-operator/template.cue
@@ -0,0 +1,101 @@

package main

output: {
    apiVersion: "core.oam.dev/v1beta1"
    kind:       "Application"
    spec: {
        components: [
            {
                type: "k8s-objects"
                name: "spark-operator-ns"
                properties: objects: [{
                    apiVersion: "v1"
                    kind:       "Namespace"
                    metadata: name: parameter.namespace
                }]
            },
            {
                type: "k8s-objects"
                name: "spark-cluster-ns"
                properties: objects: [{
                    apiVersion: "v1"
                    kind:       "Namespace"
                    metadata: name: "spark-cluster"
                }, {
                    apiVersion: "v1"
                    kind:       "ServiceAccount"
                    metadata: {
                        name:      "spark"
                        namespace: "spark-cluster"
                    }
                }]
            },
            {
                name: "spark-operator-helm"
                type: "helm"
                dependsOn: ["spark-operator-ns"]
                properties: {
                    repoType:        "helm"
                    url:             "https://googlecloudplatform.github.io/spark-on-k8s-operator/"
                    chart:           "spark-operator"
                    targetNamespace: parameter["namespace"]
                    version:         "1.1.26"
                    values: {
                        image: {
                            repository: parameter["imageRepository"]
                            tag:        parameter["imageTag"]
                        }

                        serviceAccounts: {
                            spark: {
                                create: parameter["createSparkServiceAccount"]
                            }
                            sparkoperator: {
                                name: "spark-kubernetes-operator"
                            }
                        }

                        webhook: {
                            enable: parameter["createWebhook"]
                        }
                    }
                }
            },
        ]

        policies: [
            {
                name: "gc-dependency"
                type: "garbage-collect"
                properties: {
                    order: "dependency"
                }
            },
            {
                type: "shared-resource"
                name: "shared-resource-via-namespace"
                properties: rules: [{
                    selector: resourceTypes: ["Namespace"]
                }]
            },
            {
                type: "topology"
                name: "deploy-operator"
                properties: {
                    namespace: parameter.namespace
                    if parameter.clusters != _|_ {
                        clusters: parameter.clusters
                    }
                    if parameter.clusters == _|_ {
                        clusterLabelSelector: {}
                    }
                }
            },
        ]
    }
}
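Enabling the addon renders this template into a KubeVela application, which by convention is typically named `addon-spark-kubernetes-operator` in the `vela-system` namespace. Assuming that naming, you can inspect the rendered components (the namespaces and the FluxCD Helm release) with:

```
vela status addon-spark-kubernetes-operator -n vela-system
```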