Commit 9957e45: fix conflict
stefankandic committed Jan 23, 2025
2 parents 20733cd + 1a49237
Showing 74 changed files with 1,738 additions and 754 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_branch40.yml
@@ -40,7 +40,7 @@ jobs:
"SCALA_PROFILE": "scala2.13",
"PYSPARK_IMAGE_TO_TEST": "",
"PYTHON_TO_TEST": "",
"ORACLE_DOCKER_IMAGE_NAME": "gvenzl/oracle-free:23.5-slim"
"ORACLE_DOCKER_IMAGE_NAME": "gvenzl/oracle-free:23.6-slim"
}
jobs: >-
{
2 changes: 2 additions & 0 deletions LICENSE-binary
@@ -268,6 +268,8 @@ io.fabric8:kubernetes-model-storageclass
io.fabric8:zjsonpatch
io.github.java-diff-utils:java-diff-utils
io.jsonwebtoken:jjwt-api
io.jsonwebtoken:jjwt-impl
io.jsonwebtoken:jjwt-jackson
io.netty:netty-all
io.netty:netty-buffer
io.netty:netty-codec
5 changes: 2 additions & 3 deletions assembly/pom.xml
@@ -345,11 +345,10 @@
</properties>
</profile>

<!-- Pull in jjwt-impl and jjwt-jackson jars -->
<profile>
<id>jjwt</id>
<id>jjwt-provided</id>
<properties>
<jjwt.deps.scope>compile</jjwt.deps.scope>
<jjwt.deps.scope>provided</jjwt.deps.scope>
</properties>
</profile>

12 changes: 12 additions & 0 deletions common/utils/src/main/resources/error/error-conditions.json
@@ -82,6 +82,12 @@
],
"sqlState" : "22003"
},
"ARTIFACT_ALREADY_EXISTS" : {
"message" : [
"The artifact <normalizedRemoteRelativePath> already exists. Please choose a different name for the new artifact because it cannot be overwritten."
],
"sqlState" : "42713"
},
"ASSIGNMENT_ARITY_MISMATCH" : {
"message" : [
"The number of columns or variables assigned or aliased: <numTarget> does not match the number of source expressions: <numExpr>."
@@ -3962,6 +3968,12 @@
],
"sqlState" : "0A000"
},
"NOT_SUPPORTED_CHANGE_SAME_COLUMN" : {
"message" : [
"ALTER TABLE ALTER/CHANGE COLUMN is not supported for changing <table>'s column <fieldName> including its nested fields multiple times in the same command."
],
"sqlState" : "0A000"
},
"NOT_SUPPORTED_COMMAND_FOR_V2_TABLE" : {
"message" : [
"<cmd> is not supported for v2 tables."
@@ -103,7 +103,7 @@ final class DataFrameWriterImpl[T] private[sql] (ds: Dataset[T]) extends DataFra
// Cannot both be set
require(!(builder.hasPath && builder.hasTable))

builder.setMode(mode match {
builder.setMode(curmode match {
case SaveMode.Append => proto.WriteOperation.SaveMode.SAVE_MODE_APPEND
case SaveMode.Overwrite => proto.WriteOperation.SaveMode.SAVE_MODE_OVERWRITE
case SaveMode.Ignore => proto.WriteOperation.SaveMode.SAVE_MODE_IGNORE
2 changes: 1 addition & 1 deletion connector/docker-integration-tests/README.md
@@ -45,7 +45,7 @@ the container bootstrapping. To run an individual Docker integration test, use t

Besides the default Docker images, the integration tests can be run with custom Docker images. For example,

ORACLE_DOCKER_IMAGE_NAME=gvenzl/oracle-free:23.5-slim-faststart ./build/sbt -Pdocker-integration-tests "docker-integration-tests/testOnly *OracleIntegrationSuite"
ORACLE_DOCKER_IMAGE_NAME=gvenzl/oracle-free:23.6-slim ./build/sbt -Pdocker-integration-tests "docker-integration-tests/testOnly *OracleIntegrationSuite"

The following environment variables can be used to specify the custom Docker images for different databases:

@@ -21,7 +21,7 @@ import org.apache.spark.internal.Logging

class OracleDatabaseOnDocker extends DatabaseOnDocker with Logging {
lazy override val imageName =
sys.env.getOrElse("ORACLE_DOCKER_IMAGE_NAME", "gvenzl/oracle-free:23.5-slim")
sys.env.getOrElse("ORACLE_DOCKER_IMAGE_NAME", "gvenzl/oracle-free:23.6-slim")
val oracle_password = "Th1s1sThe0racle#Pass"
override val env = Map(
"ORACLE_PWD" -> oracle_password, // oracle images uses this
@@ -22,12 +22,10 @@ import com.google.protobuf.{BoolValue, BytesValue, DoubleValue, FloatValue, Int3
import com.google.protobuf.Descriptors.{Descriptor, FieldDescriptor}
import com.google.protobuf.WireFormat

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging
import org.apache.spark.sql.errors.QueryCompilationErrors
import org.apache.spark.sql.types._

@DeveloperApi
object SchemaConverters extends Logging {

/**
@@ -42,13 +40,13 @@ object SchemaConverters extends Logging {
*
* @since 3.4.0
*/
def toSqlType(
private[protobuf] def toSqlType(
descriptor: Descriptor,
protobufOptions: ProtobufOptions = ProtobufOptions(Map.empty)): SchemaType = {
toSqlTypeHelper(descriptor, protobufOptions)
}

def toSqlTypeHelper(
private[protobuf] def toSqlTypeHelper(
descriptor: Descriptor,
protobufOptions: ProtobufOptions): SchemaType = {
val fields = descriptor.getFields.asScala.flatMap(
@@ -65,7 +63,7 @@
// exceed the maximum recursive depth specified by the recursiveFieldMaxDepth option.
// A return of None implies the field has reached the maximum allowed recursive depth and
// should be dropped.
def structFieldFor(
private def structFieldFor(
fd: FieldDescriptor,
existingRecordNames: Map[String, Int],
protobufOptions: ProtobufOptions): Option[StructField] = {
6 changes: 0 additions & 6 deletions core/pom.xml
@@ -622,12 +622,6 @@
<script.extension>.sh</script.extension>
</properties>
</profile>
<profile>
<id>jjwt</id>
<properties>
<jjwt.deps.scope>compile</jjwt.deps.scope>
</properties>
</profile>
<profile>
<id>sparkr</id>
<build>
@@ -33,7 +33,7 @@ import org.apache.spark._
import org.apache.spark.api.python.PythonFunction.PythonAccumulator
import org.apache.spark.internal.{Logging, LogKeys, MDC}
import org.apache.spark.internal.LogKeys.TASK_NAME
import org.apache.spark.internal.config.{BUFFER_SIZE, EXECUTOR_CORES, Python}
import org.apache.spark.internal.config.{BUFFER_SIZE, EXECUTOR_CORES}
import org.apache.spark.internal.config.Python._
import org.apache.spark.rdd.InputFileBlockHolder
import org.apache.spark.resource.ResourceProfile.{EXECUTOR_CORES_LOCAL_PROPERTY, PYSPARK_MEMORY_LOCAL_PROPERTY}
@@ -90,11 +90,11 @@ private[spark] object PythonEvalType {
}
}

private object BasePythonRunner {
private[spark] object BasePythonRunner {

private lazy val faultHandlerLogDir = Utils.createTempDir(namePrefix = "faulthandler")
private[spark] lazy val faultHandlerLogDir = Utils.createTempDir(namePrefix = "faulthandler")

private def faultHandlerLogPath(pid: Int): Path = {
private[spark] def faultHandlerLogPath(pid: Int): Path = {
new File(faultHandlerLogDir, pid.toString).toPath
}
}
@@ -574,15 +574,15 @@ private[spark] abstract class BasePythonRunner[IN, OUT](
JavaFiles.deleteIfExists(path)
throw new SparkException(s"Python worker exited unexpectedly (crashed): $error", e)

case eof: EOFException if !faultHandlerEnabled =>
case e: IOException if !faultHandlerEnabled =>
throw new SparkException(
s"Python worker exited unexpectedly (crashed). " +
"Consider setting 'spark.sql.execution.pyspark.udf.faulthandler.enabled' or" +
s"'${Python.PYTHON_WORKER_FAULTHANLDER_ENABLED.key}' configuration to 'true' for" +
"the better Python traceback.", eof)
s"'${PYTHON_WORKER_FAULTHANLDER_ENABLED.key}' configuration to 'true' for " +
"the better Python traceback.", e)

case eof: EOFException =>
throw new SparkException("Python worker exited unexpectedly (crashed)", eof)
case e: IOException =>
throw new SparkException("Python worker exited unexpectedly (crashed)", e)
}
}

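As an aside on the hunk above: the new error message points users at a faulthandler setting for better Python tracebacks. The sketch below is only an illustration of turning it on from a session; it uses the SQL configuration key quoted in the message, and the session setup itself is a placeholder rather than part of this change.

    import org.apache.spark.sql.SparkSession

    object FaultHandlerSketch {
      def main(args: Array[String]): Unit = {
        // Illustrative local session; any existing SparkSession works the same way.
        val spark = SparkSession.builder()
          .master("local[2]")
          .appName("faulthandler-demo")
          .getOrCreate()

        // Runtime SQL conf named in the error message above: when a Python UDF worker
        // crashes, Spark collects a Python-level traceback instead of only the IOException.
        spark.conf.set("spark.sql.execution.pyspark.udf.faulthandler.enabled", "true")

        spark.stop()
      }
    }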
21 changes: 3 additions & 18 deletions core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -22,7 +22,7 @@ import java.lang.Thread.UncaughtExceptionHandler
import java.lang.management.ManagementFactory
import java.net.{URI, URL}
import java.nio.ByteBuffer
import java.util.{Locale, Properties, Timer, TimerTask}
import java.util.{Locale, Properties}
import java.util.concurrent._
import java.util.concurrent.atomic.AtomicBoolean
import java.util.concurrent.locks.ReentrantLock
@@ -209,10 +209,9 @@ private[spark] class Executor(
// The default isolation group
val defaultSessionState: IsolatedSessionState = newSessionState(JobArtifactState("default", None))

private val cacheExpiryTime = 30 * 60 * 1000
val isolatedSessionCache: Cache[String, IsolatedSessionState] = CacheBuilder.newBuilder()
.maximumSize(100)
.expireAfterAccess(cacheExpiryTime, TimeUnit.MILLISECONDS)
.expireAfterAccess(30, TimeUnit.MINUTES)
.removalListener(new RemovalListener[String, IsolatedSessionState]() {
override def onRemoval(
notification: RemovalNotification[String, IsolatedSessionState]): Unit = {
@@ -296,8 +295,6 @@

private val pollOnHeartbeat = if (METRICS_POLLING_INTERVAL_MS > 0) false else true

private val timer = new Timer("executor-state-timer", true)

// Poller for the memory metrics. Visible for testing.
private[executor] val metricsPoller = new ExecutorMetricsPoller(
env.memoryManager,
@@ -448,9 +445,6 @@
case NonFatal(e) =>
logWarning("Unable to stop heartbeater", e)
}
if (timer != null) {
timer.cancel()
}
ShuffleBlockPusher.stop()
if (threadPool != null) {
threadPool.shutdown()
@@ -565,17 +559,9 @@
override def run(): Unit = {

// Classloader isolation
var maybeTimerTask: Option[TimerTask] = None
val isolatedSession = taskDescription.artifacts.state match {
case Some(jobArtifactState) =>
val state = isolatedSessionCache.get(
jobArtifactState.uuid, () => newSessionState(jobArtifactState))
maybeTimerTask = Some(new TimerTask {
// Resets the expire time till the task ends.
def run(): Unit = isolatedSessionCache.getIfPresent(jobArtifactState.uuid)
})
maybeTimerTask.foreach(timer.schedule(_, cacheExpiryTime / 10, cacheExpiryTime / 10))
state
isolatedSessionCache.get(jobArtifactState.uuid, () => newSessionState(jobArtifactState))
case _ => defaultSessionState
}

@@ -876,7 +862,6 @@
uncaughtExceptionHandler.uncaughtException(Thread.currentThread(), t)
}
} finally {
maybeTimerTask.foreach(_.cancel)
cleanMDCForTask(taskName, mdcProperties)
runningTasks.remove(taskId)
if (taskStarted) {
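The Executor diff above drops the hand-rolled refresh timer because Guava's expireAfterAccess already restarts an entry's expiry clock on every read, including reads made through get(key, loader). A minimal standalone sketch of that behaviour, with illustrative names that are not part of this patch:

    import java.util.concurrent.TimeUnit
    import com.google.common.cache.{Cache, CacheBuilder}

    object SessionCacheSketch {
      final case class SessionState(uuid: String)

      def main(args: Array[String]): Unit = {
        val cache: Cache[String, SessionState] = CacheBuilder.newBuilder()
          .maximumSize(100)
          .expireAfterAccess(30, TimeUnit.MINUTES) // every access resets the 30-minute clock
          .build[String, SessionState]()

        // get(key, loader) counts as an access, so an entry touched at least once every
        // 30 minutes is never expired; no separate TimerTask is needed to keep it alive.
        val state = cache.get("job-uuid", () => SessionState("job-uuid"))
        println(state)
      }
    }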
2 changes: 2 additions & 0 deletions dev/deps/spark-deps-hadoop-3-hive-2.3
@@ -142,6 +142,8 @@ jettison/1.5.4//jettison-1.5.4.jar
jetty-util-ajax/11.0.24//jetty-util-ajax-11.0.24.jar
jetty-util/11.0.24//jetty-util-11.0.24.jar
jjwt-api/0.12.6//jjwt-api-0.12.6.jar
jjwt-impl/0.12.6//jjwt-impl-0.12.6.jar
jjwt-jackson/0.12.6//jjwt-jackson-0.12.6.jar
jline/2.14.6//jline-2.14.6.jar
jline/3.26.3//jline-3.26.3.jar
jna/5.14.0//jna-5.14.0.jar
1 change: 1 addition & 0 deletions dev/sparktestsupport/modules.py
@@ -1126,6 +1126,7 @@ def __hash__(self):
"pyspark.ml.tests.connect.test_parity_clustering",
"pyspark.ml.tests.connect.test_parity_evaluation",
"pyspark.ml.tests.connect.test_parity_feature",
"pyspark.ml.tests.connect.test_parity_pipeline",
],
excluded_python_implementations=[
"PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
4 changes: 0 additions & 4 deletions docs/configuration.md
@@ -1754,10 +1754,6 @@ Apart from these, the following properties are also available, and may be useful
<br /><code>spark.ui.filters=com.test.filter1</code>
<br /><code>spark.com.test.filter1.param.name1=foo</code>
<br /><code>spark.com.test.filter1.param.name2=bar</code>
<br />
<br />Note that some filter requires additional dependencies. For example,
the built-in <code>org.apache.spark.ui.JWSFilter</code> requires
<code>jjwt-impl</code> and <code>jjwt-jackson</code> jar files.
</td>
<td>1.0.0</td>
</tr>
5 changes: 2 additions & 3 deletions docs/security.md
@@ -55,8 +55,7 @@ To enable authorization, Spark Master should have
`spark.master.rest.filters=org.apache.spark.ui.JWSFilter` and
`spark.org.apache.spark.ui.JWSFilter.param.secretKey=BASE64URL-ENCODED-KEY` configurations, and
client should provide HTTP `Authorization` header which contains JSON Web Token signed by
the shared secret key. Please note that this feature requires a Spark distribution built with
`jjwt` profile.
the shared secret key.

### YARN

@@ -816,7 +815,7 @@ be limited to origin hosts that need to access the services.

However, like the REST Submission port, Spark also supports HTTP `Authorization` header
with a cryptographically signed JSON Web Token (JWT) for all UI ports.
To use it, a user needs the Spark distribution built with `jjwt` profile and to configure
To use it, a user needs to configure
`spark.ui.filters=org.apache.spark.ui.JWSFilter` and
`spark.org.apache.spark.ui.JWSFilter.param.secretKey=BASE64URL-ENCODED-KEY`.
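Since jjwt-impl and jjwt-jackson now ship with the default distribution (see the LICENSE and dependency-manifest changes above), a client can mint the signed token without extra build profiles. The sketch below is only an illustration: the encoded key and subject are placeholders, and the exact Authorization header format JWSFilter expects should be checked against its documentation rather than taken from this example.

    import io.jsonwebtoken.Jwts
    import io.jsonwebtoken.io.Decoders
    import io.jsonwebtoken.security.Keys

    object JwsTokenSketch {
      def main(args: Array[String]): Unit = {
        // Placeholder BASE64URL-encoded secret; it must match the value configured as
        // spark.org.apache.spark.ui.JWSFilter.param.secretKey on the Spark side.
        val encodedKey = "VmlzaXQgaHR0cHM6Ly9zcGFyay5hcGFjaGUub3JnIHRvIGxlYXJuIG1vcmUu"
        val key = Keys.hmacShaKeyFor(Decoders.BASE64URL.decode(encodedKey))

        // Sign a token with the shared secret and print it; the client then sends it in
        // the HTTP Authorization header of requests to the protected UI or REST ports.
        val token = Jwts.builder().subject("spark-client").signWith(key).compact()
        println(token)
      }
    }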

@@ -52,3 +52,4 @@ org.apache.spark.ml.feature.MinMaxScaler
org.apache.spark.ml.feature.RobustScaler
org.apache.spark.ml.feature.StringIndexer
org.apache.spark.ml.feature.PCA
org.apache.spark.ml.feature.Word2Vec
@@ -50,3 +50,4 @@ org.apache.spark.ml.feature.MinMaxScalerModel
org.apache.spark.ml.feature.RobustScalerModel
org.apache.spark.ml.feature.StringIndexerModel
org.apache.spark.ml.feature.PCAModel
org.apache.spark.ml.feature.Word2VecModel
@@ -29,7 +29,7 @@ import org.apache.spark.sql.types.DoubleType
/**
* Abstraction for multiclass classification results for a given model.
*/
private[classification] trait ClassificationSummary extends Summary with Serializable {
private[spark] trait ClassificationSummary extends Summary with Serializable {

/**
* Dataframe output by the model's `transform` method.
@@ -148,7 +148,7 @@ private[classification] trait ClassificationSummary extends Summary with Seriali
/**
* Abstraction for training results.
*/
private[classification] trait TrainingSummary {
private[spark] trait TrainingSummary {

/**
* objective function (scaled loss + regularization) at each iteration.
@@ -168,7 +168,7 @@ private[classification] trait TrainingSummary {
/**
* Abstraction for binary classification results for a given model.
*/
private[classification] trait BinaryClassificationSummary extends ClassificationSummary {
private[spark] trait BinaryClassificationSummary extends ClassificationSummary {

private val sparkSession = predictions.sparkSession
import sparkSession.implicits._
@@ -1078,7 +1078,7 @@ class LogisticRegressionModel private[spark] (

// For ml connect only
@Since("4.0.0")
private[ml] def this() = this(Identifiable.randomUID("logreg"), Vectors.zeros(0), 0)
private[ml] def this() = this(Identifiable.randomUID("logreg"), Vectors.empty, 0)

/**
* A vector of model coefficients for "binomial" logistic regression. If this model was trained
@@ -211,6 +211,8 @@ class Word2VecModel private[ml] (

import Word2VecModel._

private[ml] def this() = this(Identifiable.randomUID("w2v"), null)

/**
* Returns a dataframe with two fields, "word" and "vector", with "word" being a String and
* and the vector the DenseVector that it is mapped to.
@@ -704,7 +704,7 @@ class LinearRegressionModel private[ml] (

// For ml connect only
@Since("4.0.0")
private[ml] def this() = this(Identifiable.randomUID("linReg"), Vectors.zeros(0), 0.0, 0.0)
private[ml] def this() = this(Identifiable.randomUID("linReg"), Vectors.empty, 0.0, 0.0)

override val numFeatures: Int = coefficients.size

@@ -122,7 +122,7 @@ private[spark] trait DecisionTreeModel {
* Abstraction for models which are ensembles of decision trees
* @tparam M Type of tree model in this ensemble
*/
private[ml] trait TreeEnsembleModel[M <: DecisionTreeModel] {
private[spark] trait TreeEnsembleModel[M <: DecisionTreeModel] {

// Note: We use getTrees since subclasses of TreeEnsembleModel will store subclasses of
// DecisionTreeModel.
@@ -27,7 +27,7 @@ import org.apache.spark.annotation.Since
* @tparam T Summary instance type
*/
@Since("3.0.0")
private[ml] trait HasTrainingSummary[T] {
private[spark] trait HasTrainingSummary[T] {

private[ml] final var trainingSummary: Option[T] = None

4 changes: 2 additions & 2 deletions pom.xml
@@ -280,7 +280,7 @@
<orc.deps.scope>compile</orc.deps.scope>
<parquet.deps.scope>compile</parquet.deps.scope>
<parquet.test.deps.scope>test</parquet.test.deps.scope>
<jjwt.deps.scope>test</jjwt.deps.scope>
<jjwt.deps.scope>compile</jjwt.deps.scope>

<spark.yarn.isHadoopProvided>false</spark.yarn.isHadoopProvided>

@@ -3531,7 +3531,7 @@
<id>sparkr</id>
</profile>
<profile>
<id>jjwt</id>
<id>jjwt-provided</id>
</profile>
<!-- use org.openlabtesting.leveldbjni on aarch64 platform except MacOS -->
<profile>