Skip to content

Commit

Permalink
fix: parse S3 path when it contains blank spaces in the name
Browse files Browse the repository at this point in the history
  • Loading branch information
leo-the-nardo committed Nov 10, 2024
1 parent a410548 commit 7621e3b
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 8 deletions.
10 changes: 6 additions & 4 deletions spark-jobs/brazilian-finder/brazilian-finder_spark.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
from urllib.parse import quote

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, broadcast
from spark_session import execute_spark

def spark_job(spark: SparkSession, params, *args, **kwargs):
# Extract parameters
s3_input_combo_path = params.get("source_bucket")
s3_brazilian_words_path = params.get("brazilian_words_bucket")
s3_output_path = params.get("output_bucket")
s3_master_combo_path = params.get("master_bucket")
s3_input_combo_path = quote(params.get("source_bucket"), safe=':/')
s3_brazilian_words_path = quote(params.get("brazilian_words_bucket"), safe=':/')
s3_output_path = quote(params.get("output_bucket"), safe=':/')
s3_master_combo_path = quote(params.get("master_bucket"), safe=':/')
# Load and broadcast the words table
words_df = spark.read.text(s3_brazilian_words_path).select(lower("value").alias("word"))
broadcasted_words_df = broadcast(words_df)
Expand Down
10 changes: 6 additions & 4 deletions spark-jobs/combopurifier/combopurifier_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
trim,
col
)
from urllib.parse import quote
def spark_job(spark: SparkSession, params, *args, **kwargs):
s3_input_combo_path = params.get("source_bucket")
s3_output_combo_path = params.get("target_bucket")
s3_output_delta_combo_path = params.get("target_bucket_delta")
s3_master_combo_path = params.get("master_bucket")

s3_input_combo_path = quote(params.get("source_bucket"), safe=':/')
s3_output_combo_path = quote(params.get("target_bucket"), safe=':/')
s3_output_delta_combo_path = quote(params.get("target_bucket_delta"), safe=':/')
s3_master_combo_path = quote(params.get("master_bucket"), safe=':/')

# 3. Read Incoming File
df_raw = spark.read.text(s3_input_combo_path)
Expand Down

0 comments on commit 7621e3b

Please sign in to comment.