Skip to content

Commit

Permalink
fix: parse S3 path when it contains blank spaces in the name
Browse files Browse the repository at this point in the history
  • Loading branch information
leo-the-nardo committed Nov 10, 2024
1 parent a410548 commit 7621e3b
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 8 deletions.
10 changes: 6 additions & 4 deletions spark-jobs/brazilian-finder/brazilian-finder_spark.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
from urllib.parse import quote

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, broadcast
from spark_session import execute_spark

def spark_job(spark: SparkSession, params, *args, **kwargs):
# Extract parameters
s3_input_combo_path = params.get("source_bucket")
s3_brazilian_words_path = params.get("brazilian_words_bucket")
s3_output_path = params.get("output_bucket")
s3_master_combo_path = params.get("master_bucket")
s3_input_combo_path = quote(params.get("source_bucket"), safe=':/')
s3_brazilian_words_path = quote(params.get("brazilian_words_bucket"), safe=':/')
s3_output_path = quote(params.get("output_bucket"), safe=':/')
s3_master_combo_path = quote(params.get("master_bucket"), safe=':/')
# Load and broadcast the words table
words_df = spark.read.text(s3_brazilian_words_path).select(lower("value").alias("word"))
broadcasted_words_df = broadcast(words_df)
Expand Down
10 changes: 6 additions & 4 deletions spark-jobs/combopurifier/combopurifier_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
trim,
col
)
from urllib.parse import quote
def spark_job(spark: SparkSession, params, *args, **kwargs):
s3_input_combo_path = params.get("source_bucket")
s3_output_combo_path = params.get("target_bucket")
s3_output_delta_combo_path = params.get("target_bucket_delta")
s3_master_combo_path = params.get("master_bucket")

s3_input_combo_path = quote(params.get("source_bucket"), safe=':/')
s3_output_combo_path = quote(params.get("target_bucket"), safe=':/')
s3_output_delta_combo_path = quote(params.get("target_bucket_delta"), safe=':/')
s3_master_combo_path = quote(params.get("master_bucket"), safe=':/')

# 3. Read Incoming File
df_raw = spark.read.text(s3_input_combo_path)
Expand Down

0 comments on commit 7621e3b

Please sign in to comment.