diff --git a/experiments/flights/load_data.jl b/experiments/flights/load_data.jl index a7b3618..ae87255 100644 --- a/experiments/flights/load_data.jl +++ b/experiments/flights/load_data.jl @@ -2,8 +2,8 @@ using CSV using DataFrames: DataFrame dataset = "flights" -dirty_table = CSV.File("datasets/$(dataset)_dirty.csv") |> DataFrame -clean_data = CSV.File("datasets/$(dataset)_clean.csv") |> DataFrame +dirty_table = CSV.File("datasets/$(dataset)_dirty.csv", stringtype=String) |> DataFrame +clean_data = CSV.File("datasets/$(dataset)_clean.csv", stringtype=String) |> DataFrame times_for_flight = Dict{String, Set{String}}("$fl-$field" => Set() for fl in unique(dirty_table.flight) for field in [:sched_dep_time, :sched_arr_time, :act_dep_time, :act_arr_time]) for row in eachrow(dirty_table) diff --git a/experiments/hospital/load_data.jl b/experiments/hospital/load_data.jl index 20f7d37..b927443 100644 --- a/experiments/hospital/load_data.jl +++ b/experiments/hospital/load_data.jl @@ -2,8 +2,8 @@ using CSV using DataFrames: DataFrame dataset = "hospital" -dirty_table = CSV.File("datasets/$(dataset)_dirty.csv") |> DataFrame -clean_table = CSV.File("datasets/$(dataset)_clean.csv") |> DataFrame +dirty_table = CSV.File("datasets/$(dataset)_dirty.csv", stringtype=String) |> DataFrame +clean_table = CSV.File("datasets/$(dataset)_clean.csv", stringtype=String) |> DataFrame # In the dirty data, CSV.jl infers that PhoneNumber, ZipCode, and ProviderNumber # are strings. Our PClean script also models these columns as string-valued. @@ -16,4 +16,4 @@ clean_table[!, :ProviderNumber] = map(x -> "$x", clean_table[!, :ProviderNumber] # Stores sets of unique observed values of each attribute. possibilities = Dict(col => remove_missing(unique(collect(dirty_table[!, col]))) - for col in propertynames(dirty_table)) \ No newline at end of file + for col in propertynames(dirty_table)) diff --git a/experiments/rents/load_data.jl b/experiments/rents/load_data.jl index c417fbc..c25685f 100644 --- a/experiments/rents/load_data.jl +++ b/experiments/rents/load_data.jl @@ -3,8 +3,8 @@ using DataFrames: DataFrame # Load data dataset = "rents" -dirty_table = CSV.File("datasets/$(dataset)_dirty.csv") |> DataFrame -clean_table = CSV.File("datasets/$(dataset)_clean.csv") |> DataFrame +dirty_table = CSV.File("datasets/$(dataset)_dirty.csv", stringtype=String) |> DataFrame +clean_table = CSV.File("datasets/$(dataset)_clean.csv", stringtype=String) |> DataFrame dirty_table[!, :CountyKey] = map(x -> "$(x[1])$(split(x)[1][end])", dirty_table[!, :County])