121 lines
4 KiB
Julia
121 lines
4 KiB
Julia
### A Pluto.jl notebook ###
|
|
# v0.12.20
|
|
|
|
using Markdown
|
|
using InteractiveUtils
|
|
|
|
# ╔═╡ 21895968-6c5d-11eb-1800-cd0f921c6d83
|
|
begin
|
|
using DataFrames, CSV
|
|
using Dates
|
|
using Flux
|
|
end
|
|
|
|
# ╔═╡ 326f6ec0-6c5d-11eb-105a-ff2f251973c2
|
|
splitLabel(label) = split(label, " ")[1]
|
|
|
|
# ╔═╡ 3771c65c-6c5d-11eb-24d7-39b38b5bfde6
|
|
function onehot_df(training::DataFrame, key::String, labels::Array, insert::Bool)
|
|
if !insert
|
|
return select(training, Not(key))
|
|
end
|
|
oh_key = Flux.onehotbatch(training[!, key], labels)
|
|
return hcat(select(training, Not(key)), DataFrame(oh_key', (key[1:4]*"_") .* splitLabel.(labels)))
|
|
end
|
|
|
|
# ╔═╡ 3b016552-6c5d-11eb-0818-096189d166bb
|
|
dateConv(d) = Dates.DateTime(d, "yyyy-mm-dd HH:MM:SS")
|
|
|
|
# ╔═╡ 49b50c02-6c5d-11eb-2c20-15a0ffa11887
|
|
convertDay(array) = Dates.value.(convert.(Dates.Day, round.(array, Dates.Day)))
|
|
|
|
# ╔═╡ 4d686d12-6c5d-11eb-0f48-f9ad13682cd2
|
|
begin
|
|
filename_training = "training_SC_GGP_AXA_FR.csv"
|
|
filename_scoring = "scoring_SC_GGP_AXA_FR.csv"
|
|
training = DataFrame(CSV.File(filename_training))
|
|
scoring = DataFrame(CSV.File(filename_scoring))
|
|
|
|
# Date conversion
|
|
# Missing
|
|
today_ = Dates.now()
|
|
today_string = Dates.format(today_, "yyyy-mm-dd HH:MM:SS")
|
|
replace!(training.ouverture_dernier_sinistre, missing => today_string)
|
|
replace!(scoring.ouverture_dernier_sinistre, missing => today_string)
|
|
replace!(training.cloture_dernier_sinistre, missing => today_string)
|
|
replace!(scoring.cloture_dernier_sinistre, missing => today_string)
|
|
|
|
# Date conversion
|
|
date_keys = ["debut_contrat", "ouverture_dernier_sinistre", "cloture_dernier_sinistre"]
|
|
for key in date_keys
|
|
training[!, key] = dateConv.(training[!, key])
|
|
scoring[!, key] = dateConv.(scoring[!, key])
|
|
end
|
|
|
|
# Add duree
|
|
insertcols!(training, "duree_contrat" => convertDay(today_ - training.debut_contrat))
|
|
insertcols!(scoring, "duree_contrat" => convertDay(today_ - scoring.debut_contrat))
|
|
insertcols!(training, "duree_dernier_sinistre" => convertDay(training.cloture_dernier_sinistre - training.ouverture_dernier_sinistre))
|
|
insertcols!(scoring, "duree_dernier_sinistre" => convertDay(scoring.cloture_dernier_sinistre - scoring.ouverture_dernier_sinistre))
|
|
insertcols!(training, "duree_zero_sinistre" => convertDay(today_ - training.cloture_dernier_sinistre))
|
|
insertcols!(scoring, "duree_zero_sinistre" => convertDay(today_ - scoring.cloture_dernier_sinistre))
|
|
|
|
# Remove dates
|
|
for key in date_keys
|
|
select!(training, Not(key))
|
|
select!(scoring, Not(key))
|
|
end
|
|
|
|
# One hot encode categorical
|
|
categorical_keys = ["departement", "categorie_socio_professionnelle", "type_de_bien", "statut"]
|
|
insert_bools = [true, true, true, true]
|
|
for i in 1:4
|
|
key = categorical_keys[i]
|
|
insert = insert_bools[i]
|
|
labels = sort(unique(training[!, key]))
|
|
global training = onehot_df(training, key, labels, insert)
|
|
global scoring = onehot_df(scoring, key, labels, insert)
|
|
end
|
|
|
|
# drop useless
|
|
select!(training, Not("index"))
|
|
select!(scoring, Not("index"))
|
|
|
|
training = Float32.(training)
|
|
scoring = Float32.(scoring)
|
|
|
|
#write to files
|
|
Xscoring = Float32.(scoring)
|
|
CSV.write("x"*filename_scoring, Xscoring)
|
|
|
|
Xtraining = Float32.(select(training, Not("target")))
|
|
CSV.write("x"*filename_training, Xtraining)
|
|
|
|
Ytraining = Float32.(training[!, "target"])
|
|
CSV.write("y"*filename_training, DataFrame(target=Ytraining));
|
|
|
|
end
|
|
|
|
# ╔═╡ 9eb19f36-6c71-11eb-32da-394849a133bd
|
|
training
|
|
|
|
# ╔═╡ a231b4a2-6c71-11eb-306b-cf574cf3d093
|
|
scoring
|
|
|
|
# ╔═╡ 97a6a438-6c72-11eb-3996-0f96b5765cfe
|
|
describe(training)
|
|
|
|
# ╔═╡ 9da64908-6c72-11eb-256e-9f683ffde6bc
|
|
describe(scoring)
|
|
|
|
# ╔═╡ Cell order:
|
|
# ╠═21895968-6c5d-11eb-1800-cd0f921c6d83
|
|
# ╠═326f6ec0-6c5d-11eb-105a-ff2f251973c2
|
|
# ╠═3771c65c-6c5d-11eb-24d7-39b38b5bfde6
|
|
# ╠═3b016552-6c5d-11eb-0818-096189d166bb
|
|
# ╠═49b50c02-6c5d-11eb-2c20-15a0ffa11887
|
|
# ╠═4d686d12-6c5d-11eb-0f48-f9ad13682cd2
|
|
# ╠═9eb19f36-6c71-11eb-32da-394849a133bd
|
|
# ╠═a231b4a2-6c71-11eb-306b-cf574cf3d093
|
|
# ╠═97a6a438-6c72-11eb-3996-0f96b5765cfe
|
|
# ╠═9da64908-6c72-11eb-256e-9f683ffde6bc
|