From 837655db855f51db95b1ab720493c6fb4e7d5e61 Mon Sep 17 00:00:00 2001 From: Victor San Kho Lin Date: Sat, 4 Jan 2025 19:55:16 +1100 Subject: [PATCH] Added OrcaVault raw schema link models * Baseline linking between Library hub to other hub models Sample, InternalSubject, Project, SequencingRun, Experiment --- dev/src/load.sh | 2 + dev/src/ods.sql | 54 +++---- .../models/raw/link_library_experiment.sql | 44 ++++++ .../raw/link_library_internal_subject.sql | 51 +++++++ orcavault/models/raw/link_library_project.sql | 50 +++++++ orcavault/models/raw/link_library_sample.sql | 49 +++++++ .../raw/link_library_sequencing_run.sql | 46 ++++++ orcavault/models/raw/link_schema.yml | 133 ++++++++++++++++++ 8 files changed, 402 insertions(+), 27 deletions(-) create mode 100644 orcavault/models/raw/link_library_experiment.sql create mode 100644 orcavault/models/raw/link_library_internal_subject.sql create mode 100644 orcavault/models/raw/link_library_project.sql create mode 100644 orcavault/models/raw/link_library_sample.sql create mode 100644 orcavault/models/raw/link_library_sequencing_run.sql create mode 100644 orcavault/models/raw/link_schema.yml diff --git a/dev/src/load.sh b/dev/src/load.sh index 34d9679..e2b38a2 100644 --- a/dev/src/load.sh +++ b/dev/src/load.sh @@ -11,6 +11,8 @@ PGPASSWORD=dev psql -h 0.0.0.0 -d orcavault -U dev < '') and + (experiment_id is not null and experiment_id <> '') + +), + +transformed as ( + + select + encode(sha256(cast(experiment_id as bytea)), 'hex') as experiment_hk, + encode(sha256(cast(library_id as bytea)), 'hex') as library_hk, + cast('{{ run_started_at }}' as timestamptz) as load_datetime, + (select 'lab') as record_source + from + cleaned + +), + +final as ( + + select + encode(sha256(concat(experiment_hk, library_hk)::bytea), 'hex') as library_experiment_hk, + experiment_hk, + library_hk, + load_datetime, + record_source + from + transformed + +) + +select * from final diff --git
a/orcavault/models/raw/link_library_internal_subject.sql b/orcavault/models/raw/link_library_internal_subject.sql
new file mode 100644
index 0000000..848f282
--- /dev/null
+++ b/orcavault/models/raw/link_library_internal_subject.sql
@@ -0,0 +1,51 @@
+with source as (
+    -- Link model: (library_id, internal_subject_id) pairs gathered from three ODS sources.
+    select library_id, subject_id as internal_subject_id from {{ source('ods', 'data_portal_labmetadata') }}
+    union
+    select library_id, subject_id as internal_subject_id from {{ source('ods', 'data_portal_limsrow') }}
+    union
+    select lib.library_id as library_id, idv.individual_id as internal_subject_id from {{ source('ods', 'metadata_manager_library') }} as lib
+        join {{ source('ods', 'metadata_manager_subject') }} as sbj on sbj.orcabus_id = lib.subject_orcabus_id
+        join {{ source('ods', 'metadata_manager_subjectindividuallink') }} as lnk on lnk.subject_orcabus_id = sbj.orcabus_id
+        join {{ source('ods', 'metadata_manager_individual') }} as idv on idv.orcabus_id = lnk.individual_orcabus_id
+
+),
+
+cleaned as (
+    -- Keep only pairs where both business keys are present (non-null and non-empty).
+    select
+        distinct library_id, internal_subject_id
+    from
+        source
+    where
+        (library_id is not null and library_id <> '') and
+        (internal_subject_id is not null and internal_subject_id <> '')
+
+),
+
+transformed as (
+    -- Hash each business key (sha256, hex) and stamp the dbt run start time as load_datetime.
+    select
+        encode(sha256(cast(internal_subject_id as bytea)), 'hex') as internal_subject_hk,
+        encode(sha256(cast(library_id as bytea)), 'hex') as library_hk,
+        cast('{{ run_started_at }}' as timestamptz) as load_datetime,
+        (select 'lab') as record_source
+    from
+        cleaned
+    -- NOTE(review): text-to-bytea casts parse backslash escapes; confirm ids never contain '\'.
+),
+
+final as (
+    -- Link hash key: sha256 (hex) of the two hub hash keys concatenated, internal_subject first.
+    select
+        encode(sha256(concat(internal_subject_hk, library_hk)::bytea), 'hex') as library_internal_subject_hk,
+        internal_subject_hk,
+        library_hk,
+        load_datetime,
+        record_source
+    from
+        transformed
+
+)
+
+select * from final
diff --git a/orcavault/models/raw/link_library_project.sql b/orcavault/models/raw/link_library_project.sql
new file mode 100644
index 0000000..eaa797c
--- /dev/null
+++ b/orcavault/models/raw/link_library_project.sql
@@ -0,0 +1,50 @@
+with source as (
+    -- Link model: (library_id, project_id) pairs gathered from three ODS sources.
+    select library_id, project_name as project_id from {{ source('ods', 'data_portal_labmetadata') }}
+    union
+    select library_id, project_name as project_id from {{ source('ods', 'data_portal_limsrow') }}
+    union
+    select lib.library_id as library_id, prj.project_id as project_id from {{ source('ods', 'metadata_manager_library') }} as lib
+        join {{ source('ods', 'metadata_manager_libraryprojectlink') }} as lnk on lnk.library_orcabus_id = lib.orcabus_id
+        join {{ source('ods', 'metadata_manager_project') }} as prj on lnk.project_orcabus_id = prj.orcabus_id
+
+),
+
+cleaned as (
+    -- Keep only pairs where both business keys are present (non-null and non-empty).
+    select
+        distinct library_id, project_id
+    from
+        source
+    where
+        (library_id is not null and library_id <> '') and
+        (project_id is not null and project_id <> '')
+
+),
+
+transformed as (
+    -- Hash each business key (sha256, hex) and stamp the dbt run start time as load_datetime.
+    select
+        encode(sha256(cast(project_id as bytea)), 'hex') as project_hk,
+        encode(sha256(cast(library_id as bytea)), 'hex') as library_hk,
+        cast('{{ run_started_at }}' as timestamptz) as load_datetime,
+        (select 'lab') as record_source
+    from
+        cleaned
+    -- NOTE(review): text-to-bytea casts parse backslash escapes; confirm ids never contain '\'.
+),
+
+final as (
+    -- Link hash key: sha256 (hex) of the two hub hash keys concatenated, project first.
+    select
+        encode(sha256(concat(project_hk, library_hk)::bytea), 'hex') as library_project_hk,
+        project_hk,
+        library_hk,
+        load_datetime,
+        record_source
+    from
+        transformed
+
+)
+
+select * from final
diff --git a/orcavault/models/raw/link_library_sample.sql b/orcavault/models/raw/link_library_sample.sql
new file mode 100644
index 0000000..84dfe66
--- /dev/null
+++ b/orcavault/models/raw/link_library_sample.sql
@@ -0,0 +1,49 @@
+with source as (
+    -- Link model: (library_id, sample_id) pairs gathered from three ODS sources.
+    select library_id, sample_id from {{ source('ods', 'data_portal_limsrow') }}
+    union
+    select library_id, sample_id from {{ source('ods', 'data_portal_labmetadata') }}
+    union
+    select lib.library_id as library_id, smp.sample_id as sample_id from {{ source('ods', 'metadata_manager_library') }} as lib
+        join {{ source('ods', 'metadata_manager_sample') }} as smp on lib.sample_orcabus_id = smp.orcabus_id
+
+),
+
+cleaned as (
+    -- Keep only pairs where both business keys are present (non-null and non-empty).
+    select
+        distinct library_id, sample_id
+    from
+        source
+    where
+        (library_id is not null and library_id <> '') and
+        (sample_id is not null and sample_id <> '')
+
+),
+
+transformed as (
+    -- Hash each business key (sha256, hex) and stamp the dbt run start time as load_datetime.
+    select
+        encode(sha256(cast(sample_id as bytea)), 'hex') as sample_hk,
+        encode(sha256(cast(library_id as bytea)), 'hex') as library_hk,
+        cast('{{ run_started_at }}' as timestamptz) as load_datetime,
+        (select 'lab') as record_source
+    from
+        cleaned
+    -- NOTE(review): text-to-bytea casts parse backslash escapes; confirm ids never contain '\'.
+),
+
+final as (
+    -- Link hash key: sha256 (hex) of the two hub hash keys concatenated, sample first.
+    select
+        encode(sha256(concat(sample_hk, library_hk)::bytea), 'hex') as library_sample_hk,
+        sample_hk,
+        library_hk,
+        load_datetime,
+        record_source
+    from
+        transformed
+
+)
+
+select * from final
diff --git a/orcavault/models/raw/link_library_sequencing_run.sql b/orcavault/models/raw/link_library_sequencing_run.sql
new file mode 100644
index 0000000..9bd9a82
--- /dev/null
+++ b/orcavault/models/raw/link_library_sequencing_run.sql
@@ -0,0 +1,46 @@
+with source as (
+    -- Link model: (library_id, sequencing_run_id) pairs gathered from two ODS sources.
+    select library_id, instrument_run_id as sequencing_run_id from {{ source('ods', 'data_portal_libraryrun') }}
+    union
+    select library_id, illumina_id as sequencing_run_id from {{ source('ods', 'data_portal_limsrow') }}
+
+),
+
+cleaned as (
+    -- Keep only pairs where both business keys are present (non-null and non-empty).
+    select
+        distinct library_id, sequencing_run_id
+    from
+        source
+    where
+        (library_id is not null and library_id <> '') and
+        (sequencing_run_id is not null and sequencing_run_id <> '')
+
+),
+
+transformed as (
+    -- Hash each business key (sha256, hex) and stamp the dbt run start time as load_datetime.
+    select
+        encode(sha256(cast(sequencing_run_id as bytea)), 'hex') as sequencing_run_hk,
+        encode(sha256(cast(library_id as bytea)), 'hex') as library_hk,
+        cast('{{ run_started_at }}' as timestamptz) as load_datetime,
+        (select 'lab') as record_source
+    from
+        cleaned
+    -- NOTE(review): text-to-bytea casts parse backslash escapes; confirm ids never contain '\'.
+),
+
+final as (
+    -- Link hash key: sha256 (hex) of the two hub hash keys concatenated, sequencing_run first.
+    select
+        encode(sha256(concat(sequencing_run_hk, library_hk)::bytea), 'hex') as library_sequencing_run_hk,
+        sequencing_run_hk,
+        library_hk,
+        load_datetime,
+        record_source
+    from
+        transformed
+
+)
+
+select * from final
diff --git a/orcavault/models/raw/link_schema.yml b/orcavault/models/raw/link_schema.yml
new file mode 100644
index 0000000..32a7d8e
---
/dev/null
+++ b/orcavault/models/raw/link_schema.yml
@@ -0,0 +1,133 @@
+version: 2
+# Contracts and constraints for the raw-schema link models; one entry per link_library_* model.
+models:
+
+  - name: link_library_sequencing_run  # link between hub_library and hub_sequencing_run
+    config:
+      contract: { enforced: true }  # dbt fails the build if the model's columns/types differ from those declared below
+    constraints:
+      - type: primary_key
+        columns: [ library_sequencing_run_hk ]
+      - type: foreign_key
+        columns: [ sequencing_run_hk ]
+        to: ref('hub_sequencing_run')
+        to_columns: [ sequencing_run_hk ]
+      - type: foreign_key
+        columns: [ library_hk ]
+        to: ref('hub_library')
+        to_columns: [ library_hk ]
+    columns:
+      - name: library_sequencing_run_hk
+        data_type: char(64)  # hex-encoded sha256 digest is 64 characters
+      - name: sequencing_run_hk
+        data_type: char(64)
+      - name: library_hk
+        data_type: char(64)
+      - name: load_datetime
+        data_type: timestamptz
+      - name: record_source
+        data_type: varchar(255)
+
+  - name: link_library_sample  # link between hub_library and hub_sample
+    config:
+      contract: { enforced: true }
+    constraints:
+      - type: primary_key
+        columns: [ library_sample_hk ]
+      - type: foreign_key
+        columns: [ sample_hk ]
+        to: ref('hub_sample')
+        to_columns: [ sample_hk ]
+      - type: foreign_key
+        columns: [ library_hk ]
+        to: ref('hub_library')
+        to_columns: [ library_hk ]
+    columns:
+      - name: library_sample_hk
+        data_type: char(64)
+      - name: sample_hk
+        data_type: char(64)
+      - name: library_hk
+        data_type: char(64)
+      - name: load_datetime
+        data_type: timestamptz
+      - name: record_source
+        data_type: varchar(255)
+
+  - name: link_library_internal_subject  # link between hub_library and hub_internal_subject
+    config:
+      contract: { enforced: true }
+    constraints:
+      - type: primary_key
+        columns: [ library_internal_subject_hk ]
+      - type: foreign_key
+        columns: [ internal_subject_hk ]
+        to: ref('hub_internal_subject')
+        to_columns: [ internal_subject_hk ]
+      - type: foreign_key
+        columns: [ library_hk ]
+        to: ref('hub_library')
+        to_columns: [ library_hk ]
+    columns:
+      - name: library_internal_subject_hk
+        data_type: char(64)
+      - name: internal_subject_hk
+        data_type: char(64)
+      - name: library_hk
+        data_type: char(64)
+      - name: load_datetime
+        data_type: timestamptz
+      - name: record_source
+        data_type: varchar(255)
+
+  - name: link_library_experiment  # link between hub_library and hub_experiment
+    config:
+      contract: { enforced: true }
+    constraints:
+      - type: primary_key
+        columns: [ library_experiment_hk ]
+      - type: foreign_key
+        columns: [ experiment_hk ]
+        to: ref('hub_experiment')
+        to_columns: [ experiment_hk ]
+      - type: foreign_key
+        columns: [ library_hk ]
+        to: ref('hub_library')
+        to_columns: [ library_hk ]
+    columns:
+      - name: library_experiment_hk
+        data_type: char(64)
+      - name: experiment_hk
+        data_type: char(64)
+      - name: library_hk
+        data_type: char(64)
+      - name: load_datetime
+        data_type: timestamptz
+      - name: record_source
+        data_type: varchar(255)
+
+  - name: link_library_project  # link between hub_library and hub_project
+    config:
+      contract: { enforced: true }
+    constraints:
+      - type: primary_key
+        columns: [ library_project_hk ]
+      - type: foreign_key
+        columns: [ project_hk ]
+        to: ref('hub_project')
+        to_columns: [ project_hk ]
+      - type: foreign_key
+        columns: [ library_hk ]
+        to: ref('hub_library')
+        to_columns: [ library_hk ]
+    columns:
+      - name: library_project_hk
+        data_type: char(64)
+      - name: project_hk
+        data_type: char(64)
+      - name: library_hk
+        data_type: char(64)
+      - name: load_datetime
+        data_type: timestamptz
+      - name: record_source
+        data_type: varchar(255)