From 1701977670b9e23931b86c6e00363ef9a5f94fa0 Mon Sep 17 00:00:00 2001 From: Gun9niR Date: Sat, 30 Mar 2024 18:09:14 -0400 Subject: [PATCH 1/9] add cost model for agg --- optd-datafusion-repr/src/cost/base_cost.rs | 72 +++++++- optd-datafusion-repr/src/properties/schema.rs | 15 +- optd-sqlplannertest/src/lib.rs | 1 - optd-sqlplannertest/tests/tpch.planner.sql | 164 ++++++++++++++++++ optd-sqlplannertest/tests/tpch.yml | 31 ++++ 5 files changed, 278 insertions(+), 5 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 55a789ee..1bd08f68 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -1,7 +1,7 @@ use std::{collections::HashMap, sync::Arc}; use crate::plan_nodes::{ - BinOpType, ColumnRefExpr, ConstantExpr, ConstantType, LogOpType, OptRelNode, UnOpType, + BinOpType, ColumnRefExpr, ConstantExpr, ConstantType, ExprList, LogOpType, OptRelNode, UnOpType, }; use crate::properties::column_ref::{ColumnRefPropertyBuilder, GroupColumnRefs}; use crate::{ @@ -323,6 +323,8 @@ const DEFAULT_EQ_SEL: f64 = 0.005; const DEFAULT_INEQ_SEL: f64 = 0.3333333333333333; // Default selectivity estimate for pattern-match operators such as LIKE const DEFAULT_MATCH_SEL: f64 = 0.005; +// Default n-distinct estimate for derived columns or columns lacking statistics +const DEFAULT_N_DISTINCT: u64 = 1; const INVALID_SEL: f64 = 0.01; @@ -499,10 +501,15 @@ impl CostModel for OptCostM Self::cost(row_cnt, row_cnt * row_cnt.ln_1p().max(1.0), 0.0) } OptRelNodeTyp::PhysicalAgg => { - let (row_cnt, _, _) = Self::cost_tuple(&children[0]); + let row_cnt = self.get_agg_row_cnt(context, optimizer); + let child_row_cnt = Self::row_cnt(&children[0]); let (_, compute_cost_1, _) = Self::cost_tuple(&children[1]); let (_, compute_cost_2, _) = Self::cost_tuple(&children[2]); - Self::cost(row_cnt, row_cnt * (compute_cost_1 + compute_cost_2), 0.0) + Self::cost( + row_cnt, + child_row_cnt * (compute_cost_1 + compute_cost_2), + 0.0, + ) } OptRelNodeTyp::List => { let compute_cost = children @@ -544,6 +551,65 @@ impl OptCostModel { } } + fn get_agg_row_cnt( + &self, + context: Option, + optimizer: Option<&CascadesOptimizer>, + ) -> f64 { + match context { + Some(context) => { + if let Some(optimizer) = optimizer { + let group_by_id = context.children_group_ids[2]; + let mut group_by_exprs: Vec>> = + optimizer.get_all_group_bindings(group_by_id, false); + assert!( + group_by_exprs.len() == 1, + "ExprList expression should be the only expression in the GROUP BY group" + ); + let group_by = group_by_exprs.pop().unwrap(); + let group_by = ExprList::from_rel_node(group_by).unwrap(); + if group_by.is_empty() { + 1.0 + } else { + // Multiply the n-distinct of all the group by columns. + // TODO: improve with multi-dimensional n-distinct + let base_table_col_refs = optimizer + .get_property_by_group::(context.group_id, 1); + base_table_col_refs + .iter() + .take(group_by.len()) + .map(|col_ref| match col_ref { + ColumnRef::BaseTableColumnRef { table, col_idx } => { + let table_stats = self + .per_table_stats_map + .get(table); + let column_stats = table_stats.map( + |table_stats| { + table_stats.per_column_stats_vec + .get(*col_idx).unwrap() + } + ); + + if let Some(Some(column_stats)) = column_stats { + column_stats.ndistinct + } else { + // The column type is not supported or stats are missing. + DEFAULT_N_DISTINCT + } + } + ColumnRef::Derived => DEFAULT_N_DISTINCT, + _ => panic!("GROUP BY base table column ref must either be derived or base table"), + }) + .product::() as f64 + } + } else { + panic!("compute_cost() should not be called if optimizer is None") + } + } + None => panic!("compute_cost() should not be called if context is None"), + } + } + /// The expr_tree input must be a "mixed expression tree" /// An "expression node" refers to a RelNode that returns true for is_expression() /// A "full expression tree" is where every node in the tree is an expression node diff --git a/optd-datafusion-repr/src/properties/schema.rs b/optd-datafusion-repr/src/properties/schema.rs index f4967156..fc2e315c 100644 --- a/optd-datafusion-repr/src/properties/schema.rs +++ b/optd-datafusion-repr/src/properties/schema.rs @@ -4,7 +4,7 @@ use std::sync::Arc; use optd_core::property::PropertyBuilder; use super::DEFAULT_NAME; -use crate::plan_nodes::{ConstantType, EmptyRelationData, OptRelNodeTyp}; +use crate::plan_nodes::{ConstantType, EmptyRelationData, FuncType, OptRelNodeTyp}; #[derive(Clone, Debug, Serialize, Deserialize)] pub struct Field { @@ -96,6 +96,19 @@ impl PropertyBuilder for SchemaPropertyBuilder { children.len() ], }, + OptRelNodeTyp::Agg => { + let mut schema = children[1].clone(); + let schema2 = children[2].clone(); + schema.fields.extend(schema2.fields); + schema + } + OptRelNodeTyp::Func(FuncType::Agg(_)) => Schema { + fields: vec![Field { + name: DEFAULT_NAME.to_string(), + typ: ConstantType::Any, + nullable: true, + }], + }, _ => Schema { fields: vec![] }, } } diff --git a/optd-sqlplannertest/src/lib.rs b/optd-sqlplannertest/src/lib.rs index b62e1082..d69ca31d 100644 --- a/optd-sqlplannertest/src/lib.rs +++ b/optd-sqlplannertest/src/lib.rs @@ -140,7 +140,6 @@ impl DatafusionDBMS { task: &str, flags: &[String], ) -> Result<()> { - println!("task_explain(): called on sql={}", sql); use std::fmt::Write; let with_logical = flags.contains(&"with_logical".to_string()); diff --git a/optd-sqlplannertest/tests/tpch.planner.sql b/optd-sqlplannertest/tests/tpch.planner.sql index 99e00812..ba977f40 100644 --- a/optd-sqlplannertest/tests/tpch.planner.sql +++ b/optd-sqlplannertest/tests/tpch.planner.sql @@ -1522,6 +1522,170 @@ PhysicalLimit { skip: 0, fetch: 20 } └── PhysicalScan { table: nation } */ +-- TPC-H Q11 +select + ps_partkey, + sum(ps_supplycost * ps_availqty) as value +from + partsupp, + supplier, + nation +where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'CHINA' +group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * 0.0001000000 + from + partsupp, + supplier, + nation + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'CHINA' + ) +order by + value desc; + +/* +LogicalSort +├── exprs:SortOrder { order: Desc } +│ └── #1 +└── LogicalProjection { exprs: [ #0, #1 ] } + └── LogicalJoin + ├── join_type: Inner + ├── cond:Gt + │ ├── Cast { cast_to: Decimal128(38, 15), expr: #1 } + │ └── #2 + ├── LogicalAgg + │ ├── exprs:Agg(Sum) + │ │ └── Mul + │ │ ├── #2 + │ │ └── Cast { cast_to: Decimal128(10, 0), expr: #1 } + │ ├── groups: [ #0 ] + │ └── LogicalProjection { exprs: [ #0, #1, #2 ] } + │ └── LogicalJoin + │ ├── join_type: Inner + │ ├── cond:Eq + │ │ ├── #3 + │ │ └── #4 + │ ├── LogicalProjection { exprs: [ #0, #2, #3, #5 ] } + │ │ └── LogicalJoin + │ │ ├── join_type: Inner + │ │ ├── cond:Eq + │ │ │ ├── #1 + │ │ │ └── #4 + │ │ ├── LogicalProjection { exprs: [ #0, #1, #2, #3 ] } + │ │ │ └── LogicalScan { table: partsupp } + │ │ └── LogicalProjection { exprs: [ #0, #3 ] } + │ │ └── LogicalScan { table: supplier } + │ └── LogicalProjection { exprs: [ #0 ] } + │ └── LogicalFilter + │ ├── cond:Eq + │ │ ├── #1 + │ │ └── "CHINA" + │ └── LogicalProjection { exprs: [ #0, #1 ] } + │ └── LogicalScan { table: nation } + └── LogicalProjection + ├── exprs:Cast + │ ├── cast_to: Decimal128(38, 15) + │ ├── expr:Mul + │ │ ├── Cast { cast_to: Float64, expr: #0 } + │ │ └── 0.0001 + + └── LogicalAgg + ├── exprs:Agg(Sum) + │ └── Mul + │ ├── #1 + │ └── Cast { cast_to: Decimal128(10, 0), expr: #0 } + ├── groups: [] + └── LogicalProjection { exprs: [ #0, #1 ] } + └── LogicalJoin + ├── join_type: Inner + ├── cond:Eq + │ ├── #2 + │ └── #3 + ├── LogicalProjection { exprs: [ #1, #2, #4 ] } + │ └── LogicalJoin + │ ├── join_type: Inner + │ ├── cond:Eq + │ │ ├── #0 + │ │ └── #3 + │ ├── LogicalProjection { exprs: [ #1, #2, #3 ] } + │ │ └── LogicalScan { table: partsupp } + │ └── LogicalProjection { exprs: [ #0, #3 ] } + │ └── LogicalScan { table: supplier } + └── LogicalProjection { exprs: [ #0 ] } + └── LogicalFilter + ├── cond:Eq + │ ├── #1 + │ └── "CHINA" + └── LogicalProjection { exprs: [ #0, #1 ] } + └── LogicalScan { table: nation } +PhysicalSort +├── exprs:SortOrder { order: Desc } +│ └── #1 +└── PhysicalProjection { exprs: [ #0, #1 ] } + └── PhysicalNestedLoopJoin + ├── join_type: Inner + ├── cond:Gt + │ ├── Cast { cast_to: Decimal128(38, 15), expr: #1 } + │ └── #0 + ├── PhysicalAgg + │ ├── aggrs:Agg(Sum) + │ │ └── Mul + │ │ ├── #2 + │ │ └── Cast { cast_to: Decimal128(10, 0), expr: #1 } + │ ├── groups: [ #0 ] + │ └── PhysicalProjection { exprs: [ #0, #1, #2 ] } + │ └── PhysicalHashJoin { join_type: Inner, left_keys: [ #3 ], right_keys: [ #0 ] } + │ ├── PhysicalProjection { exprs: [ #0, #2, #3, #5 ] } + │ │ └── PhysicalHashJoin { join_type: Inner, left_keys: [ #1 ], right_keys: [ #0 ] } + │ │ ├── PhysicalProjection { exprs: [ #0, #1, #2, #3 ] } + │ │ │ └── PhysicalScan { table: partsupp } + │ │ └── PhysicalProjection { exprs: [ #0, #3 ] } + │ │ └── PhysicalScan { table: supplier } + │ └── PhysicalProjection { exprs: [ #0 ] } + │ └── PhysicalFilter + │ ├── cond:Eq + │ │ ├── #1 + │ │ └── "CHINA" + │ └── PhysicalProjection { exprs: [ #0, #1 ] } + │ └── PhysicalScan { table: nation } + └── PhysicalProjection + ├── exprs:Cast + │ ├── cast_to: Decimal128(38, 15) + │ ├── expr:Mul + │ │ ├── Cast { cast_to: Float64, expr: #0 } + │ │ └── 0.0001 + + └── PhysicalAgg + ├── aggrs:Agg(Sum) + │ └── Mul + │ ├── #1 + │ └── Cast { cast_to: Decimal128(10, 0), expr: #0 } + ├── groups: [] + └── PhysicalProjection { exprs: [ #0, #1 ] } + └── PhysicalHashJoin { join_type: Inner, left_keys: [ #2 ], right_keys: [ #0 ] } + ├── PhysicalProjection { exprs: [ #1, #2, #4 ] } + │ └── PhysicalHashJoin { join_type: Inner, left_keys: [ #0 ], right_keys: [ #0 ] } + │ ├── PhysicalProjection { exprs: [ #1, #2, #3 ] } + │ │ └── PhysicalScan { table: partsupp } + │ └── PhysicalProjection { exprs: [ #0, #3 ] } + │ └── PhysicalScan { table: supplier } + └── PhysicalProjection { exprs: [ #0 ] } + └── PhysicalFilter + ├── cond:Eq + │ ├── #1 + │ └── "CHINA" + └── PhysicalProjection { exprs: [ #0, #1 ] } + └── PhysicalScan { table: nation } +*/ + -- TPC-H Q12 SELECT l_shipmode, diff --git a/optd-sqlplannertest/tests/tpch.yml b/optd-sqlplannertest/tests/tpch.yml index 2c3fe1dc..4670fa49 100644 --- a/optd-sqlplannertest/tests/tpch.yml +++ b/optd-sqlplannertest/tests/tpch.yml @@ -416,6 +416,37 @@ desc: TPC-H Q10 tasks: - explain:logical_optd,physical_optd +- sql: | + select + ps_partkey, + sum(ps_supplycost * ps_availqty) as value + from + partsupp, + supplier, + nation + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'CHINA' + group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * 0.0001000000 + from + partsupp, + supplier, + nation + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'CHINA' + ) + order by + value desc; + desc: TPC-H Q11 + tasks: + - explain[with_logical]:logical_optd,physical_optd - sql: | SELECT l_shipmode, From 3c627f453035e5594afc9f5ed8d48c270538dd90 Mon Sep 17 00:00:00 2001 From: Gun9niR Date: Sat, 30 Mar 2024 18:15:26 -0400 Subject: [PATCH 2/9] cargo fmt --- optd-datafusion-repr/src/cost/base_cost.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 1bd08f68..27535c53 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -589,7 +589,7 @@ impl OptCostModel { .get(*col_idx).unwrap() } ); - + if let Some(Some(column_stats)) = column_stats { column_stats.ndistinct } else { From ff738aaa773e041c92c6145ebec31292de251fd2 Mon Sep 17 00:00:00 2001 From: Gun9niR Date: Sat, 30 Mar 2024 19:08:22 -0400 Subject: [PATCH 3/9] address comments --- optd-datafusion-repr/src/properties/schema.rs | 50 +++++++++---------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/optd-datafusion-repr/src/properties/schema.rs b/optd-datafusion-repr/src/properties/schema.rs index fc2e315c..0a5e91d9 100644 --- a/optd-datafusion-repr/src/properties/schema.rs +++ b/optd-datafusion-repr/src/properties/schema.rs @@ -12,6 +12,18 @@ pub struct Field { pub typ: ConstantType, pub nullable: bool, } + +impl Field { + /// Generate a field that is only a place holder whose members are never used. + fn placeholder() -> Self { + Self { + name: DEFAULT_NAME.to_string(), + typ: ConstantType::Any, + nullable: true, + } + } +} + #[derive(Clone, Debug, Serialize, Deserialize)] pub struct Schema { pub fields: Vec, @@ -69,16 +81,9 @@ impl PropertyBuilder for SchemaPropertyBuilder { bincode::deserialize(data.as_ref()).unwrap(); empty_relation_data.schema } - OptRelNodeTyp::ColumnRef => { - let data_typ = ConstantType::get_data_type_from_value(&data.unwrap()); - Schema { - fields: vec![Field { - name: DEFAULT_NAME.to_string(), - typ: data_typ, - nullable: true, - }], - } - } + OptRelNodeTyp::ColumnRef => Schema { + fields: vec![Field::placeholder()], + }, OptRelNodeTyp::List => { let mut fields = vec![]; for child in children { @@ -87,27 +92,18 @@ impl PropertyBuilder for SchemaPropertyBuilder { Schema { fields } } OptRelNodeTyp::LogOp(_) => Schema { - fields: vec![ - Field { - name: DEFAULT_NAME.to_string(), - typ: ConstantType::Any, - nullable: true - }; - children.len() - ], + fields: vec![Field::placeholder(); children.len()], }, OptRelNodeTyp::Agg => { - let mut schema = children[1].clone(); - let schema2 = children[2].clone(); - schema.fields.extend(schema2.fields); - schema + let mut group_by_schema = children[1].clone(); + let agg_schema = children[2].clone(); + group_by_schema.fields.extend(agg_schema.fields); + group_by_schema } OptRelNodeTyp::Func(FuncType::Agg(_)) => Schema { - fields: vec![Field { - name: DEFAULT_NAME.to_string(), - typ: ConstantType::Any, - nullable: true, - }], + // TODO: this is just a place holder now. + // The real type should be the column type. + fields: vec![Field::placeholder()], }, _ => Schema { fields: vec![] }, } From 30ea0b5873ca876d5a59dc32c3282a71401d9df2 Mon Sep 17 00:00:00 2001 From: Gun9niR Date: Sat, 30 Mar 2024 19:09:21 -0400 Subject: [PATCH 4/9] use postgres default n_distinct --- optd-datafusion-repr/src/cost/base_cost.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 27535c53..d0dd491b 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -324,7 +324,7 @@ const DEFAULT_INEQ_SEL: f64 = 0.3333333333333333; // Default selectivity estimate for pattern-match operators such as LIKE const DEFAULT_MATCH_SEL: f64 = 0.005; // Default n-distinct estimate for derived columns or columns lacking statistics -const DEFAULT_N_DISTINCT: u64 = 1; +const DEFAULT_N_DISTINCT: u64 = 200; const INVALID_SEL: f64 = 0.01; From 968b44459186fbc14c936d513e91dcea85622977 Mon Sep 17 00:00:00 2001 From: Gun9niR Date: Sat, 30 Mar 2024 19:54:19 -0400 Subject: [PATCH 5/9] use unk_sel --- optd-datafusion-repr/src/cost/base_cost.rs | 149 ++++++++++----------- 1 file changed, 70 insertions(+), 79 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index d0dd491b..0011901b 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -323,6 +323,8 @@ const DEFAULT_EQ_SEL: f64 = 0.005; const DEFAULT_INEQ_SEL: f64 = 0.3333333333333333; // Default selectivity estimate for pattern-match operators such as LIKE const DEFAULT_MATCH_SEL: f64 = 0.005; +// Default selectivity if we have no information +const DEFAULT_UNK_SEL: f64 = 0.005; // Default n-distinct estimate for derived columns or columns lacking statistics const DEFAULT_N_DISTINCT: u64 = 200; @@ -403,37 +405,33 @@ impl CostModel for OptCostM OptRelNodeTyp::PhysicalEmptyRelation => Self::cost(0.5, 0.01, 0.0), OptRelNodeTyp::PhysicalLimit => { let (row_cnt, compute_cost, _) = Self::cost_tuple(&children[0]); - let row_cnt = if let Some(context) = context { - if let Some(optimizer) = optimizer { - let mut fetch_expr = - optimizer.get_all_group_bindings(context.children_group_ids[2], false); - assert!( - fetch_expr.len() == 1, - "fetch expression should be the only expr in the group" - ); - let fetch_expr = fetch_expr.pop().unwrap(); - assert!( - matches!( - fetch_expr.typ, - OptRelNodeTyp::Constant(ConstantType::UInt64) - ), - "fetch type can only be UInt64" - ); - let fetch = ConstantExpr::from_rel_node(fetch_expr) - .unwrap() - .value() - .as_u64(); - // u64::MAX represents None - if fetch == u64::MAX { - row_cnt - } else { - row_cnt.min(fetch as f64) - } + let row_cnt = if let (Some(context), Some(optimizer)) = (context, optimizer) { + let mut fetch_expr = + optimizer.get_all_group_bindings(context.children_group_ids[2], false); + assert!( + fetch_expr.len() == 1, + "fetch expression should be the only expr in the group" + ); + let fetch_expr = fetch_expr.pop().unwrap(); + assert!( + matches!( + fetch_expr.typ, + OptRelNodeTyp::Constant(ConstantType::UInt64) + ), + "fetch type can only be UInt64" + ); + let fetch = ConstantExpr::from_rel_node(fetch_expr) + .unwrap() + .value() + .as_u64(); + // u64::MAX represents None + if fetch == u64::MAX { + row_cnt } else { - panic!("compute_cost() should not be called if optimizer is None") + row_cnt.min(fetch as f64) } } else { - panic!("compute_cost() should not be called if context is None") + (row_cnt * DEFAULT_UNK_SEL).max(1.0) }; Self::cost(row_cnt, compute_cost, 0.0) } @@ -501,8 +499,8 @@ impl CostModel for OptCostM Self::cost(row_cnt, row_cnt * row_cnt.ln_1p().max(1.0), 0.0) } OptRelNodeTyp::PhysicalAgg => { - let row_cnt = self.get_agg_row_cnt(context, optimizer); let child_row_cnt = Self::row_cnt(&children[0]); + let row_cnt = self.get_agg_row_cnt(context, optimizer, child_row_cnt); let (_, compute_cost_1, _) = Self::cost_tuple(&children[1]); let (_, compute_cost_2, _) = Self::cost_tuple(&children[2]); Self::cost( @@ -555,58 +553,51 @@ impl OptCostModel { &self, context: Option, optimizer: Option<&CascadesOptimizer>, + child_row_cnt: f64, ) -> f64 { - match context { - Some(context) => { - if let Some(optimizer) = optimizer { - let group_by_id = context.children_group_ids[2]; - let mut group_by_exprs: Vec>> = - optimizer.get_all_group_bindings(group_by_id, false); - assert!( - group_by_exprs.len() == 1, - "ExprList expression should be the only expression in the GROUP BY group" - ); - let group_by = group_by_exprs.pop().unwrap(); - let group_by = ExprList::from_rel_node(group_by).unwrap(); - if group_by.is_empty() { - 1.0 - } else { - // Multiply the n-distinct of all the group by columns. - // TODO: improve with multi-dimensional n-distinct - let base_table_col_refs = optimizer - .get_property_by_group::(context.group_id, 1); - base_table_col_refs - .iter() - .take(group_by.len()) - .map(|col_ref| match col_ref { - ColumnRef::BaseTableColumnRef { table, col_idx } => { - let table_stats = self - .per_table_stats_map - .get(table); - let column_stats = table_stats.map( - |table_stats| { - table_stats.per_column_stats_vec - .get(*col_idx).unwrap() - } - ); - - if let Some(Some(column_stats)) = column_stats { - column_stats.ndistinct - } else { - // The column type is not supported or stats are missing. - DEFAULT_N_DISTINCT - } - } - ColumnRef::Derived => DEFAULT_N_DISTINCT, - _ => panic!("GROUP BY base table column ref must either be derived or base table"), - }) - .product::() as f64 - } - } else { - panic!("compute_cost() should not be called if optimizer is None") - } + if let (Some(context), Some(optimizer)) = (context, optimizer) { + let group_by_id = context.children_group_ids[2]; + let mut group_by_exprs: Vec>> = + optimizer.get_all_group_bindings(group_by_id, false); + assert!( + group_by_exprs.len() == 1, + "ExprList expression should be the only expression in the GROUP BY group" + ); + let group_by = group_by_exprs.pop().unwrap(); + let group_by = ExprList::from_rel_node(group_by).unwrap(); + if group_by.is_empty() { + 1.0 + } else { + // Multiply the n-distinct of all the group by columns. + // TODO: improve with multi-dimensional n-distinct + let base_table_col_refs = optimizer + .get_property_by_group::(context.group_id, 1); + base_table_col_refs + .iter() + .take(group_by.len()) + .map(|col_ref| match col_ref { + ColumnRef::BaseTableColumnRef { table, col_idx } => { + let table_stats = self.per_table_stats_map.get(table); + let column_stats = table_stats.map(|table_stats| { + table_stats.per_column_stats_vec.get(*col_idx).unwrap() + }); + + if let Some(Some(column_stats)) = column_stats { + column_stats.ndistinct + } else { + // The column type is not supported or stats are missing. + DEFAULT_N_DISTINCT + } + } + ColumnRef::Derived => DEFAULT_N_DISTINCT, + _ => panic!( + "GROUP BY base table column ref must either be derived or base table" + ), + }) + .product::() as f64 } - None => panic!("compute_cost() should not be called if context is None"), + } else { + (row_cnt * DEFAULT_UNK_SEL).max(1.0) } } From 63e744fad789ea8e60ad77ec9be9d11d1ac9dec9 Mon Sep 17 00:00:00 2001 From: Gun9niR Date: Sat, 30 Mar 2024 20:02:26 -0400 Subject: [PATCH 6/9] fix --- optd-datafusion-repr/src/cost/base_cost.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 0011901b..64d85850 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -597,7 +597,7 @@ impl OptCostModel { .product::() as f64 } } else { - (row_cnt * DEFAULT_UNK_SEL).max(1.0) + (child_row_cnt * DEFAULT_UNK_SEL).max(1.0) } } From b0fe07a8be21eab104c20eb278441d4601bc4881 Mon Sep 17 00:00:00 2001 From: Gun9niR Date: Sat, 30 Mar 2024 20:54:34 -0400 Subject: [PATCH 7/9] change produceted type to f64 --- optd-datafusion-repr/src/cost/base_cost.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost.rs b/optd-datafusion-repr/src/cost/base_cost.rs index 64d85850..7b19e1ec 100644 --- a/optd-datafusion-repr/src/cost/base_cost.rs +++ b/optd-datafusion-repr/src/cost/base_cost.rs @@ -11,8 +11,8 @@ use crate::{ use arrow_schema::{ArrowError, DataType}; use datafusion::arrow::array::{ Array, BooleanArray, Date32Array, Decimal128Array, Float32Array, Float64Array, Int16Array, - Int32Array, Int8Array, RecordBatch, RecordBatchIterator, RecordBatchReader, UInt16Array, - UInt32Array, UInt8Array, + Int32Array, Int8Array, RecordBatch, RecordBatchIterator, RecordBatchReader, StringArray, + UInt16Array, UInt32Array, UInt8Array, }; use itertools::Itertools; use optd_core::{ @@ -22,6 +22,7 @@ use optd_core::{ }; use optd_gungnir::stats::hyperloglog::{self, HyperLogLog}; use optd_gungnir::stats::tdigest::{self, TDigest}; +use optd_gungnir::utils::arith_encoder; use serde::{Deserialize, Serialize}; fn compute_plan_node_cost>( @@ -181,6 +182,7 @@ impl DataFusionPerTableStats { | DataType::UInt32 | DataType::Float32 | DataType::Float64 + | DataType::Utf8 ) } @@ -222,6 +224,10 @@ impl DataFusionPerTableStats { val as f64 } + fn str_to_f64(string: &str) -> f64 { + arith_encoder::encode(string) + } + match col_type { DataType::Boolean => { generate_stats_for_col!({ col, distr, hll, BooleanArray, to_f64_safe }) @@ -256,6 +262,9 @@ impl DataFusionPerTableStats { DataType::Decimal128(_, _) => { generate_stats_for_col!({ col, distr, hll, Decimal128Array, i128_to_f64 }) } + DataType::Utf8 => { + generate_stats_for_col!({ col, distr, hll, StringArray, str_to_f64 }) + } _ => unreachable!(), } } @@ -583,18 +592,18 @@ impl OptCostModel { }); if let Some(Some(column_stats)) = column_stats { - column_stats.ndistinct + column_stats.ndistinct as f64 } else { // The column type is not supported or stats are missing. - DEFAULT_N_DISTINCT + DEFAULT_N_DISTINCT as f64 } } - ColumnRef::Derived => DEFAULT_N_DISTINCT, + ColumnRef::Derived => DEFAULT_N_DISTINCT as f64, _ => panic!( "GROUP BY base table column ref must either be derived or base table" ), }) - .product::() as f64 + .product() } } else { (child_row_cnt * DEFAULT_UNK_SEL).max(1.0) From 728797b9bd807314714a17d029d61c8fd13c1583 Mon Sep 17 00:00:00 2001 From: Gun9niR Date: Sat, 30 Mar 2024 20:59:43 -0400 Subject: [PATCH 8/9] implement hll for str --- optd-gungnir/src/stats/hyperloglog.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/optd-gungnir/src/stats/hyperloglog.rs b/optd-gungnir/src/stats/hyperloglog.rs index aca39eb2..f71baa7a 100644 --- a/optd-gungnir/src/stats/hyperloglog.rs +++ b/optd-gungnir/src/stats/hyperloglog.rs @@ -25,10 +25,17 @@ pub struct HyperLogLog { alpha: f64, // The normal HLL multiplier factor. } +// Serialize common data types for hashing (&str). +impl ByteSerializable for &str { + fn to_bytes(&self) -> Vec { + self.as_bytes().to_vec() + } +} + // Serialize common data types for hashing (String). impl ByteSerializable for String { fn to_bytes(&self) -> Vec { - self.as_bytes().to_vec() + self.as_str().to_bytes() } } From 51caa12b17cdf535862b5e61ff43dad4317ae555 Mon Sep 17 00:00:00 2001 From: Gun9niR Date: Sat, 30 Mar 2024 21:12:22 -0400 Subject: [PATCH 9/9] fix test --- optd-datafusion-repr/src/properties/schema.rs | 13 +- optd-sqlplannertest/tests/tpch.planner.sql | 180 +++++++++--------- 2 files changed, 101 insertions(+), 92 deletions(-) diff --git a/optd-datafusion-repr/src/properties/schema.rs b/optd-datafusion-repr/src/properties/schema.rs index 0a5e91d9..04f2e7cb 100644 --- a/optd-datafusion-repr/src/properties/schema.rs +++ b/optd-datafusion-repr/src/properties/schema.rs @@ -81,9 +81,16 @@ impl PropertyBuilder for SchemaPropertyBuilder { bincode::deserialize(data.as_ref()).unwrap(); empty_relation_data.schema } - OptRelNodeTyp::ColumnRef => Schema { - fields: vec![Field::placeholder()], - }, + OptRelNodeTyp::ColumnRef => { + let data_typ = ConstantType::get_data_type_from_value(&data.unwrap()); + Schema { + fields: vec![Field { + name: DEFAULT_NAME.to_string(), + typ: data_typ, + nullable: true, + }], + } + } OptRelNodeTyp::List => { let mut fields = vec![]; for child in children { diff --git a/optd-sqlplannertest/tests/tpch.planner.sql b/optd-sqlplannertest/tests/tpch.planner.sql index ba977f40..1e5dc603 100644 --- a/optd-sqlplannertest/tests/tpch.planner.sql +++ b/optd-sqlplannertest/tests/tpch.planner.sql @@ -1630,50 +1630,51 @@ PhysicalSort ├── exprs:SortOrder { order: Desc } │ └── #1 └── PhysicalProjection { exprs: [ #0, #1 ] } - └── PhysicalNestedLoopJoin - ├── join_type: Inner - ├── cond:Gt - │ ├── Cast { cast_to: Decimal128(38, 15), expr: #1 } - │ └── #0 - ├── PhysicalAgg - │ ├── aggrs:Agg(Sum) - │ │ └── Mul - │ │ ├── #2 - │ │ └── Cast { cast_to: Decimal128(10, 0), expr: #1 } - │ ├── groups: [ #0 ] - │ └── PhysicalProjection { exprs: [ #0, #1, #2 ] } - │ └── PhysicalHashJoin { join_type: Inner, left_keys: [ #3 ], right_keys: [ #0 ] } - │ ├── PhysicalProjection { exprs: [ #0, #2, #3, #5 ] } - │ │ └── PhysicalHashJoin { join_type: Inner, left_keys: [ #1 ], right_keys: [ #0 ] } - │ │ ├── PhysicalProjection { exprs: [ #0, #1, #2, #3 ] } - │ │ │ └── PhysicalScan { table: partsupp } - │ │ └── PhysicalProjection { exprs: [ #0, #3 ] } - │ │ └── PhysicalScan { table: supplier } - │ └── PhysicalProjection { exprs: [ #0 ] } - │ └── PhysicalFilter - │ ├── cond:Eq - │ │ ├── #1 - │ │ └── "CHINA" - │ └── PhysicalProjection { exprs: [ #0, #1 ] } - │ └── PhysicalScan { table: nation } - └── PhysicalProjection - ├── exprs:Cast - │ ├── cast_to: Decimal128(38, 15) - │ ├── expr:Mul - │ │ ├── Cast { cast_to: Float64, expr: #0 } - │ │ └── 0.0001 + └── PhysicalProjection { exprs: [ #0, #1 ] } + └── PhysicalNestedLoopJoin + ├── join_type: Inner + ├── cond:Gt + │ ├── Cast { cast_to: Decimal128(38, 15), expr: #1 } + │ └── #0 + ├── PhysicalProjection + │ ├── exprs:Cast + │ │ ├── cast_to: Decimal128(38, 15) + │ │ ├── expr:Mul + │ │ │ ├── Cast { cast_to: Float64, expr: #0 } + │ │ │ └── 0.0001 + │ └── PhysicalAgg + │ ├── aggrs:Agg(Sum) + │ │ └── Mul + │ │ ├── #1 + │ │ └── Cast { cast_to: Decimal128(10, 0), expr: #0 } + │ ├── groups: [] + │ └── PhysicalProjection { exprs: [ #0, #1 ] } + │ └── PhysicalHashJoin { join_type: Inner, left_keys: [ #2 ], right_keys: [ #0 ] } + │ ├── PhysicalProjection { exprs: [ #1, #2, #4 ] } + │ │ └── PhysicalHashJoin { join_type: Inner, left_keys: [ #0 ], right_keys: [ #0 ] } + │ │ ├── PhysicalProjection { exprs: [ #1, #2, #3 ] } + │ │ │ └── PhysicalScan { table: partsupp } + │ │ └── PhysicalProjection { exprs: [ #0, #3 ] } + │ │ └── PhysicalScan { table: supplier } + │ └── PhysicalProjection { exprs: [ #0 ] } + │ └── PhysicalFilter + │ ├── cond:Eq + │ │ ├── #1 + │ │ └── "CHINA" + │ └── PhysicalProjection { exprs: [ #0, #1 ] } + │ └── PhysicalScan { table: nation } └── PhysicalAgg ├── aggrs:Agg(Sum) │ └── Mul - │ ├── #1 - │ └── Cast { cast_to: Decimal128(10, 0), expr: #0 } - ├── groups: [] - └── PhysicalProjection { exprs: [ #0, #1 ] } - └── PhysicalHashJoin { join_type: Inner, left_keys: [ #2 ], right_keys: [ #0 ] } - ├── PhysicalProjection { exprs: [ #1, #2, #4 ] } - │ └── PhysicalHashJoin { join_type: Inner, left_keys: [ #0 ], right_keys: [ #0 ] } - │ ├── PhysicalProjection { exprs: [ #1, #2, #3 ] } + │ ├── #2 + │ └── Cast { cast_to: Decimal128(10, 0), expr: #1 } + ├── groups: [ #0 ] + └── PhysicalProjection { exprs: [ #0, #1, #2 ] } + └── PhysicalHashJoin { join_type: Inner, left_keys: [ #3 ], right_keys: [ #0 ] } + ├── PhysicalProjection { exprs: [ #0, #2, #3, #5 ] } + │ └── PhysicalHashJoin { join_type: Inner, left_keys: [ #1 ], right_keys: [ #0 ] } + │ ├── PhysicalProjection { exprs: [ #0, #1, #2, #3 ] } │ │ └── PhysicalScan { table: partsupp } │ └── PhysicalProjection { exprs: [ #0, #3 ] } │ └── PhysicalScan { table: supplier } @@ -2016,55 +2017,56 @@ PhysicalSort ├── exprs:SortOrder { order: Asc } │ └── #0 └── PhysicalProjection { exprs: [ #0, #1, #2, #3, #4 ] } - └── PhysicalHashJoin { join_type: Inner, left_keys: [ #4 ], right_keys: [ #0 ] } - ├── PhysicalProjection { exprs: [ #0, #1, #2, #3, #5 ] } - │ └── PhysicalHashJoin { join_type: Inner, left_keys: [ #0 ], right_keys: [ #0 ] } - │ ├── PhysicalProjection { exprs: [ #0, #1, #2, #4 ] } - │ │ └── PhysicalScan { table: supplier } - │ └── PhysicalProjection { exprs: [ #0, #1 ] } - │ └── PhysicalAgg - │ ├── aggrs:Agg(Sum) - │ │ └── Mul - │ │ ├── #1 - │ │ └── Sub - │ │ ├── 1 - │ │ └── #2 - │ ├── groups: [ #0 ] - │ └── PhysicalProjection { exprs: [ #0, #1, #2 ] } - │ └── PhysicalFilter - │ ├── cond:And - │ │ ├── Geq - │ │ │ ├── #3 - │ │ │ └── 8401 - │ │ └── Lt - │ │ ├── #3 - │ │ └── 8491 - │ └── PhysicalProjection { exprs: [ #2, #5, #6, #10 ] } - │ └── PhysicalScan { table: lineitem } - └── PhysicalAgg - ├── aggrs:Agg(Max) - │ └── [ #0 ] - ├── groups: [] - └── PhysicalProjection { exprs: [ #1 ] } - └── PhysicalAgg - ├── aggrs:Agg(Sum) - │ └── Mul - │ ├── #1 - │ └── Sub - │ ├── 1 - │ └── #2 - ├── groups: [ #0 ] - └── PhysicalProjection { exprs: [ #0, #1, #2 ] } - └── PhysicalFilter - ├── cond:And - │ ├── Geq - │ │ ├── #3 - │ │ └── 8401 - │ └── Lt - │ ├── #3 - │ └── 8491 - └── PhysicalProjection { exprs: [ #2, #5, #6, #10 ] } - └── PhysicalScan { table: lineitem } + └── PhysicalProjection { exprs: [ #0, #1, #2, #3, #5, #6 ] } + └── PhysicalHashJoin { join_type: Inner, left_keys: [ #0 ], right_keys: [ #0 ] } + ├── PhysicalProjection { exprs: [ #0, #1, #2, #4 ] } + │ └── PhysicalScan { table: supplier } + └── PhysicalProjection { exprs: [ #0, #1, #2 ] } + └── PhysicalProjection { exprs: [ #1, #2, #0 ] } + └── PhysicalHashJoin { join_type: Inner, left_keys: [ #0 ], right_keys: [ #1 ] } + ├── PhysicalAgg + │ ├── aggrs:Agg(Max) + │ │ └── [ #0 ] + │ ├── groups: [] + │ └── PhysicalProjection { exprs: [ #1 ] } + │ └── PhysicalAgg + │ ├── aggrs:Agg(Sum) + │ │ └── Mul + │ │ ├── #1 + │ │ └── Sub + │ │ ├── 1 + │ │ └── #2 + │ ├── groups: [ #0 ] + │ └── PhysicalProjection { exprs: [ #0, #1, #2 ] } + │ └── PhysicalFilter + │ ├── cond:And + │ │ ├── Geq + │ │ │ ├── #3 + │ │ │ └── 8401 + │ │ └── Lt + │ │ ├── #3 + │ │ └── 8491 + │ └── PhysicalProjection { exprs: [ #2, #5, #6, #10 ] } + │ └── PhysicalScan { table: lineitem } + └── PhysicalAgg + ├── aggrs:Agg(Sum) + │ └── Mul + │ ├── #1 + │ └── Sub + │ ├── 1 + │ └── #2 + ├── groups: [ #0 ] + └── PhysicalProjection { exprs: [ #0, #1, #2 ] } + └── PhysicalFilter + ├── cond:And + │ ├── Geq + │ │ ├── #3 + │ │ └── 8401 + │ └── Lt + │ ├── #3 + │ └── 8491 + └── PhysicalProjection { exprs: [ #2, #5, #6, #10 ] } + └── PhysicalScan { table: lineitem } */ -- TPC-H Q17