diff --git a/.github/workflows/R_CMD_CHECK.yml b/.github/workflows/R_CMD_CHECK.yml index 2c0fb5794d09..6e0a873f2033 100644 --- a/.github/workflows/R_CMD_CHECK.yml +++ b/.github/workflows/R_CMD_CHECK.yml @@ -79,6 +79,15 @@ jobs: extra-packages: any::rcmdcheck needs: check + - name: Apply duckdb-r patches + shell: bash + working-directory: ${{ env.DUCKDB_SRC }} + run: | + shopt -s nullglob + for filename in .github/patches/duckdb-r/*.patch; do + git apply $filename + done + # needed so we can run git commit in vendor.sh - name: setup github and create parallel builds shell: bash diff --git a/src/common/types/list_segment.cpp b/src/common/types/list_segment.cpp index de350b605cb1..2a14718bf382 100644 --- a/src/common/types/list_segment.cpp +++ b/src/common/types/list_segment.cpp @@ -462,6 +462,10 @@ void SegmentPrimitiveFunction(ListSegmentFunctions &functions) { void GetSegmentDataFunctions(ListSegmentFunctions &functions, const LogicalType &type) { + if (type.id() == LogicalTypeId::UNKNOWN) { + throw ParameterNotResolvedException(); + } + auto physical_type = type.InternalType(); switch (physical_type) { case PhysicalType::BIT: diff --git a/src/common/types/vector.cpp b/src/common/types/vector.cpp index 469b569d71ad..03af4fb904cf 100644 --- a/src/common/types/vector.cpp +++ b/src/common/types/vector.cpp @@ -1131,9 +1131,12 @@ void Vector::VerifyMap(Vector &vector_p, const SelectionVector &sel_p, idx_t cou void Vector::VerifyUnion(Vector &vector_p, const SelectionVector &sel_p, idx_t count) { #ifdef DEBUG + D_ASSERT(vector_p.GetType().id() == LogicalTypeId::UNION); auto valid_check = UnionVector::CheckUnionValidity(vector_p, count, sel_p); - D_ASSERT(valid_check == UnionInvalidReason::VALID); + if (valid_check != UnionInvalidReason::VALID) { + throw InternalException("Union not valid, reason: %s", EnumUtil::ToString(valid_check)); + } #endif // DEBUG } @@ -1250,7 +1253,8 @@ void Vector::Verify(Vector &vector_p, const SelectionVector &sel_p, idx_t count) } if (vector->GetType().id() == LogicalTypeId::UNION) { - VerifyUnion(*vector, sel_p, count); + // Pass in raw vector + VerifyUnion(vector_p, sel_p, count); } } @@ -1968,10 +1972,28 @@ union_tag_t UnionVector::GetTag(const Vector &vector, idx_t index) { return FlatVector::GetData(tag_vector)[index]; } -UnionInvalidReason UnionVector::CheckUnionValidity(Vector &vector_p, idx_t count, const SelectionVector &sel) { - +//! Raw selection vector passed in (not merged with any other selection vectors) +UnionInvalidReason UnionVector::CheckUnionValidity(Vector &vector_p, idx_t count, const SelectionVector &sel_p) { D_ASSERT(vector_p.GetType().id() == LogicalTypeId::UNION); + // Will contain the (possibly) merged selection vector + const SelectionVector *sel = &sel_p; + SelectionVector owned_sel; + Vector *vector = &vector_p; + if (vector->GetVectorType() == VectorType::DICTIONARY_VECTOR) { + // In the case of a dictionary vector, unwrap the Vector, and merge the selection vectors. + auto &child = DictionaryVector::Child(*vector); + D_ASSERT(child.GetVectorType() != VectorType::DICTIONARY_VECTOR); + auto &dict_sel = DictionaryVector::SelVector(*vector); + // merge the selection vectors and verify the child + auto new_buffer = dict_sel.Slice(*sel, count); + owned_sel.Initialize(new_buffer); + sel = &owned_sel; + vector = &child; + } else if (vector->GetVectorType() == VectorType::CONSTANT_VECTOR) { + sel = ConstantVector::ZeroSelectionVector(count, owned_sel); + } + auto member_count = UnionType::GetMemberCount(vector_p.GetType()); if (member_count == 0) { return UnionInvalidReason::NO_MEMBERS; @@ -1981,7 +2003,7 @@ UnionInvalidReason UnionVector::CheckUnionValidity(Vector &vector_p, idx_t count vector_p.ToUnifiedFormat(count, vector_vdata); auto &entries = StructVector::GetEntries(vector_p); - vector child_vdata(entries.size()); + duckdb::vector child_vdata(entries.size()); for (idx_t entry_idx = 0; entry_idx < entries.size(); entry_idx++) { auto &child = *entries[entry_idx]; child.ToUnifiedFormat(count, child_vdata[entry_idx]); @@ -1990,33 +2012,34 @@ UnionInvalidReason UnionVector::CheckUnionValidity(Vector &vector_p, idx_t count auto &tag_vdata = child_vdata[0]; for (idx_t row_idx = 0; row_idx < count; row_idx++) { - auto mapped_idx = sel.get_index(row_idx); + auto mapped_idx = sel->get_index(row_idx); - if (!vector_vdata.validity.RowIsValid(vector_vdata.sel->get_index(mapped_idx))) { + if (!vector_vdata.validity.RowIsValid(mapped_idx)) { continue; } - if (!tag_vdata.validity.RowIsValid(tag_vdata.sel->get_index(mapped_idx))) { + auto tag_idx = tag_vdata.sel->get_index(sel_p.get_index(row_idx)); + if (!tag_vdata.validity.RowIsValid(tag_idx)) { // we can't have NULL tags! return UnionInvalidReason::NULL_TAG; } - - auto tag = UnifiedVectorFormat::GetData(tag_vdata)[tag_vdata.sel->get_index(mapped_idx)]; + auto tag = UnifiedVectorFormat::GetData(tag_vdata)[tag_idx]; if (tag >= member_count) { return UnionInvalidReason::TAG_OUT_OF_RANGE; } bool found_valid = false; - for (idx_t member_idx = 0; member_idx < member_count; member_idx++) { - auto &member_vdata = child_vdata[1 + member_idx]; // skip the tag - if (!member_vdata.validity.RowIsValid(member_vdata.sel->get_index(mapped_idx))) { + for (idx_t i = 0; i < member_count; i++) { + auto &member_vdata = child_vdata[1 + i]; // skip the tag + idx_t member_idx = member_vdata.sel->get_index(sel_p.get_index(row_idx)); + if (!member_vdata.validity.RowIsValid(member_idx)) { continue; } if (found_valid) { return UnionInvalidReason::VALIDITY_OVERLAP; } found_valid = true; - if (tag != static_cast(member_idx)) { + if (tag != static_cast(i)) { return UnionInvalidReason::TAG_MISMATCH; } } diff --git a/test/sql/types/nested/list/list_aggregates.test b/test/sql/types/nested/list/list_aggregates.test index bc5e4c3f25c7..76f3b0196aa9 100644 --- a/test/sql/types/nested/list/list_aggregates.test +++ b/test/sql/types/nested/list/list_aggregates.test @@ -122,3 +122,11 @@ select i, i % 2, list(i) over(partition by i % 2 order by i rows between 1 prece 5 1 [3, 5, 7] 7 1 [5, 7, 9] 9 1 [7, 9] + +# parameter not resolved issue (#484) + +statement ok +PREPARE rebind_stmt AS SELECT list(list_value({'foo': [?]})); + +statement ok +EXECUTE rebind_stmt(10); diff --git a/tools/pythonpkg/duckdb/query_graph/__main__.py b/tools/pythonpkg/duckdb/query_graph/__main__.py new file mode 100644 index 000000000000..974ac89e20de --- /dev/null +++ b/tools/pythonpkg/duckdb/query_graph/__main__.py @@ -0,0 +1,321 @@ +import json +import os +import sys +import webbrowser +from functools import reduce +import argparse + +qgraph_css = """ +.styled-table { + border-collapse: collapse; + margin: 25px 0; + font-size: 0.9em; + font-family: sans-serif; + min-width: 400px; + box-shadow: 0 0 20px rgba(0, 0, 0, 0.15); +} +.styled-table thead tr { + background-color: #009879; + color: #ffffff; + text-align: left; +} +.styled-table th, +.styled-table td { + padding: 12px 15px; +} +.styled-table tbody tr { + border-bottom: 1px solid #dddddd; +} + +.styled-table tbody tr:nth-of-type(even) { + background-color: #f3f3f3; +} + +.styled-table tbody tr:last-of-type { + border-bottom: 2px solid #009879; +} + +.node-body { + font-size:15px; +} +.tf-nc { + position: relative; + width: 250px; + text-align: center; + background-color: #fff100; +} +""" + + +class NodeTiming: + + def __init__(self, phase: str, time: float) -> object: + self.phase = phase + self.time = time + # percentage is determined later. + self.percentage = 0 + + def calculate_percentage(self, total_time: float) -> None: + self.percentage = self.time / total_time + + def combine_timing(l: object, r: object) -> object: + # TODO: can only add timings for same-phase nodes + total_time = l.time + r.time + return NodeTiming(l.phase, total_time) + + +class AllTimings: + + def __init__(self): + self.phase_to_timings = {} + + def add_node_timing(self, node_timing: NodeTiming): + if node_timing.phase in self.phase_to_timings: + self.phase_to_timings[node_timing.phase].append(node_timing) + return + self.phase_to_timings[node_timing.phase] = [node_timing] + + def get_phase_timings(self, phase: str): + return self.phase_to_timings[phase] + + def get_summary_phase_timings(self, phase: str): + return reduce(NodeTiming.combine_timing, self.phase_to_timings[phase]) + + def get_phases(self): + phases = list(self.phase_to_timings.keys()) + phases.sort(key=lambda x: (self.get_summary_phase_timings(x)).time) + phases.reverse() + return phases + + def get_sum_of_all_timings(self): + total_timing_sum = 0 + for phase in self.phase_to_timings.keys(): + total_timing_sum += self.get_summary_phase_timings(phase).time + return total_timing_sum + + +def open_utf8(fpath: str, flags: str) -> object: + return open(fpath, flags, encoding="utf8") + + +def get_child_timings(top_node: object, query_timings: object) -> str: + node_timing = NodeTiming(top_node['name'], float(top_node['timing'])) + query_timings.add_node_timing(node_timing) + for child in top_node['children']: + get_child_timings(child, query_timings) + + +color_map = { + "HASH_JOIN": "#ffffba", + "PROJECTION": "#ffb3ba", + "SEQ_SCAN": "#baffc9", + "UNGROUPED_AGGREGATE": "#ffdfba", + "FILTER": "#bae1ff", + "ORDER_BY": "#facd60", + "PERFECT_HASH_GROUP_BY": "#ffffba", + "HASH_GROUP_BY": "#ffffba", + "NESTED_LOOP_JOIN": "#ffffba", + "STREAMING_LIMIT": "#facd60", + "COLUMN_DATA_SCAN": "#1ac0c6", + "TOP_N": "#ffdfba" +} + + +def get_node_body(name: str, result: str, cardinality: float, extra_info: str, timing: object) -> str: + node_style = "" + stripped_name = name.strip() + if stripped_name in color_map: + node_style = f"background-color: {color_map[stripped_name]};" + + body = f"" + body += "
" + new_name = name.replace("_", " ") + body += f"

{new_name} ({result}s)

" + if extra_info: + extra_info = extra_info.replace("[INFOSEPARATOR]", "----") + extra_info = extra_info.replace("

", "
") + body += f"

{extra_info}

" + body += f"

cardinality = {cardinality}

" + # TODO: Expand on timing. Usually available from a detailed profiling + body += "
" + body += "
" + return body + + +def generate_tree_recursive(json_graph: object) -> str: + node_prefix_html = "
  • " + node_suffix_html = "
  • " + node_body = get_node_body(json_graph["name"], + json_graph["timing"], + json_graph["cardinality"], + json_graph["extra_info"].replace("\n", "
    "), + json_graph["timings"]) + + children_html = "" + if len(json_graph['children']) >= 1: + children_html += "
      " + for child in json_graph["children"]: + children_html += generate_tree_recursive(child) + children_html += "
    " + return node_prefix_html + node_body + children_html + node_suffix_html + + +# For generating the table in the top left. +def generate_timing_html(graph_json: object, query_timings: object) -> object: + json_graph = json.loads(graph_json) + gather_timing_information(json_graph, query_timings) + total_time = float(json_graph['timing']) + table_head = """ + + + + + + + + """ + + table_body = "" + table_end = "
    PhaseTimePercentage
    " + + execution_time = query_timings.get_sum_of_all_timings() + + all_phases = query_timings.get_phases() + query_timings.add_node_timing(NodeTiming("TOTAL TIME", total_time)) + query_timings.add_node_timing(NodeTiming("Execution Time", execution_time)) + all_phases = ["TOTAL TIME", "Execution Time"] + all_phases + for phase in all_phases: + summarized_phase = query_timings.get_summary_phase_timings(phase) + summarized_phase.calculate_percentage(total_time) + phase_column = f"{phase}" if phase == "TOTAL TIME" or phase == "Execution Time" else phase + table_body += f""" + + {phase_column} + {summarized_phase.time} + {str(summarized_phase.percentage * 100)[:6]}% + +""" + table_body += table_end + return table_head + table_body + + +def generate_tree_html(graph_json: object) -> str: + json_graph = json.loads(graph_json) + tree_prefix = "
    \n
      " + tree_suffix = "
    " + # first level of json is general overview + # FIXME: make sure json output first level always has only 1 level + tree_body = generate_tree_recursive(json_graph['children'][0]) + return tree_prefix + tree_body + tree_suffix + + +def generate_ipython(json_input: str) -> str: + from IPython.core.display import HTML + + html_output = generate_html(json_input, False) + + return HTML(("\n" + " ${CSS}\n" + " ${LIBRARIES}\n" + "
    \n" + " ${CHART_SCRIPT}\n" + " ").replace("${CSS}", html_output['css']).replace('${CHART_SCRIPT}', + html_output['chart_script']).replace( + '${LIBRARIES}', html_output['libraries'])) + + +def generate_style_html(graph_json: str, include_meta_info: bool) -> None: + treeflex_css = "\n" + css = "\n" + return { + 'treeflex_css': treeflex_css, + 'duckdb_css': css, + 'libraries': '', + 'chart_script': '' + } + + +def gather_timing_information(json: str, query_timings: object) -> None: + # add up all of the times + # measure each time as a percentage of the total time. + # then you can return a list of [phase, time, percentage] + get_child_timings(json['children'][0], query_timings) + + +def translate_json_to_html(input_file: str, output_file: str) -> None: + query_timings = AllTimings() + with open_utf8(input_file, 'r') as f: + text = f.read() + + html_output = generate_style_html(text, True) + timing_table = generate_timing_html(text, query_timings) + tree_output = generate_tree_html(text) + + # finally create and write the html + with open_utf8(output_file, "w+") as f: + html = """ + + + + + Query Profile Graph for Query + ${TREEFLEX_CSS} + + + +
    +
    + ${TIMING_TABLE} +
    + ${TREE} + + +""" + html = html.replace("${TREEFLEX_CSS}", html_output['treeflex_css']) + html = html.replace("${DUCKDB_CSS}", html_output['duckdb_css']) + html = html.replace("${TIMING_TABLE}", timing_table) + html = html.replace('${TREE}', tree_output) + f.write(html) + + +def main() -> None: + if sys.version_info[0] < 3: + print("Please use python3") + exit(1) + parser = argparse.ArgumentParser( + prog='Query Graph Generator', + description='Given a json profile output, generate a html file showing the query graph and timings of operators') + parser.add_argument('profile_input', help='profile input in json') + parser.add_argument('--out', required=False, default=False) + parser.add_argument('--open', required=False, action='store_true', default=True) + args = parser.parse_args() + + input = args.profile_input + output = args.out + if not args.out: + if ".json" in input: + output = input.replace(".json", ".html") + else: + print("please provide profile output in json") + exit(1) + else: + if ".html" in args.out: + output = args.out + else: + print("please provide valid .html file for output name") + exit(1) + + open_output = args.open + + translate_json_to_html(input, output) + + if open_output: + webbrowser.open('file://' + os.path.abspath(output), new=2) + + +if __name__ == '__main__': + main() diff --git a/tools/pythonpkg/setup.py b/tools/pythonpkg/setup.py index 30f1e1ccddb3..c836548928ae 100644 --- a/tools/pythonpkg/setup.py +++ b/tools/pythonpkg/setup.py @@ -319,6 +319,7 @@ def setup_data_files(data_files): packages = [ lib_name, 'duckdb.typing', + 'duckdb.query_graph', 'duckdb.functional', 'duckdb.value', 'duckdb-stubs', diff --git a/tools/pythonpkg/src/pyconnection.cpp b/tools/pythonpkg/src/pyconnection.cpp index 8161276ccf5a..2e46b858cd6d 100644 --- a/tools/pythonpkg/src/pyconnection.cpp +++ b/tools/pythonpkg/src/pyconnection.cpp @@ -47,6 +47,7 @@ #include "duckdb_python/pybind11/conversions/exception_handling_enum.hpp" #include "duckdb/parser/parsed_data/drop_info.hpp" #include "duckdb/catalog/catalog_entry/scalar_function_catalog_entry.hpp" +#include "duckdb/main/pending_query_result.hpp" #include @@ -448,7 +449,11 @@ unique_ptr DuckDBPyConnection::ExecuteInternal(const string &query, // if there are multiple statements, we directly execute the statements besides the last one // we only return the result of the last statement to the user, unless one of the previous statements fails for (idx_t i = 0; i + 1 < statements.size(); i++) { - // TODO: this doesn't take in any prepared parameters? + if (statements[i]->n_param != 0) { + throw NotImplementedException( + "Prepared parameters are only supported for the last statement, please split your query up into " + "separate 'execute' calls if you want to use prepared parameters"); + } auto pending_query = connection->PendingQuery(std::move(statements[i]), false); auto res = CompletePendingQuery(*pending_query); diff --git a/tools/pythonpkg/tests/fast/api/test_duckdb_execute.py b/tools/pythonpkg/tests/fast/api/test_duckdb_execute.py new file mode 100644 index 000000000000..5e365c8fee7f --- /dev/null +++ b/tools/pythonpkg/tests/fast/api/test_duckdb_execute.py @@ -0,0 +1,38 @@ +import duckdb +import pytest + + +class TestDuckDBExecute(object): + def test_execute_basic(self, duckdb_cursor): + duckdb_cursor.execute('create table t as select 5') + res = duckdb_cursor.table('t').fetchall() + assert res == [(5,)] + + def test_execute_many_basic(self, duckdb_cursor): + duckdb_cursor.execute("create table t(x int);") + + # This works because prepared parameter is only present in the last statement + duckdb_cursor.execute( + """ + delete from t where x=5; + insert into t(x) values($1); + """, + (99,), + ) + res = duckdb_cursor.table('t').fetchall() + assert res == [(99,)] + + def test_execute_many_error(self, duckdb_cursor): + duckdb_cursor.execute("create table t(x int);") + + # Prepared parameter used in a statement that is not the last + with pytest.raises( + duckdb.NotImplementedException, match='Prepared parameters are only supported for the last statement' + ): + duckdb_cursor.execute( + """ + delete from t where x=$1; + insert into t(x) values($1); + """, + (99,), + )