10000 Remove superfluous DISTINCT by dey4ss · Pull Request #2561 · hyrise/hyrise · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Remove superfluous DISTINCT #2561

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
May 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 36 additions & 14 deletions src/lib/optimizer/strategy/dependent_group_by_reduction_rule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

#include <unordered_map>

#include "expression/abstract_expression.hpp"
#include "expression/expression_functional.hpp"
#include "expression/expression_utils.hpp"
#include "hyrise.hpp"
Expand Down Expand Up @@ -81,19 +80,8 @@ void DependentGroupByReductionRule::_apply_to_plan_without_subqueries(
}
auto& aggregate_node = static_cast<AggregateNode&>(*node);

// Early exit: If there are no functional dependencies, we can skip this rule.
const auto& fds = aggregate_node.functional_dependencies();
if (fds.empty()) {
return LQPVisitation::VisitInputs;
}

// --- Preparation ---
// Store a copy of the root's output expressions before applying the rule.
const auto root_output_expressions = lqp_root->output_expressions();
// Also store a copy of the aggregate's output expressions to verify the output column order later on.
const auto initial_aggregate_output_expressions = aggregate_node.output_expressions();

// Gather group-by columns
// --- Preparation --
// Gather group-by columns.
const auto fetch_group_by_columns = [&aggregate_node]() {
auto group_by_columns = ExpressionUnorderedSet{aggregate_node.aggregate_expressions_begin_idx + 1};
auto node_expressions_iter = aggregate_node.node_expressions.cbegin();
Expand All @@ -103,6 +91,40 @@ void DependentGroupByReductionRule::_apply_to_plan_without_subqueries(
};
auto group_by_columns = fetch_group_by_columns();

// Early exit (i): To guarantee distinct results for SELECT DISTINCT clauses, the SQLTranslator adds an
// AggregateNode with the selected attributes as group by columns. If the AggregateNode's input is already unique
// for these columns, remove the entire node.
// Example: SELECT DISTINCT n_nationkey FROM nation is reflected as [ Aggregate GROUP BY n_nationkey ]. The
// AggregateNode does not have more node expressions than the single GROUP BY column and the input (StoredTableNode
// in this case) has a UCC for { n_nationkey }.
if (group_by_columns.size() == node->node_expressions.size() &&
node->left_input()->has_matching_ucc(group_by_columns)) {
const auto& output_expressions = aggregate_node.output_expressions();
// Remove the AggregateNode if it does not limit or reorder the output expressions.
if (expressions_equal(output_expressions, node->left_input()->output_expressions())) {
lqp_remove_node(node);
return LQPVisitation::VisitInputs;
}

// Else, replace it with a ProjectionNode. For instance, SELECT DISTINCT n_name, n_regionkey FROM nation (the
// column order is different than in the original table) turns from [ Aggregate GROUP BY n_name, n_regionkey ] to
// [ Projection n_name, n_regionkey ].
const auto projection_node = ProjectionNode::make(output_expressions);
lqp_replace_node(node, projection_node);
return LQPVisitation::VisitInputs;
}

// Early exit (ii): If there are no functional dependencies, we can skip this rule.
const auto& fds = aggregate_node.functional_dependencies();
if (fds.empty()) {
return LQPVisitation::VisitInputs;
}

// Store a copy of the root's output expressions before applying the rule.
const auto& root_output_expressions = lqp_root->output_expressions();
// Also store a copy of the aggregate's output expressions to verify the output column order later on.
const auto& initial_aggregate_output_expressions = aggregate_node.output_expressions();

// Get a sorted list of ColumnIDs from an FD's set of determinants.
const auto get_column_ids = [&](const auto& determinants) {
auto column_ids = std::vector<ColumnID>{};
Expand Down
10000
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
#pragma once

#include "abstract_rule.hpp"
#include "expression/abstract_expression.hpp"
#include "expression/lqp_column_expression.hpp"

namespace hyrise {

class AbstractLQPNode;
class AggregateNode;
class StoredTableNode;

/**
* In SQL, the returned columns of an aggregate are either "group-by columns" or "aggregate columns", e.g., SUM(a). As
* an example, in TPC-H Q10, the aggregate on customer includes both c_custkey as well as c_name (amongst others) in
Expand Down Expand Up @@ -38,6 +32,9 @@ class StoredTableNode;
* determinant column(s) in this case the group is ensured to be of size one). This rule implements choke point 1.4 of
* "TPC-H Analyzed: Hidden Messages and Lessons Learned from an Influential Benchmark" (Boncz et al.).
* However, not all queries listed in the paper can be optimized yet, since Hyrise lacks foreign key support.
*
* Besides, this rule removes AggregateNodes that are inserted for SELECT DISTINCT clauses if the required columns are
* already distinct.
*/
class DependentGroupByReductionRule : public AbstractRule {
public:
Expand Down
8 changes: 4 additions & 4 deletions src/lib/sql/sql_translator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1186,11 +1186,11 @@ void SQLTranslator::_translate_select_groupby_having(const hsql::SelectStatement
//
// This might create unnecessary aggregate nodes when we already have an aggregation that creates unique results:
// `SELECT DISTINCT a, MIN(b) FROM t GROUP BY a` would have one aggregate that groups by a and calculates MIN(b), and
// one that groups by both a and MIN(b) without calculating anything. Fixing this should be done by an optimizer rule
// that checks for each GROUP BY whether it guarantees the results to be unique or not. Doable, but no priority.
// one that groups by both a and MIN(b) without calculating anything. Fixing this is done by an optimizer rule
// (DependentGroupByReductionRule) that checks if the respective columns are already unique.
if (select.selectDistinct) {
_current_lqp = AggregateNode::make(_unwrap_elements(_inflated_select_list_elements),
std::vector<std::shared_ptr<AbstractExpression>>{}, _current_lqp);
_current_lqp =
AggregateNode::make(_unwrap_elements(_inflated_select_list_elements), expression_vector(), _current_lqp);
}
}

Expand Down
F438
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@ class DependentGroupByReductionRuleTest : public StrategyBaseTest {
void SetUp() override {
auto& storage_manager = Hyrise::get().storage_manager;

TableColumnDefinitions column_definitions{
{"column0", DataType::Int, false}, {"column1", DataType::Int, false}, {"column2", DataType::Int, false}};
TableColumnDefinitions column_definitions{{"column0", DataType::Int, false},
{"column1", DataType::Int, false},
{"column2", DataType::Int, false},
{"column3", DataType::Int, false}};

table_a = std::make_shared<Table>(column_definitions, TableType::Data, ChunkOffset{2}, UseMvcc::Yes);
table_a->add_soft_key_constraint({{ColumnID{0}}, KeyConstraintType::PRIMARY_KEY});
Expand Down Expand Up @@ -60,6 +62,7 @@ class DependentGroupByReductionRuleTest : public StrategyBaseTest {
column_e_0 = stored_table_node_e->get_column("column0");
column_e_1 = stored_table_node_e->get_column("column1");
column_e_2 = stored_table_node_e->get_column("column2");
column_e_3 = stored_table_node_e->get_column("column3");

rule = std::make_shared<DependentGroupByReductionRule>();
}
Expand All @@ -73,7 +76,7 @@ class DependentGroupByReductionRuleTest : public StrategyBaseTest {
std::shared_ptr<LQPColumnExpression> column_b_0, column_b_1, column_b_2;
std::shared_ptr<LQPColumnExpression> column_c_0, column_c_1, column_c_2;
std::shared_ptr<LQPColumnExpression> column_d_0;
std::shared_ptr<LQPColumnExpression> column_e_0, column_e_1, column_e_2;
std::shared_ptr<LQPColumnExpression> column_e_0, column_e_1, column_e_2, column_e_3;
};

// Test simple cases.
Expand All @@ -82,8 +85,8 @@ TEST_F(DependentGroupByReductionRuleTest, SimpleCases) {
{
const auto lqp = PredicateNode::make(equals_(column_a_0, 17), stored_table_node_a);

const auto actual_lqp = apply_rule(rule, lqp);
const auto expected_lqp = lqp->deep_copy();
const auto actual_lqp = apply_rule(rule, lqp);
EXPECT_LQP_EQ(actual_lqp, expected_lqp);
}

Expand All @@ -92,8 +95,8 @@ TEST_F(DependentGroupByReductionRuleTest, SimpleCases) {
const auto lqp =
AggregateNode::make(expression_vector(column_d_0), expression_vector(sum_(column_d_0)), stored_table_node_d);

const auto actual_lqp = apply_rule(rule, lqp);
const auto expected_lqp = lqp->deep_copy();
const auto actual_lqp = apply_rule(rule, lqp);
EXPECT_LQP_EQ(actual_lqp, expected_lqp);
}
}
Expand Down Expand Up @@ -141,8 +144,8 @@ TEST_F(DependentGroupByReductionRuleTest, IncompleteKey) {
stored_table_node_b);
// clang-format on

const auto actual_lqp = apply_rule(rule, lqp);
const auto expected_lqp = lqp->deep_copy();
const auto actual_lqp = apply_rule(rule, lqp);

EXPECT_LQP_EQ(actual_lqp, expected_lqp);
}
Expand All @@ -155,8 +158,8 @@ TEST_F(DependentGroupByReductionRuleTest, FullKeyGroupBy) {
stored_table_node_b);
// clang-format on

const auto actual_lqp = apply_rule(rule, lqp);
const auto expected_lqp = lqp->deep_copy();
const auto actual_lqp = apply_rule(rule, lqp);

EXPECT_LQP_EQ(actual_lqp, expected_lqp);
}
Expand Down Expand Up @@ -267,8 +270,8 @@ TEST_F(DependentGroupByReductionRuleTest, NoAdaptionForNullableColumns) {
stored_table_node_b));
// clang-format on

const auto actual_lqp = apply_rule(rule, lqp);
const auto expected_lqp = lqp->deep_copy();
const auto actual_lqp = apply_rule(rule, lqp);

EXPECT_LQP_EQ(actual_lqp, expected_lqp);
}
Expand All @@ -277,12 +280,12 @@ TEST_F(DependentGroupByReductionRuleTest, NoAdaptionForNullableColumns) {
TEST_F(DependentGroupByReductionRuleTest, ShortConstraintsFirst) {
// clang-format off
const auto lqp =
AggregateNode::make(expression_vector(column_e_0, column_e_1, column_e_2), expression_vector(),
AggregateNode::make(expression_vector(column_e_0, column_e_1, column_e_2), expression_vector(min_(column_e_3)),
stored_table_node_e);

const auto expected_lqp =
ProjectionNode::make(expression_vector(column_e_0, column_e_1, column_e_2),
AggregateNode::make(expression_vector(column_e_2), expression_vector(any_(column_e_1), any_(column_e_0)),
ProjectionNode::make(expression_vector(column_e_0, column_e_1, column_e_2, min_(column_e_3)),
AggregateNode::make(expression_vector(column_e_2), expression_vector(min_(column_e_3), any_(column_e_1), any_(column_e_0)), // NOLINT(whitespace/line_length)
stored_table_node_e));
// clang-format on

Expand Down Expand Up @@ -322,4 +325,109 @@ TEST_F(DependentGroupByReductionRuleTest, MultiKeyReduction) {
EXPECT_LQP_EQ(actual_lqp, expected_lqp);
}

TEST_F(DependentGroupByReductionRuleTest, RemoveSuperfluousDistinctAggregateSimple) {
// To guarantee distinct results for SELECT DISTINCT clauses, the SQLTranslator adds an AggregateNode with the
// selected attributes as group by columns. If the AggregateNode's input is already unique for these columns,
// remove the entire node.

// Basic case: Column is unique due to a constraint (e.g., it is a primary key).
// Example query: SELECT DISTINCT column0 FROM table_a;
{
// clang-format off
const auto lqp =
AggregateNode::make(expression_vector(column_a_0), expression_vector(),
stored_table_node_a);
// clang-format on

stored_table_node_a->set_pruned_column_ids({ColumnID{1}, ColumnID{2}, ColumnID{3}});

const auto expected_lqp = stored_table_node_a->deep_copy();
const auto actual_lqp = apply_rule(rule, lqp);
EXPECT_LQP_EQ(actual_lqp, expected_lqp);
}

// More advanced case: Column is unique due to another operation (e.g., we grouped by it before).
// Example query: SELECT DISTINCT column1, MIN(column2) FROM table_a GROUP BY column1;
{
// clang-format off
const auto lqp =
AggregateNode::make(expression_vector(column_a_1, min_(column_a_2)), expression_vector(),
AggregateNode::make(expression_vector(column_a_1), expression_vector(min_(column_a_2)),
stored_table_node_a));

const auto expected_lqp =
AggregateNode::make(expression_vector(column_a_1), expression_vector(min_(column_a_2)),
stored_table_node_a);
// clang-format on

const auto actual_lqp = apply_rule(rule, lqp);
EXPECT_LQP_EQ(actual_lqp, expected_lqp);
}
}

TEST_F(DependentGroupByReductionRuleTest, RemoveSuperfluousDistinctAggregateProjectColumns) {
// Remove the AggregateNode completely when it is used to for SELECT DISTINCT on a unique column, but add a
// ProjectionNode to output only the desired column.
{
// clang-format off
const auto lqp =
AggregateNode::make(expression_vector(column_a_0), expression_vector(),
stored_table_node_a);

const auto expected_lqp =
ProjectionNode::make(expression_vector(column_a_0),
stored_table_node_a);
// clang-format on

const auto actual_lqp = apply_rule(rule, lqp);
EXPECT_LQP_EQ(actual_lqp, expected_lqp);
}

// All columns are part of the grouped columns, but the order changes. Thus, we need a ProjectionNode that changes the
// order.
{
// clang-format off
const auto lqp =
AggregateNode::make(expression_vector(column_e_3, column_e_2, column_e_1, column_e_0), expression_vector(),
stored_table_node_e);

const auto expected_lqp =
ProjectionNode::make(expression_vector(column_e_3, column_e_2, column_e_1, column_e_0),
stored_table_node_e);
// clang-format on

const auto actual_lqp = apply_rule(rule, lqp);
EXPECT_LQP_EQ(actual_lqp, expected_lqp);
}
}

TEST_F(DependentGroupByReductionRuleTest, DoNotRemoveRequiredDistinctAggregate) {
// Do not remove the AggregateNode when the grouped column is not unique (only a combination of the two columns
// column0 and column1 is unique).
{
// clang-format off
const auto lqp =
AggregateNode::make(expression_vector(column_b_0), expression_vector(),
stored_table_node_b);
// clang-format on

const auto expected_lqp = lqp->deep_copy();
const auto actual_lqp = apply_rule(rule, lqp);
EXPECT_LQP_EQ(actual_lqp, expected_lqp);
}

// Do not remove the AggregateNode when the grouped column is unique but there are further aggregates.
{
// clang-format off
const auto lqp =
AggregateNode::make(expression_vector(column_a_0), expression_vector(min_(column_a_1)),
stored_table_node_a);
// clang-format on

const auto expected_lqp = lqp->deep_copy();
const auto actual_lqp = apply_rule(rule, lqp);
EXPECT_LQP_EQ(actual_lqp, expected_lqp);
}
}

} // namespace hyrise
0