hyrise · dey4ss · May 17, 2023 · Apr 12, 2023 · Apr 12, 2023 · Apr 12, 2023
diff --git a/src/lib/optimizer/strategy/dependent_group_by_reduction_rule.cpp b/src/lib/optimizer/strategy/dependent_group_by_reduction_rule.cpp
@@ -2,7 +2,6 @@
 
 #include <unordered_map>
 
-#include "expression/abstract_expression.hpp"
 #include "expression/expression_functional.hpp"
 #include "expression/expression_utils.hpp"
 #include "hyrise.hpp"
@@ -81,19 +80,8 @@ void DependentGroupByReductionRule::_apply_to_plan_without_subqueries(
     }
     auto& aggregate_node = static_cast<AggregateNode&>(*node);
 
-    // Early exit: If there are no functional dependencies, we can skip this rule.
-    const auto& fds = aggregate_node.functional_dependencies();
-    if (fds.empty()) {
-      return LQPVisitation::VisitInputs;
-    }
-
-    // --- Preparation ---
-    // Store a copy of the root's output expressions before applying the rule.
-    const auto root_output_expressions = lqp_root->output_expressions();
-    // Also store a copy of the aggregate's output expressions to verify the output column order later on.
-    const auto initial_aggregate_output_expressions = aggregate_node.output_expressions();
-
-    // Gather group-by columns
+    // --- Preparation --
+    // Gather group-by columns.
     const auto fetch_group_by_columns = [&aggregate_node]() {
       auto group_by_columns = ExpressionUnorderedSet{aggregate_node.aggregate_expressions_begin_idx + 1};
       auto node_expressions_iter = aggregate_node.node_expressions.cbegin();
@@ -103,6 +91,40 @@ void DependentGroupByReductionRule::_apply_to_plan_without_subqueries(
     };
     auto group_by_columns = fetch_group_by_columns();
 
+    // Early exit (i): To guarantee distinct results for SELECT DISTINCT clauses, the SQLTranslator adds an
+    // AggregateNode with the selected attributes as group by columns. If the AggregateNode's input is already unique
+    // for these columns, remove the entire node.
+    // Example: SELECT DISTINCT n_nationkey FROM nation is reflected as [ Aggregate GROUP BY n_nationkey ]. The
+    // AggregateNode does not have more node expressions than the single GROUP BY column and the input (StoredTableNode
+    // in this case) has a UCC for { n_nationkey }.
+    if (group_by_columns.size() == node->node_expressions.size() &&
+        node->left_input()->has_matching_ucc(group_by_columns)) {
+      const auto& output_expressions = aggregate_node.output_expressions();
+      // Remove the AggregateNode if it does not limit or reorder the output expressions.
+      if (expressions_equal(output_expressions, node->left_input()->output_expressions())) {
+        lqp_remove_node(node);
+        return LQPVisitation::VisitInputs;
+      }
+
+      // Else, replace it with a ProjectionNode. For instance, SELECT DISTINCT n_name, n_regionkey FROM nation (the
+      // column order is different than in the original table) turns from [ Aggregate GROUP BY n_name, n_regionkey ] to
+      // [ Projection n_name, n_regionkey ].
+      const auto projection_node = ProjectionNode::make(output_expressions);
+      lqp_replace_node(node, projection_node);
+      return LQPVisitation::VisitInputs;
+    }
+
+    // Early exit (ii): If there are no functional dependencies, we can skip this rule.
+    const auto& fds = aggregate_node.functional_dependencies();
+    if (fds.empty()) {
+      return LQPVisitation::VisitInputs;
+    }
+
+    // Store a copy of the root's output expressions before applying the rule.
+    const auto& root_output_expressions = lqp_root->output_expressions();
+    // Also store a copy of the aggregate's output expressions to verify the output column order later on.
+    const auto& initial_aggregate_output_expressions = aggregate_node.output_expressions();
+
     // Get a sorted list of ColumnIDs from an FD's set of determinants.
     const auto get_column_ids = [&](const auto& determinants) {
       auto column_ids = std::vector<ColumnID>{};

diff --git a/src/lib/optimizer/strategy/dependent_group_by_reduction_rule.hpp b/src/lib/optimizer/strategy/dependent_group_by_reduction_rule.hpp
@@ -1,15 +1,9 @@
 #pragma once
 
 #include "abstract_rule.hpp"
-#include "expression/abstract_expression.hpp"
-#include "expression/lqp_column_expression.hpp"
 
 namespace hyrise {
 
-class AbstractLQPNode;
-class AggregateNode;
-class StoredTableNode;
-
 /**
  * In SQL, the returned columns of an aggregate are either "group-by columns" or "aggregate columns", e.g., SUM(a). As
  * an example, in TPC-H Q10, the aggregate on customer includes both c_custkey as well as c_name (amongst others) in
@@ -38,6 +32,9 @@ class StoredTableNode;
  * determinant column(s) in this case the group is ensured to be of size one). This rule implements choke point 1.4 of
  * "TPC-H Analyzed: Hidden Messages and Lessons Learned from an Influential Benchmark" (Boncz et al.).
  * However, not all queries listed in the paper can be optimized yet, since Hyrise lacks foreign key support.
+ *
+ * Besides, this rule removes AggregateNodes that are inserted for SELECT DISTINCT clauses if the required columns are
+ * already distinct.
  */
 class DependentGroupByReductionRule : public AbstractRule {
  public:

diff --git a/src/lib/sql/sql_translator.cpp b/src/lib/sql/sql_translator.cpp
@@ -1186,11 +1186,11 @@ void SQLTranslator::_translate_select_groupby_having(const hsql::SelectStatement
   //
   // This might create unnecessary aggregate nodes when we already have an aggregation that creates unique results:
   // `SELECT DISTINCT a, MIN(b) FROM t GROUP BY a` would have one aggregate that groups by a and calculates MIN(b), and
-  // one that groups by both a and MIN(b) without calculating anything. Fixing this should be done by an optimizer rule
-  // that checks for each GROUP BY whether it guarantees the results to be unique or not. Doable, but no priority.
+  // one that groups by both a and MIN(b) without calculating anything. Fixing this is done by an optimizer rule
+  // (DependentGroupByReductionRule) that checks if the respective columns are already unique.
   if (select.selectDistinct) {
-    _current_lqp = AggregateNode::make(_unwrap_elements(_inflated_select_list_elements),
-                                       std::vector<std::shared_ptr<AbstractExpression>>{}, _current_lqp);
+    _current_lqp =
+        AggregateNode::make(_unwrap_elements(_inflated_select_list_elements), expression_vector(), _current_lqp);
   }
 }
 

diff --git a/src/test/lib/optimizer/strategy/dependent_group_by_reduction_rule_test.cpp b/src/test/lib/optimizer/strategy/dependent_group_by_reduction_rule_test.cpp
@@ -19,8 +19,10 @@ class DependentGroupByReductionRuleTest : public StrategyBaseTest {
   void SetUp() override {
     auto& storage_manager = Hyrise::get().storage_manager;
 
-    TableColumnDefinitions column_definitions{
-        {"column0", DataType::Int, false}, {"column1", DataType::Int, false}, {"column2", DataType::Int, false}};
+    TableColumnDefinitions column_definitions{{"column0", DataType::Int, false},
+                                              {"column1", DataType::Int, false},
+                                              {"column2", DataType::Int, false},
+                                              {"column3", DataType::Int, false}};
 
     table_a = std::make_shared<Table>(column_definitions, TableType::Data, ChunkOffset{2}, UseMvcc::Yes);
     table_a->add_soft_key_constraint({{ColumnID{0}}, KeyConstraintType::PRIMARY_KEY});
@@ -60,6 +62,7 @@ class DependentGroupByReductionRuleTest : public StrategyBaseTest {
     column_e_0 = stored_table_node_e->get_column("column0");
     column_e_1 = stored_table_node_e->get_column("column1");
     column_e_2 = stored_table_node_e->get_column("column2");
+    column_e_3 = stored_table_node_e->get_column("column3");
 
     rule = std::make_shared<DependentGroupByReductionRule>();
   }
@@ -73,7 +76,7 @@ class DependentGroupByReductionRuleTest : public StrategyBaseTest {
   std::shared_ptr<LQPColumnExpression> column_b_0, column_b_1, column_b_2;
   std::shared_ptr<LQPColumnExpression> column_c_0, column_c_1, column_c_2;
   std::shared_ptr<LQPColumnExpression> column_d_0;
-  std::shared_ptr<LQPColumnExpression> column_e_0, column_e_1, column_e_2;
+  std::shared_ptr<LQPColumnExpression> column_e_0, column_e_1, column_e_2, column_e_3;
 };
 
 // Test simple cases.
@@ -82,8 +85,8 @@ TEST_F(DependentGroupByReductionRuleTest, SimpleCases) {
   {
     const auto lqp = PredicateNode::make(equals_(column_a_0, 17), stored_table_node_a);
 
-    const auto actual_lqp = apply_rule(rule, lqp);
     const auto expected_lqp = lqp->deep_copy();
+    const auto actual_lqp = apply_rule(rule, lqp);
     EXPECT_LQP_EQ(actual_lqp, expected_lqp);
   }
 
@@ -92,8 +95,8 @@ TEST_F(DependentGroupByReductionRuleTest, SimpleCases) {
     const auto lqp =
         AggregateNode::make(expression_vector(column_d_0), expression_vector(sum_(column_d_0)), stored_table_node_d);
 
-    const auto actual_lqp = apply_rule(rule, lqp);
     const auto expected_lqp = lqp->deep_copy();
+    const auto actual_lqp = apply_rule(rule, lqp);
     EXPECT_LQP_EQ(actual_lqp, expected_lqp);
   }
 }
@@ -141,8 +144,8 @@ TEST_F(DependentGroupByReductionRuleTest, IncompleteKey) {
     stored_table_node_b);
   // clang-format on
 
-  const auto actual_lqp = apply_rule(rule, lqp);
   const auto expected_lqp = lqp->deep_copy();
+  const auto actual_lqp = apply_rule(rule, lqp);
 
   EXPECT_LQP_EQ(actual_lqp, expected_lqp);
 }
@@ -155,8 +158,8 @@ TEST_F(DependentGroupByReductionRuleTest, FullKeyGroupBy) {
     stored_table_node_b);
   // clang-format on
 
-  const auto actual_lqp = apply_rule(rule, lqp);
   const auto expected_lqp = lqp->deep_copy();
+  const auto actual_lqp = apply_rule(rule, lqp);
 
   EXPECT_LQP_EQ(actual_lqp, expected_lqp);
 }
@@ -267,8 +270,8 @@ TEST_F(DependentGroupByReductionRuleTest, NoAdaptionForNullableColumns) {
       stored_table_node_b));
   // clang-format on
 
-  const auto actual_lqp = apply_rule(rule, lqp);
   const auto expected_lqp = lqp->deep_copy();
+  const auto actual_lqp = apply_rule(rule, lqp);
 
   EXPECT_LQP_EQ(actual_lqp, expected_lqp);
 }
@@ -277,12 +280,12 @@ TEST_F(DependentGroupByReductionRuleTest, NoAdaptionForNullableColumns) {
 TEST_F(DependentGroupByReductionRuleTest, ShortConstraintsFirst) {
   // clang-format off
   const auto lqp =
-  AggregateNode::make(expression_vector(column_e_0, column_e_1, column_e_2), expression_vector(),
+  AggregateNode::make(expression_vector(column_e_0, column_e_1, column_e_2), expression_vector(min_(column_e_3)),
     stored_table_node_e);
 
   const auto expected_lqp =
-  ProjectionNode::make(expression_vector(column_e_0, column_e_1, column_e_2),
-    AggregateNode::make(expression_vector(column_e_2), expression_vector(any_(column_e_1), any_(column_e_0)),
+  ProjectionNode::make(expression_vector(column_e_0, column_e_1, column_e_2, min_(column_e_3)),
+    AggregateNode::make(expression_vector(column_e_2), expression_vector(min_(column_e_3), any_(column_e_1), any_(column_e_0)),  // NOLINT(whitespace/line_length)
       stored_table_node_e));
   // clang-format on
 
@@ -322,4 +325,109 @@ TEST_F(DependentGroupByReductionRuleTest, MultiKeyReduction) {
   EXPECT_LQP_EQ(actual_lqp, expected_lqp);
 }
 
+TEST_F(DependentGroupByReductionRuleTest, RemoveSuperfluousDistinctAggregateSimple) {
+  // To guarantee distinct results for SELECT DISTINCT clauses, the SQLTranslator adds an AggregateNode with the
+  // selected attributes as group by columns. If the AggregateNode's input is already unique for these columns,
+  // remove the entire node.
+
+  // Basic case: Column is unique due to a constraint (e.g., it is a primary key).
+  // Example query: SELECT DISTINCT column0 FROM table_a;
+  {
+    // clang-format off
+    const auto lqp =
+    AggregateNode::make(expression_vector(column_a_0), expression_vector(),
+      stored_table_node_a);
+    // clang-format on
+
+    stored_table_node_a->set_pruned_column_ids({ColumnID{1}, ColumnID{2}, ColumnID{3}});
+
+    const auto expected_lqp = stored_table_node_a->deep_copy();
+    const auto actual_lqp = apply_rule(rule, lqp);
+    EXPECT_LQP_EQ(actual_lqp, expected_lqp);
+  }
+
+  // More advanced case: Column is unique due to another operation (e.g., we grouped by it before).
+  // Example query: SELECT DISTINCT column1, MIN(column2) FROM table_a GROUP BY column1;
+  {
+    // clang-format off
+    const auto lqp =
+    AggregateNode::make(expression_vector(column_a_1, min_(column_a_2)), expression_vector(),
+      AggregateNode::make(expression_vector(column_a_1), expression_vector(min_(column_a_2)),
+        stored_table_node_a));
+
+    const auto expected_lqp =
+    AggregateNode::make(expression_vector(column_a_1), expression_vector(min_(column_a_2)),
+      stored_table_node_a);
+    // clang-format on
+
+    const auto actual_lqp = apply_rule(rule, lqp);
+    EXPECT_LQP_EQ(actual_lqp, expected_lqp);
+  }
+}
+
+TEST_F(DependentGroupByReductionRuleTest, RemoveSuperfluousDistinctAggregateProjectColumns) {
+  // Remove the AggregateNode completely when it is used to for SELECT DISTINCT on a unique column, but add a
+  // ProjectionNode to output only the desired column.
+  {
+    // clang-format off
+    const auto lqp =
+    AggregateNode::make(expression_vector(column_a_0), expression_vector(),
+      stored_table_node_a);
+
+    const auto expected_lqp =
+    ProjectionNode::make(expression_vector(column_a_0),
+      stored_table_node_a);
+    // clang-format on
+
+    const auto actual_lqp = apply_rule(rule, lqp);
+    EXPECT_LQP_EQ(actual_lqp, expected_lqp);
+  }
+
+  // All columns are part of the grouped columns, but the order changes. Thus, we need a ProjectionNode that changes the
+  // order.
+  {
+    // clang-format off
+    const auto lqp =
+    AggregateNode::make(expression_vector(column_e_3, column_e_2, column_e_1, column_e_0), expression_vector(),
+      stored_table_node_e);
+
+    const auto expected_lqp =
+    ProjectionNode::make(expression_vector(column_e_3, column_e_2, column_e_1, column_e_0),
+      stored_table_node_e);
+    // clang-format on
+
+    const auto actual_lqp = apply_rule(rule, lqp);
+    EXPECT_LQP_EQ(actual_lqp, expected_lqp);
+  }
+}
+
+TEST_F(DependentGroupByReductionRuleTest, DoNotRemoveRequiredDistinctAggregate) {
+  // Do not remove the AggregateNode when the grouped column is not unique (only a combination of the two columns
+  // column0 and column1 is unique).
+  {
+    // clang-format off
+    const auto lqp =
+    AggregateNode::make(expression_vector(column_b_0), expression_vector(),
+      stored_table_node_b);
+    // clang-format on
+
+    const auto expected_lqp = lqp->deep_copy();
+    const auto actual_lqp = apply_rule(rule, lqp);
+    EXPECT_LQP_EQ(actual_lqp, expected_lqp);
+  }
+
+  // Do not remove the AggregateNode when the grouped column is unique but there are further aggregates.
+  {
+    // clang-format off
+    const auto lqp =
+    AggregateNode::make(expression_vector(column_a_0), expression_vector(min_(column_a_1)),
+      stored_table_node_a);
+    // clang-format on
+
+    const auto expected_lqp = lqp->deep_copy();
+    const auto actual_lqp = apply_rule(rule, lqp);
+    EXPECT_LQP_EQ(actual_lqp, expected_lqp);
+  }
+}
+
 }  // namespace hyrise