diff --git a/contrib/pax_storage/src/test/regress/expected/bfv_joins.out b/contrib/pax_storage/src/test/regress/expected/bfv_joins.out index 31515680256..e82e9e158c7 100644 --- a/contrib/pax_storage/src/test/regress/expected/bfv_joins.out +++ b/contrib/pax_storage/src/test/regress/expected/bfv_joins.out @@ -4190,6 +4190,36 @@ INSERT INTO ext_stats_tbl VALUES('tC', true); ANALYZE ext_stats_tbl; explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2; ERROR: FULL JOIN is only supported with merge-joinable or hash-joinable join conditions +-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a +-- scan filter on the outer relation. When the same outer relation feeds +-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there +-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's +-- own outer child, discarding outer rows that should be null-padded. +create table loj_bool_x(c1 boolean); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +create table loj_bool_y1(c1 boolean); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +create table loj_bool_y2(c1 boolean); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into loj_bool_x values (true), (false), (false); +insert into loj_bool_y1 values (true); +insert into loj_bool_y2 values (true); +-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2. +-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x. +select loj_bool_x.c1, loj_bool_y2.c1 as y2c1 + from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1 + left join loj_bool_y2 on loj_bool_x.c1 + where loj_bool_y2.c1 is null + order by 1, 2; + c1 | y2c1 +----+------ + f | + f | +(2 rows) + -- Clean up. None of the objects we create are very interesting to keep around. reset search_path; set client_min_messages='warning'; diff --git a/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out b/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out index af48a5dd8d9..6426bf4f8fb 100644 --- a/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out @@ -4215,6 +4215,36 @@ INSERT INTO ext_stats_tbl VALUES('tC', true); ANALYZE ext_stats_tbl; explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2; ERROR: FULL JOIN is only supported with merge-joinable or hash-joinable join conditions +-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a +-- scan filter on the outer relation. When the same outer relation feeds +-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there +-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's +-- own outer child, discarding outer rows that should be null-padded. +create table loj_bool_x(c1 boolean); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +create table loj_bool_y1(c1 boolean); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +create table loj_bool_y2(c1 boolean); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into loj_bool_x values (true), (false), (false); +insert into loj_bool_y1 values (true); +insert into loj_bool_y2 values (true); +-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2. +-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x. +select loj_bool_x.c1, loj_bool_y2.c1 as y2c1 + from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1 + left join loj_bool_y2 on loj_bool_x.c1 + where loj_bool_y2.c1 is null + order by 1, 2; + c1 | y2c1 +----+------ + f | + f | +(2 rows) + -- Clean up. None of the objects we create are very interesting to keep around. reset search_path; set client_min_messages='warning'; diff --git a/contrib/pax_storage/src/test/regress/expected/join_optimizer.out b/contrib/pax_storage/src/test/regress/expected/join_optimizer.out index 8dc204ae381..7b87831fce5 100644 --- a/contrib/pax_storage/src/test/regress/expected/join_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/join_optimizer.out @@ -6473,13 +6473,12 @@ select 1 from a t1 -> Broadcast Motion 3:3 (slice2; segments: 3) -> Nested Loop Left Join Join Filter: (t2.id = 1) - -> Index Only Scan using a_pkey on a t2 - Index Cond: (id = 1) + -> Seq Scan on a t2 -> Materialize -> Broadcast Motion 3:3 (slice3; segments: 3) -> Seq Scan on a t3 Optimizer: GPORCA -(15 rows) +(14 rows) -- check join removal works when uniqueness of the join condition is enforced -- by a UNION diff --git a/contrib/pax_storage/src/test/regress/sql/bfv_joins.sql b/contrib/pax_storage/src/test/regress/sql/bfv_joins.sql index edc39f58a7d..cb4acd0a9c6 100644 --- a/contrib/pax_storage/src/test/regress/sql/bfv_joins.sql +++ b/contrib/pax_storage/src/test/regress/sql/bfv_joins.sql @@ -604,6 +604,26 @@ ANALYZE ext_stats_tbl; explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2; +-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a +-- scan filter on the outer relation. When the same outer relation feeds +-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there +-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's +-- own outer child, discarding outer rows that should be null-padded. +create table loj_bool_x(c1 boolean); +create table loj_bool_y1(c1 boolean); +create table loj_bool_y2(c1 boolean); +insert into loj_bool_x values (true), (false), (false); +insert into loj_bool_y1 values (true); +insert into loj_bool_y2 values (true); + +-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2. +-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x. +select loj_bool_x.c1, loj_bool_y2.c1 as y2c1 + from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1 + left join loj_bool_y2 on loj_bool_x.c1 + where loj_bool_y2.c1 is null + order by 1, 2; + -- Clean up. None of the objects we create are very interesting to keep around. reset search_path; set client_min_messages='warning'; diff --git a/src/backend/gporca/libgpopt/src/operators/CNormalizer.cpp b/src/backend/gporca/libgpopt/src/operators/CNormalizer.cpp index 38c8a93a9ab..1a650a3831f 100644 --- a/src/backend/gporca/libgpopt/src/operators/CNormalizer.cpp +++ b/src/backend/gporca/libgpopt/src/operators/CNormalizer.cpp @@ -259,9 +259,60 @@ CNormalizer::PushThruOuterChild(CMemoryPool *mp, CExpression *pexpr, CExpression *pexprInner = (*pexpr)[1]; CExpression *pexprPred = (*pexpr)[2]; + // Strip from the incoming conjunct any conjunct that structurally matches + // a conjunct of the LOJ's own ON predicate. Pushing such a conjunct onto + // the LOJ's outer would duplicate the ON pred as a scan filter on the + // outer relation, which is invalid for LOJ semantics: outer rows that + // don't satisfy the ON pred must still appear in the result (null-padded), + // so they must not be filtered out below the join. + CExpression *pexprConjEffective = pexprConj; + CExpression *pexprConjOwned = nullptr; + { + CExpressionArray *pdrgpexprConjAll = + CPredicateUtils::PdrgpexprConjuncts(mp, pexprConj); + CExpressionArray *pdrgpexprOnConj = + CPredicateUtils::PdrgpexprConjuncts(mp, pexprPred); + CExpressionArray *pdrgpexprFiltered = + GPOS_NEW(mp) CExpressionArray(mp); + BOOL fAnyStripped = false; + for (ULONG ul = 0; ul < pdrgpexprConjAll->Size(); ul++) + { + CExpression *pexprC = (*pdrgpexprConjAll)[ul]; + BOOL fMatchesOn = false; + for (ULONG uo = 0; uo < pdrgpexprOnConj->Size(); uo++) + { + if (pexprC->Matches((*pdrgpexprOnConj)[uo])) + { + fMatchesOn = true; + break; + } + } + if (fMatchesOn) + { + fAnyStripped = true; + continue; + } + pexprC->AddRef(); + pdrgpexprFiltered->Append(pexprC); + } + pdrgpexprConjAll->Release(); + pdrgpexprOnConj->Release(); + + if (fAnyStripped) + { + pexprConjOwned = + CPredicateUtils::PexprConjunction(mp, pdrgpexprFiltered); + pexprConjEffective = pexprConjOwned; + } + else + { + pdrgpexprFiltered->Release(); + } + } + CExpressionArray *pdrgpexprPushable = nullptr; CExpressionArray *pdrgpexprUnpushable = nullptr; - SplitConjunct(mp, pexprOuter, pexprConj, &pdrgpexprPushable, + SplitConjunct(mp, pexprOuter, pexprConjEffective, &pdrgpexprPushable, &pdrgpexprUnpushable); if (0 < pdrgpexprPushable->Size()) @@ -323,6 +374,7 @@ CNormalizer::PushThruOuterChild(CMemoryPool *mp, CExpression *pexpr, pdrgpexprPushable->Release(); pdrgpexprUnpushable->Release(); + CRefCount::SafeRelease(pexprConjOwned); } diff --git a/src/test/regress/expected/bfv_joins.out b/src/test/regress/expected/bfv_joins.out index da6e7481318..ff7947488c8 100644 --- a/src/test/regress/expected/bfv_joins.out +++ b/src/test/regress/expected/bfv_joins.out @@ -4235,6 +4235,36 @@ select (trunc(extract(epoch from now())) - :unix_time1) < 100 is_ok; (1 row) reset optimizer; +-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a +-- scan filter on the outer relation. When the same outer relation feeds +-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there +-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's +-- own outer child, discarding outer rows that should be null-padded. +create table loj_bool_x(c1 boolean); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +create table loj_bool_y1(c1 boolean); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +create table loj_bool_y2(c1 boolean); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into loj_bool_x values (true), (false), (false); +insert into loj_bool_y1 values (true); +insert into loj_bool_y2 values (true); +-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2. +-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x. +select loj_bool_x.c1, loj_bool_y2.c1 as y2c1 + from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1 + left join loj_bool_y2 on loj_bool_x.c1 + where loj_bool_y2.c1 is null + order by 1, 2; + c1 | y2c1 +----+------ + f | + f | +(2 rows) + -- Clean up. None of the objects we create are very interesting to keep around. reset search_path; set client_min_messages='warning'; diff --git a/src/test/regress/expected/bfv_joins_optimizer.out b/src/test/regress/expected/bfv_joins_optimizer.out index 934b682492b..d8cb7b7a425 100644 --- a/src/test/regress/expected/bfv_joins_optimizer.out +++ b/src/test/regress/expected/bfv_joins_optimizer.out @@ -4252,6 +4252,36 @@ select (trunc(extract(epoch from now())) - :unix_time1) < 100 is_ok; (1 row) reset optimizer; +-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a +-- scan filter on the outer relation. When the same outer relation feeds +-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there +-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's +-- own outer child, discarding outer rows that should be null-padded. +create table loj_bool_x(c1 boolean); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +create table loj_bool_y1(c1 boolean); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +create table loj_bool_y2(c1 boolean); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into loj_bool_x values (true), (false), (false); +insert into loj_bool_y1 values (true); +insert into loj_bool_y2 values (true); +-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2. +-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x. +select loj_bool_x.c1, loj_bool_y2.c1 as y2c1 + from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1 + left join loj_bool_y2 on loj_bool_x.c1 + where loj_bool_y2.c1 is null + order by 1, 2; + c1 | y2c1 +----+------ + f | + f | +(2 rows) + -- Clean up. None of the objects we create are very interesting to keep around. reset search_path; set client_min_messages='warning'; diff --git a/src/test/regress/expected/join_optimizer.out b/src/test/regress/expected/join_optimizer.out index fd987628b10..25859b2b520 100644 --- a/src/test/regress/expected/join_optimizer.out +++ b/src/test/regress/expected/join_optimizer.out @@ -6435,8 +6435,8 @@ select d.* from d left join (select distinct * from b) s explain (costs off) select 1 from a t1 left join (a t2 left join a t3 on t2.id = 1) on t2.id = 1; - QUERY PLAN --------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------- Result -> Gather Motion 3:1 (slice1; segments: 3) -> Nested Loop Left Join @@ -6446,13 +6446,12 @@ select 1 from a t1 -> Broadcast Motion 3:3 (slice2; segments: 3) -> Nested Loop Left Join Join Filter: (t2.id = 1) - -> Index Scan using a_pkey on a t2 - Index Cond: (id = 1) + -> Seq Scan on a t2 -> Materialize -> Broadcast Motion 3:3 (slice3; segments: 3) -> Seq Scan on a t3 Optimizer: GPORCA -(15 rows) +(14 rows) -- check join removal works when uniqueness of the join condition is enforced -- by a UNION diff --git a/src/test/regress/sql/bfv_joins.sql b/src/test/regress/sql/bfv_joins.sql index 3a0fca09fc7..1dca58051c8 100644 --- a/src/test/regress/sql/bfv_joins.sql +++ b/src/test/regress/sql/bfv_joins.sql @@ -649,6 +649,26 @@ select (trunc(extract(epoch from now())) - :unix_time1) < 100 is_ok; reset optimizer; +-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a +-- scan filter on the outer relation. When the same outer relation feeds +-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there +-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's +-- own outer child, discarding outer rows that should be null-padded. +create table loj_bool_x(c1 boolean); +create table loj_bool_y1(c1 boolean); +create table loj_bool_y2(c1 boolean); +insert into loj_bool_x values (true), (false), (false); +insert into loj_bool_y1 values (true); +insert into loj_bool_y2 values (true); + +-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2. +-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x. +select loj_bool_x.c1, loj_bool_y2.c1 as y2c1 + from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1 + left join loj_bool_y2 on loj_bool_x.c1 + where loj_bool_y2.c1 is null + order by 1, 2; + -- Clean up. None of the objects we create are very interesting to keep around. reset search_path; set client_min_messages='warning';