Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions contrib/pax_storage/src/test/regress/expected/bfv_joins.out
Original file line number Diff line number Diff line change
Expand Up @@ -4190,6 +4190,36 @@ INSERT INTO ext_stats_tbl VALUES('tC', true);
ANALYZE ext_stats_tbl;
explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2;
ERROR: FULL JOIN is only supported with merge-joinable or hash-joinable join conditions
-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
-- scan filter on the outer relation. When the same outer relation feeds
-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
-- own outer child, discarding outer rows that should be null-padded.
create table loj_bool_x(c1 boolean);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table loj_bool_y1(c1 boolean);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table loj_bool_y2(c1 boolean);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into loj_bool_x values (true), (false), (false);
insert into loj_bool_y1 values (true);
insert into loj_bool_y2 values (true);
-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
left join loj_bool_y2 on loj_bool_x.c1
where loj_bool_y2.c1 is null
order by 1, 2;
c1 | y2c1
----+------
f |
f |
(2 rows)

-- Clean up. None of the objects we create are very interesting to keep around.
reset search_path;
set client_min_messages='warning';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4215,6 +4215,36 @@ INSERT INTO ext_stats_tbl VALUES('tC', true);
ANALYZE ext_stats_tbl;
explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2;
ERROR: FULL JOIN is only supported with merge-joinable or hash-joinable join conditions
-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
-- scan filter on the outer relation. When the same outer relation feeds
-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
-- own outer child, discarding outer rows that should be null-padded.
create table loj_bool_x(c1 boolean);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table loj_bool_y1(c1 boolean);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table loj_bool_y2(c1 boolean);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into loj_bool_x values (true), (false), (false);
insert into loj_bool_y1 values (true);
insert into loj_bool_y2 values (true);
-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
left join loj_bool_y2 on loj_bool_x.c1
where loj_bool_y2.c1 is null
order by 1, 2;
c1 | y2c1
----+------
f |
f |
(2 rows)

-- Clean up. None of the objects we create are very interesting to keep around.
reset search_path;
set client_min_messages='warning';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6473,13 +6473,12 @@ select 1 from a t1
-> Broadcast Motion 3:3 (slice2; segments: 3)
-> Nested Loop Left Join
Join Filter: (t2.id = 1)
-> Index Only Scan using a_pkey on a t2
Index Cond: (id = 1)
-> Seq Scan on a t2
-> Materialize
-> Broadcast Motion 3:3 (slice3; segments: 3)
-> Seq Scan on a t3
Optimizer: GPORCA
(15 rows)
(14 rows)

-- check join removal works when uniqueness of the join condition is enforced
-- by a UNION
Expand Down
20 changes: 20 additions & 0 deletions contrib/pax_storage/src/test/regress/sql/bfv_joins.sql
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,26 @@ ANALYZE ext_stats_tbl;

explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2;

-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
-- scan filter on the outer relation. When the same outer relation feeds
-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
-- own outer child, discarding outer rows that should be null-padded.
create table loj_bool_x(c1 boolean);
create table loj_bool_y1(c1 boolean);
create table loj_bool_y2(c1 boolean);
insert into loj_bool_x values (true), (false), (false);
insert into loj_bool_y1 values (true);
insert into loj_bool_y2 values (true);

-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
left join loj_bool_y2 on loj_bool_x.c1
where loj_bool_y2.c1 is null
order by 1, 2;

-- Clean up. None of the objects we create are very interesting to keep around.
reset search_path;
set client_min_messages='warning';
Expand Down
54 changes: 53 additions & 1 deletion src/backend/gporca/libgpopt/src/operators/CNormalizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,9 +259,60 @@ CNormalizer::PushThruOuterChild(CMemoryPool *mp, CExpression *pexpr,
CExpression *pexprInner = (*pexpr)[1];
CExpression *pexprPred = (*pexpr)[2];

// Remove from the incoming conjunction every conjunct that structurally
// matches a conjunct of the LOJ's own ON predicate. Pushing such a conjunct
// onto the LOJ's outer child would duplicate the ON predicate as a scan
// filter on the outer relation, which is invalid under LOJ semantics: outer
// rows that don't satisfy the ON predicate must still appear in the result
// (null-padded), so they must not be filtered out below the join.
CExpression *pexprConjEffective = pexprConj;
CExpression *pexprConjOwned = nullptr;
{
CExpressionArray *pdrgpexprConjAll =
CPredicateUtils::PdrgpexprConjuncts(mp, pexprConj);
CExpressionArray *pdrgpexprOnConj =
CPredicateUtils::PdrgpexprConjuncts(mp, pexprPred);
CExpressionArray *pdrgpexprFiltered =
GPOS_NEW(mp) CExpressionArray(mp);
BOOL fAnyStripped = false;
for (ULONG ul = 0; ul < pdrgpexprConjAll->Size(); ul++)
{
CExpression *pexprC = (*pdrgpexprConjAll)[ul];
BOOL fMatchesOn = false;
for (ULONG uo = 0; uo < pdrgpexprOnConj->Size(); uo++)
{
if (pexprC->Matches((*pdrgpexprOnConj)[uo]))
{
fMatchesOn = true;
break;
}
}
if (fMatchesOn)
{
fAnyStripped = true;
continue;
}
pexprC->AddRef();
pdrgpexprFiltered->Append(pexprC);
}
pdrgpexprConjAll->Release();
pdrgpexprOnConj->Release();

if (fAnyStripped)
{
pexprConjOwned =
CPredicateUtils::PexprConjunction(mp, pdrgpexprFiltered);
pexprConjEffective = pexprConjOwned;
}
else
{
pdrgpexprFiltered->Release();
}
}

CExpressionArray *pdrgpexprPushable = nullptr;
CExpressionArray *pdrgpexprUnpushable = nullptr;
SplitConjunct(mp, pexprOuter, pexprConj, &pdrgpexprPushable,
SplitConjunct(mp, pexprOuter, pexprConjEffective, &pdrgpexprPushable,
&pdrgpexprUnpushable);

if (0 < pdrgpexprPushable->Size())
Expand Down Expand Up @@ -323,6 +374,7 @@ CNormalizer::PushThruOuterChild(CMemoryPool *mp, CExpression *pexpr,

pdrgpexprPushable->Release();
pdrgpexprUnpushable->Release();
CRefCount::SafeRelease(pexprConjOwned);
}


Expand Down
30 changes: 30 additions & 0 deletions src/test/regress/expected/bfv_joins.out
Original file line number Diff line number Diff line change
Expand Up @@ -4235,6 +4235,36 @@ select (trunc(extract(epoch from now())) - :unix_time1) < 100 is_ok;
(1 row)

reset optimizer;
-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
-- scan filter on the outer relation. When the same outer relation feeds
-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
-- own outer child, discarding outer rows that should be null-padded.
create table loj_bool_x(c1 boolean);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table loj_bool_y1(c1 boolean);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table loj_bool_y2(c1 boolean);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into loj_bool_x values (true), (false), (false);
insert into loj_bool_y1 values (true);
insert into loj_bool_y2 values (true);
-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
left join loj_bool_y2 on loj_bool_x.c1
where loj_bool_y2.c1 is null
order by 1, 2;
c1 | y2c1
----+------
f |
f |
(2 rows)

-- Clean up. None of the objects we create are very interesting to keep around.
reset search_path;
set client_min_messages='warning';
Expand Down
30 changes: 30 additions & 0 deletions src/test/regress/expected/bfv_joins_optimizer.out
Original file line number Diff line number Diff line change
Expand Up @@ -4252,6 +4252,36 @@ select (trunc(extract(epoch from now())) - :unix_time1) < 100 is_ok;
(1 row)

reset optimizer;
-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
-- scan filter on the outer relation. When the same outer relation feeds
-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
-- own outer child, discarding outer rows that should be null-padded.
create table loj_bool_x(c1 boolean);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table loj_bool_y1(c1 boolean);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table loj_bool_y2(c1 boolean);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into loj_bool_x values (true), (false), (false);
insert into loj_bool_y1 values (true);
insert into loj_bool_y2 values (true);
-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
left join loj_bool_y2 on loj_bool_x.c1
where loj_bool_y2.c1 is null
order by 1, 2;
c1 | y2c1
----+------
f |
f |
(2 rows)

-- Clean up. None of the objects we create are very interesting to keep around.
reset search_path;
set client_min_messages='warning';
Expand Down
9 changes: 4 additions & 5 deletions src/test/regress/expected/join_optimizer.out
Original file line number Diff line number Diff line change
Expand Up @@ -6435,8 +6435,8 @@ select d.* from d left join (select distinct * from b) s
explain (costs off)
select 1 from a t1
left join (a t2 left join a t3 on t2.id = 1) on t2.id = 1;
QUERY PLAN
--------------------------------------------------------------------------------------
QUERY PLAN
---------------------------------------------------------------------------------------
Result
-> Gather Motion 3:1 (slice1; segments: 3)
-> Nested Loop Left Join
Expand All @@ -6446,13 +6446,12 @@ select 1 from a t1
-> Broadcast Motion 3:3 (slice2; segments: 3)
-> Nested Loop Left Join
Join Filter: (t2.id = 1)
-> Index Scan using a_pkey on a t2
Index Cond: (id = 1)
-> Seq Scan on a t2
-> Materialize
-> Broadcast Motion 3:3 (slice3; segments: 3)
-> Seq Scan on a t3
Optimizer: GPORCA
(15 rows)
(14 rows)

-- check join removal works when uniqueness of the join condition is enforced
-- by a UNION
Expand Down
20 changes: 20 additions & 0 deletions src/test/regress/sql/bfv_joins.sql
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,26 @@ select (trunc(extract(epoch from now())) - :unix_time1) < 100 is_ok;

reset optimizer;

-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
-- scan filter on the outer relation. When the same outer relation feeds
-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
-- own outer child, discarding outer rows that should be null-padded.
create table loj_bool_x(c1 boolean);
create table loj_bool_y1(c1 boolean);
create table loj_bool_y2(c1 boolean);
insert into loj_bool_x values (true), (false), (false);
insert into loj_bool_y1 values (true);
insert into loj_bool_y2 values (true);

-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
left join loj_bool_y2 on loj_bool_x.c1
where loj_bool_y2.c1 is null
order by 1, 2;

-- Clean up. None of the objects we create are very interesting to keep around.
reset search_path;
set client_min_messages='warning';
Expand Down
Loading