apache · yjhjstz · Jun 29, 2026 · Jun 30, 2026 · Jun 30, 2026
diff --git a/contrib/pax_storage/src/test/regress/expected/bfv_joins.out b/contrib/pax_storage/src/test/regress/expected/bfv_joins.out
@@ -4190,6 +4190,36 @@ INSERT INTO ext_stats_tbl VALUES('tC', true);
 ANALYZE ext_stats_tbl;
 explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2;
 ERROR:  FULL JOIN is only supported with merge-joinable or hash-joinable join conditions
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y1(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y2(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+ where loj_bool_y2.c1 is null
+ order by 1, 2;
+ c1 | y2c1 
+----+------
+ f  | 
+ f  | 
+(2 rows)
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';

diff --git a/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out b/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out
@@ -4215,6 +4215,36 @@ INSERT INTO ext_stats_tbl VALUES('tC', true);
 ANALYZE ext_stats_tbl;
 explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2;
 ERROR:  FULL JOIN is only supported with merge-joinable or hash-joinable join conditions
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y1(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y2(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+ where loj_bool_y2.c1 is null
+ order by 1, 2;
+ c1 | y2c1 
+----+------
+ f  | 
+ f  | 
+(2 rows)
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';

diff --git a/contrib/pax_storage/src/test/regress/expected/join_optimizer.out b/contrib/pax_storage/src/test/regress/expected/join_optimizer.out
@@ -6473,13 +6473,12 @@ select 1 from a t1
                      ->  Broadcast Motion 3:3  (slice2; segments: 3)
                            ->  Nested Loop Left Join
                                  Join Filter: (t2.id = 1)
-                                 ->  Index Only Scan using a_pkey on a t2
-                                       Index Cond: (id = 1)
+                                 ->  Seq Scan on a t2
                                  ->  Materialize
                                        ->  Broadcast Motion 3:3  (slice3; segments: 3)
                                              ->  Seq Scan on a t3
  Optimizer: GPORCA
-(15 rows)
+(14 rows)
 
 -- check join removal works when uniqueness of the join condition is enforced
 -- by a UNION

diff --git a/contrib/pax_storage/src/test/regress/sql/bfv_joins.sql b/contrib/pax_storage/src/test/regress/sql/bfv_joins.sql
@@ -604,6 +604,26 @@ ANALYZE ext_stats_tbl;
 
 explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2;
 
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+create table loj_bool_y1(c1 boolean);
+create table loj_bool_y2(c1 boolean);
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+ where loj_bool_y2.c1 is null
+ order by 1, 2;
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';

diff --git a/src/backend/gporca/libgpopt/src/operators/CNormalizer.cpp b/src/backend/gporca/libgpopt/src/operators/CNormalizer.cpp
@@ -259,9 +259,60 @@ CNormalizer::PushThruOuterChild(CMemoryPool *mp, CExpression *pexpr,
 	CExpression *pexprInner = (*pexpr)[1];
 	CExpression *pexprPred = (*pexpr)[2];
 
+	// Remove from the incoming conjunction every conjunct that structurally
+	// matches a conjunct of this LOJ's own ON predicate: re-applying it as a
+	// filter on the outer child would drop outer rows that fail the ON
+	// predicate, but an LOJ must still return those rows (null-padded).
+	// NOTE(review): a matching conjunct appears to be discarded entirely;
+	// confirm a genuine WHERE filter equal to the ON predicate is not lost.
+	CExpression *pexprConjEffective = pexprConj;
+	CExpression *pexprConjOwned = nullptr;
+	{
+		CExpressionArray *pdrgpexprConjAll =
+			CPredicateUtils::PdrgpexprConjuncts(mp, pexprConj);
+		CExpressionArray *pdrgpexprOnConj =
+			CPredicateUtils::PdrgpexprConjuncts(mp, pexprPred);
+		CExpressionArray *pdrgpexprFiltered =
+			GPOS_NEW(mp) CExpressionArray(mp);
+		BOOL fAnyStripped = false;
+		for (ULONG ul = 0; ul < pdrgpexprConjAll->Size(); ul++)
+		{
+			CExpression *pexprC = (*pdrgpexprConjAll)[ul];
+			BOOL fMatchesOn = false;
+			for (ULONG uo = 0; uo < pdrgpexprOnConj->Size(); uo++)
+			{
+				if (pexprC->Matches((*pdrgpexprOnConj)[uo]))
+				{
+					fMatchesOn = true;
+					break;
+				}
+			}
+			if (fMatchesOn)
+			{
+				fAnyStripped = true;
+				continue;
+			}
+			pexprC->AddRef();
+			pdrgpexprFiltered->Append(pexprC);
+		}
+		pdrgpexprConjAll->Release();
+		pdrgpexprOnConj->Release();
+
+		if (fAnyStripped)
+		{
+			pexprConjOwned =
+				CPredicateUtils::PexprConjunction(mp, pdrgpexprFiltered);
+			pexprConjEffective = pexprConjOwned;
+		}
+		else
+		{
+			pdrgpexprFiltered->Release();
+		}
+	}
+
 	CExpressionArray *pdrgpexprPushable = nullptr;
 	CExpressionArray *pdrgpexprUnpushable = nullptr;
-	SplitConjunct(mp, pexprOuter, pexprConj, &pdrgpexprPushable,
+	SplitConjunct(mp, pexprOuter, pexprConjEffective, &pdrgpexprPushable,
 				  &pdrgpexprUnpushable);
 
 	if (0 < pdrgpexprPushable->Size())
@@ -323,6 +374,7 @@ CNormalizer::PushThruOuterChild(CMemoryPool *mp, CExpression *pexpr,
 
 	pdrgpexprPushable->Release();
 	pdrgpexprUnpushable->Release();
+	CRefCount::SafeRelease(pexprConjOwned);
 }
 
 

diff --git a/src/test/regress/expected/bfv_joins.out b/src/test/regress/expected/bfv_joins.out
@@ -4235,6 +4235,36 @@ select (trunc(extract(epoch from now())) - :unix_time1) < 100 is_ok;
 (1 row)
 
 reset optimizer;
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y1(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y2(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+ where loj_bool_y2.c1 is null
+ order by 1, 2;
+ c1 | y2c1 
+----+------
+ f  | 
+ f  | 
+(2 rows)
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';

diff --git a/src/test/regress/expected/bfv_joins_optimizer.out b/src/test/regress/expected/bfv_joins_optimizer.out
@@ -4252,6 +4252,36 @@ select (trunc(extract(epoch from now())) - :unix_time1) < 100 is_ok;
 (1 row)
 
 reset optimizer;
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y1(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table loj_bool_y2(c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+ where loj_bool_y2.c1 is null
+ order by 1, 2;
+ c1 | y2c1 
+----+------
+ f  | 
+ f  | 
+(2 rows)
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';

diff --git a/src/test/regress/expected/join_optimizer.out b/src/test/regress/expected/join_optimizer.out
@@ -6435,8 +6435,8 @@ select d.* from d left join (select distinct * from b) s
 explain (costs off)
 select 1 from a t1
   left join (a t2 left join a t3 on t2.id = 1) on t2.id = 1;
-                                      QUERY PLAN                                      
---------------------------------------------------------------------------------------
+                                      QUERY PLAN                                       
+---------------------------------------------------------------------------------------
  Result
    ->  Gather Motion 3:1  (slice1; segments: 3)
          ->  Nested Loop Left Join
@@ -6446,13 +6446,12 @@ select 1 from a t1
                      ->  Broadcast Motion 3:3  (slice2; segments: 3)
                            ->  Nested Loop Left Join
                                  Join Filter: (t2.id = 1)
-                                 ->  Index Scan using a_pkey on a t2
-                                       Index Cond: (id = 1)
+                                 ->  Seq Scan on a t2
                                  ->  Materialize
                                        ->  Broadcast Motion 3:3  (slice3; segments: 3)
                                              ->  Seq Scan on a t3
  Optimizer: GPORCA
-(15 rows)
+(14 rows)
 
 -- check join removal works when uniqueness of the join condition is enforced
 -- by a UNION

diff --git a/src/test/regress/sql/bfv_joins.sql b/src/test/regress/sql/bfv_joins.sql
@@ -649,6 +649,26 @@ select (trunc(extract(epoch from now())) - :unix_time1) < 100 is_ok;
 
 reset optimizer;
 
+-- ORCA bug: a boolean ON-clause of a LEFT JOIN must not be pushed down as a
+-- scan filter on the outer relation. When the same outer relation feeds
+-- multiple LEFT JOINs whose ON-clauses use the same boolean column AND there
+-- is a WHERE on top, the normalizer used to push the ON-pred onto the LOJ's
+-- own outer child, discarding outer rows that should be null-padded.
+create table loj_bool_x(c1 boolean);
+create table loj_bool_y1(c1 boolean);
+create table loj_bool_y2(c1 boolean);
+insert into loj_bool_x values (true), (false), (false);
+insert into loj_bool_y1 values (true);
+insert into loj_bool_y2 values (true);
+
+-- Expect 2 rows: the two FALSE rows in loj_bool_x, with NULL from loj_bool_y2.
+-- The plan must NOT contain "Filter: c1" on Seq Scan of loj_bool_x.
+select loj_bool_x.c1, loj_bool_y2.c1 as y2c1
+  from loj_bool_x left join loj_bool_y1 on loj_bool_x.c1
+                  left join loj_bool_y2 on loj_bool_x.c1
+ where loj_bool_y2.c1 is null
+ order by 1, 2;
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';