From 0db256de6f869824bd10750d935d861d4cc85f3c Mon Sep 17 00:00:00 2001
From: Jianghua Yang <yjhjstz@gmail.com>
Date: Thu, 1 Jan 2026 00:36:06 +0800
Subject: [PATCH] ORCA: Validate hash function existence in IsOpHashJoinable

Fix execution-time error when operators have oprcanhash=true but lack actual
hash functions. Previously, ORCA would generate HashAgg plans that failed at
runtime with "could not find hash function for hash operator" errors.

This occurred when operators were marked with the 'hashes' option but only
registered in btree operator families, not hash operator families. The fix
adds validation using get_op_hash_functions() to ensure hash functions exist
before allowing HashAgg plans, moving error detection from execution to
planning time.
---
 src/backend/gpopt/gpdbwrappers.cpp       |  19 ++-
 src/test/regress/expected/gp_hashagg.out | 166 +++++++++++++++++++++++
 src/test/regress/sql/gp_hashagg.sql      | 138 +++++++++++++++++++
 3 files changed, 322 insertions(+), 1 deletion(-)

diff --git a/src/backend/gpopt/gpdbwrappers.cpp b/src/backend/gpopt/gpdbwrappers.cpp
index 4e636a0c653..aca95b2cc0a 100644
--- a/src/backend/gpopt/gpdbwrappers.cpp
+++ b/src/backend/gpopt/gpdbwrappers.cpp
@@ -1613,7 +1613,24 @@ gpdb::IsOpHashJoinable(Oid opno, Oid inputtype)
 	GP_WRAP_START;
 	{
 		/* catalog tables: pg_operator */
-		return op_hashjoinable(opno, inputtype);
+		if (!op_hashjoinable(opno, inputtype))
+			return false;
+
+		/*
+		 * Even if oprcanhash is true, we need to verify that hash functions
+		 * actually exist for this operator. This is because oprcanhash can be
+		 * set to true while the operator is only registered in a btree opfamily
+		 * and not in a hash opfamily, which would cause execution-time errors
+		 * when trying to build hash tables.
+		 *
+		 * See get_op_hash_functions() in lsyscache.c which requires operators
+		 * to be registered in a hash opfamily (amopmethod == HASH_AM_OID).
+		 */
+		RegProcedure hash_proc;
+		if (!get_op_hash_functions(opno, &hash_proc, NULL))
+			return false;
+
+		return true;
 	}
 	GP_WRAP_END;
 	return false;
diff --git a/src/test/regress/expected/gp_hashagg.out b/src/test/regress/expected/gp_hashagg.out
index 210ec705398..513aeb09966 100644
--- a/src/test/regress/expected/gp_hashagg.out
+++ b/src/test/regress/expected/gp_hashagg.out
@@ -274,3 +274,169 @@ EXPLAIN (COSTS OFF, VERBOSE) :qry;
         
 (1 row)
 
+drop schema if exists tinc_base_types cascade;
+NOTICE:  schema "tinc_base_types" does not exist, skipping
+create schema tinc_base_types;
+set search_path=tinc_base_types;
+CREATE FUNCTION int42_in(cstring)
+   RETURNS int42
+   AS 'int4in'
+   LANGUAGE internal IMMUTABLE STRICT;
+NOTICE:  type "int42" is not yet defined
+DETAIL:  Creating a shell type definition.
+CREATE FUNCTION int42_out(int42)
+   RETURNS cstring
+   AS 'int4out'
+   LANGUAGE internal IMMUTABLE STRICT;
+NOTICE:  argument type int42 is only a shell
+CREATE TYPE int42 (
+   internallength = 4,
+   input = int42_in,
+   output = int42_out,
+   alignment = int4,
+   default = 42,
+   passedbyvalue
+);
+CREATE TABLE aTABLE(k int, a int42);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'k' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO aTABLE VALUES(1, '21');
+INSERT INTO aTABLE VALUES(2, '22');
+INSERT INTO aTABLE VALUES(3, '23');
+INSERT INTO aTABLE VALUES(4, '24');
+SELECT * FROM aTABLE;
+ k | a  
+---+----
+ 2 | 22
+ 3 | 23
+ 4 | 24
+ 1 | 21
+(4 rows)
+
+CREATE OR REPLACE FUNCTION my_lt(int42, int42)
+RETURNS boolean AS
+'int4lt'
+language internal;
+CREATE OR REPLACE FUNCTION my_lteq(int42, int42)
+RETURNS boolean AS
+'int4le'
+language internal;
+CREATE OR REPLACE FUNCTION my_gt(int42, int42)
+RETURNS boolean AS
+'int4gt'
+language internal;
+CREATE OR REPLACE FUNCTION my_gteq(int42, int42)
+RETURNS boolean AS
+'int4ge'
+language internal;
+CREATE OR REPLACE FUNCTION my_eq(int42, int42)
+RETURNS boolean AS
+'int4eq'
+language internal;
+CREATE OPERATOR < (
+  leftarg = int42,
+  rightarg = int42,
+  procedure = my_lt,
+  hashes
+);
+CREATE OPERATOR <= (
+  leftarg = int42,
+  rightarg = int42,
+  procedure = my_lteq,
+  hashes
+);
+CREATE OPERATOR = (
+  leftarg = int42,
+  rightarg = int42,
+  procedure = my_eq,
+  hashes
+);
+CREATE OPERATOR > (
+  leftarg = int42,
+  rightarg = int42,
+  procedure = my_gt,
+  hashes
+);
+CREATE OPERATOR >= (
+  leftarg = int42,
+  rightarg = int42,
+  procedure = my_gteq,
+  hashes
+);
+CREATE OR REPLACE FUNCTION my_comp_func(int42, int42)
+RETURNS int AS
+$$
+BEGIN
+IF $1 < $2 THEN
+  return -1;
+ELSIF $1 = $2 THEN
+  return 0;
+ELSE
+  return 1;
+END IF;
+END;
+$$ LANGUAGE PLPGSQL;
+CREATE OPERATOR CLASS my_operator_class_comp_type
+DEFAULT
+FOR TYPE int42 USING btree
+AS
+OPERATOR 1 <,
+OPERATOR 2 <=,
+OPERATOR 3 =,
+OPERATOR 4 >=,
+OPERATOR 5 >,
+FUNCTION 1 my_comp_func(int42, int42);
+CREATE OPERATOR CLASS my_operator_class_comp_type_hash_lt
+DEFAULT
+FOR TYPE int42 USING hash
+AS
+OPERATOR 1 <,
+FUNCTION 1 my_comp_func(int42, int42);
+ERROR:  hash function 1 must have one argument
+CREATE OPERATOR CLASS my_operator_class_comp_type_hash_eq
+FOR TYPE int42 USING hash
+AS
+OPERATOR 1 =,
+FUNCTION 1 my_comp_func(int42, int42);
+ERROR:  hash function 1 must have one argument
+select * from atable group by a,k;
+ k | a  
+---+----
+ 1 | 21
+ 2 | 22
+ 3 | 23
+ 4 | 24
+(4 rows)
+
+-- Before fix: This would fail at runtime with:
+--   ERROR: could not find hash function for hash operator XXXXX (execGrouping.c:118)
+-- After fix: ORCA should detect missing hash function at planning time
+--   and either use GroupAgg or report a clear planning error
+SET optimizer_enable_groupagg=off;
+select * from atable group by a,k;
+ k | a  
+---+----
+ 1 | 21
+ 2 | 22
+ 3 | 23
+ 4 | 24
+(4 rows)
+
+drop schema if exists tinc_base_types cascade;
+NOTICE:  drop cascades to 16 other objects
+DETAIL:  drop cascades to function int42_out(int42)
+drop cascades to type int42
+drop cascades to function int42_in(cstring)
+drop cascades to table atable
+drop cascades to function my_lt(int42,int42)
+drop cascades to function my_lteq(int42,int42)
+drop cascades to function my_gt(int42,int42)
+drop cascades to function my_gteq(int42,int42)
+drop cascades to function my_eq(int42,int42)
+drop cascades to operator <(int42,int42)
+drop cascades to operator <=(int42,int42)
+drop cascades to operator =(int42,int42)
+drop cascades to operator >(int42,int42)
+drop cascades to operator >=(int42,int42)
+drop cascades to function my_comp_func(int42,int42)
+drop cascades to operator family my_operator_class_comp_type for access method btree
diff --git a/src/test/regress/sql/gp_hashagg.sql b/src/test/regress/sql/gp_hashagg.sql
index 36000de8af8..c9b05c1936a 100644
--- a/src/test/regress/sql/gp_hashagg.sql
+++ b/src/test/regress/sql/gp_hashagg.sql
@@ -140,3 +140,141 @@ HAVING max(c) = '31'
 $$ AS qry \gset
 EXPLAIN (COSTS OFF, VERBOSE) :qry;
 :qry;
+
+drop schema if exists tinc_base_types cascade;
+
+create schema tinc_base_types;
+set search_path=tinc_base_types;
+
+
+CREATE FUNCTION int42_in(cstring)
+   RETURNS int42
+   AS 'int4in'
+   LANGUAGE internal IMMUTABLE STRICT;
+CREATE FUNCTION int42_out(int42)
+   RETURNS cstring
+   AS 'int4out'
+   LANGUAGE internal IMMUTABLE STRICT;
+CREATE TYPE int42 (
+   internallength = 4,
+   input = int42_in,
+   output = int42_out,
+   alignment = int4,
+   default = 42,
+   passedbyvalue
+);
+
+CREATE TABLE aTABLE(k int, a int42);
+INSERT INTO aTABLE VALUES(1, '21');
+INSERT INTO aTABLE VALUES(2, '22');
+INSERT INTO aTABLE VALUES(3, '23');
+INSERT INTO aTABLE VALUES(4, '24');
+SELECT * FROM aTABLE;
+
+CREATE OR REPLACE FUNCTION my_lt(int42, int42)
+RETURNS boolean AS
+'int4lt'
+language internal;
+
+CREATE OR REPLACE FUNCTION my_lteq(int42, int42)
+RETURNS boolean AS
+'int4le'
+language internal;
+
+CREATE OR REPLACE FUNCTION my_gt(int42, int42)
+RETURNS boolean AS
+'int4gt'
+language internal;
+
+CREATE OR REPLACE FUNCTION my_gteq(int42, int42)
+RETURNS boolean AS
+'int4ge'
+language internal;
+
+CREATE OR REPLACE FUNCTION my_eq(int42, int42)
+RETURNS boolean AS
+'int4eq'
+language internal;
+
+CREATE OPERATOR < (
+  leftarg = int42,
+  rightarg = int42,
+  procedure = my_lt,
+  hashes
+);
+
+CREATE OPERATOR <= (
+  leftarg = int42,
+  rightarg = int42,
+  procedure = my_lteq,
+  hashes
+);
+
+CREATE OPERATOR = (
+  leftarg = int42,
+  rightarg = int42,
+  procedure = my_eq,
+  hashes
+);
+
+CREATE OPERATOR > (
+  leftarg = int42,
+  rightarg = int42,
+  procedure = my_gt,
+  hashes
+);
+
+CREATE OPERATOR >= (
+  leftarg = int42,
+  rightarg = int42,
+  procedure = my_gteq,
+  hashes
+);
+
+CREATE OR REPLACE FUNCTION my_comp_func(int42, int42)
+RETURNS int AS
+$$
+BEGIN
+IF $1 < $2 THEN
+  return -1;
+ELSIF $1 = $2 THEN
+  return 0;
+ELSE
+  return 1;
+END IF;
+END;
+$$ LANGUAGE PLPGSQL;
+
+CREATE OPERATOR CLASS my_operator_class_comp_type
+DEFAULT
+FOR TYPE int42 USING btree
+AS
+OPERATOR 1 <,
+OPERATOR 2 <=,
+OPERATOR 3 =,
+OPERATOR 4 >=,
+OPERATOR 5 >,
+FUNCTION 1 my_comp_func(int42, int42);
+
+CREATE OPERATOR CLASS my_operator_class_comp_type_hash_lt
+DEFAULT
+FOR TYPE int42 USING hash
+AS
+OPERATOR 1 <,
+FUNCTION 1 my_comp_func(int42, int42);
+
+CREATE OPERATOR CLASS my_operator_class_comp_type_hash_eq
+FOR TYPE int42 USING hash
+AS
+OPERATOR 1 =,
+FUNCTION 1 my_comp_func(int42, int42);
+
+select * from atable group by a,k;
+-- Before fix: This would fail at runtime with:
+--   ERROR: could not find hash function for hash operator XXXXX (execGrouping.c:118)
+-- After fix: ORCA should detect missing hash function at planning time
+--   and either use GroupAgg or report a clear planning error
+SET optimizer_enable_groupagg=off;
+select * from atable group by a,k;
+
+drop schema if exists tinc_base_types cascade;