diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index a254deafd23e004437eb7207ff60aa90f4d04673..36c1599a38621e89ae886eaf78f04a8bad21df85 100755 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -3015,19 +3015,20 @@ SELECT t1.c1, t2.c1 FROM ft1 t1 CROSS JOIN ft2 t2 ORDER BY t1.c1, t2.c1 OFFSET 1 Node 1: EXPLAIN (VERBOSE ON, COSTS OFF) SELECT r1."C 1", r2."C 1" FROM ("S 1"."T 1" r1 INNER JOIN "S 1"."T 1" r2 ON (TRUE)) ORDER BY r1."C 1" ASC NULLS LAST, r2."C 1" ASC NULLS LAST LIMIT 10::bigint OFFSET 100::bigint Limit Output: r1."C 1", r2."C 1" - -> Sort + -> Incremental Sort Output: r1."C 1", r2."C 1" Sort Key: r1."C 1", r2."C 1" + Presorted Key: r1."C 1" -> Nested Loop Output: r1."C 1", r2."C 1" - -> Seq Scan on "S 1"."T 1" r1 - Output: r1."C 1", r1.c2, r1.c3, r1.c4, r1.c5, r1.c6, r1.c7, r1.c8 + -> Index Only Scan using t1_pkey on "S 1"."T 1" r1 + Output: r1."C 1" -> Materialize Output: r2."C 1" -> Seq Scan on "S 1"."T 1" r2 Output: r2."C 1" -(22 rows) +(23 rows) SELECT t1.c1, t2.c1 FROM ft1 t1 CROSS JOIN ft2 t2 ORDER BY t1.c1, t2.c1 OFFSET 100 LIMIT 10; c1 | c1 diff --git a/contrib/postgres_fdw/expected/postgres_fdw_partition.out b/contrib/postgres_fdw/expected/postgres_fdw_partition.out index dc049493e37df33d459c910c10fd791c4b5c0016..ca36743d9ea8c321630332f1372b8c5b51cb9004 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw_partition.out +++ b/contrib/postgres_fdw/expected/postgres_fdw_partition.out @@ -3611,16 +3611,17 @@ SELECT t1.c1, t2.c1 FROM ft1 t1 CROSS JOIN ft2 t2 ORDER BY t1.c1, t2.c1 OFFSET 1 Node 1: EXPLAIN (VERBOSE ON, COSTS OFF) SELECT r1."C 1", r2."C 1" FROM ("S 1"."T 1" r1 INNER JOIN "S 1"."T 1" r2 ON (TRUE)) ORDER BY r1."C 1" ASC NULLS LAST, r2."C 1" ASC NULLS LAST LIMIT 10::bigint OFFSET 100::bigint Limit Output: r1."C 1", r2."C 1" - -> Sort + -> Incremental Sort Output: r1."C 1", r2."C 1" Sort Key: r1."C 1", r2."C 1" + Presorted Key: r1."C 1" -> Nested Loop Output: r1."C 1", r2."C 1" -> Partition Iterator - Output: r1."C 1", r1.c2, r1.c3, r1.c4, r1.c5, r1.c6, r1.c7, r1.c8 + Output: r1."C 1" Iterations: 2 - -> Partitioned Seq Scan on "S 1"."T 1" r1 - Output: r1."C 1", r1.c2, r1.c3, r1.c4, r1.c5, r1.c6, r1.c7, r1.c8 + -> Partitioned Index Only Scan using t1_pkey on "S 1"."T 1" r1 + Output: r1."C 1" Selected Partitions: 1..2 -> Materialize Output: r2."C 1" @@ -3631,7 +3632,7 @@ SELECT t1.c1, t2.c1 FROM ft1 t1 CROSS JOIN ft2 t2 ORDER BY t1.c1, t2.c1 OFFSET 1 Output: r2."C 1" Selected Partitions: 1..2 -(30 rows) +(31 rows) SELECT t1.c1, t2.c1 FROM ft1 t1 CROSS JOIN ft2 t2 ORDER BY t1.c1, t2.c1 OFFSET 100 LIMIT 10; c1 | c1 diff --git a/src/bin/gs_guc/cluster_guc.conf b/src/bin/gs_guc/cluster_guc.conf index 3e4da33f5ba809f7c375ab8110df834ffc400c8a..67005b1cf33b46d2d5aa5af69032953ba4e44e5d 100755 --- a/src/bin/gs_guc/cluster_guc.conf +++ b/src/bin/gs_guc/cluster_guc.conf @@ -256,6 +256,7 @@ enable_seqscan|bool|0,0|NULL|NULL| enable_seqscan_dopcost|bool|0,0|NULL|NULL| enable_show_any_tuples|bool|0,0|NULL|NULL| enable_sort|bool|0,0|NULL|NULL| +enable_incremental_sort|bool|0,0|NULL|NULL| enable_incremental_catchup|bool|0,0|NULL|NULL| wait_dummy_time|int|1,2147483647|NULL|NULL| heap_bulk_read_size|int|0,64|kB|Bulk blocks number for seqscan pre-read.| diff --git a/src/common/backend/nodes/copyfuncs.cpp b/src/common/backend/nodes/copyfuncs.cpp index 121ba73a310693322c6631346fb15b100485de8e..e84aafa272337ff95e9335aeac62455b31e2dfea 100644 --- 
a/src/common/backend/nodes/copyfuncs.cpp +++ b/src/common/backend/nodes/copyfuncs.cpp @@ -1461,6 +1461,32 @@ static Sort* _copySort(const Sort* from) return newnode; } +/* + * _copyIncrementalSort + */ +static IncrementalSort* _copyIncrementalSort(const IncrementalSort* from) +{ + IncrementalSort* newnode = makeNode(IncrementalSort); + + /* + * copy node superclass fields + */ + CopyPlanFields((const Plan*)from, (Plan*)newnode); + + COPY_SCALAR_FIELD(sort.numCols); + if (from->sort.numCols > 0) { + COPY_POINTER_FIELD(sort.sortColIdx, from->sort.numCols * sizeof(AttrNumber)); + COPY_POINTER_FIELD(sort.sortOperators, from->sort.numCols * sizeof(Oid)); + COPY_POINTER_FIELD(sort.collations, from->sort.numCols * sizeof(Oid)); + COPY_POINTER_FIELD(sort.nullsFirst, from->sort.numCols * sizeof(bool)); + } + + CopyMemInfoFields(&from->sort.mem_info, &newnode->sort.mem_info); + COPY_SCALAR_FIELD(nPresortedCols); + + return newnode; +} + /* * CopySortGroupFields * @@ -8237,9 +8263,12 @@ void* copyObject(const void* from) case T_Sort: retval = _copySort((Sort*)from); break; + case T_IncrementalSort: + retval = _copyIncrementalSort((IncrementalSort*)from); + break; case T_SortGroup: retval = _copySortGroup((SortGroup*)from); - break; + break; case T_Group: retval = _copyGroup((Group*)from); break; diff --git a/src/common/backend/nodes/nodes.cpp b/src/common/backend/nodes/nodes.cpp index 74c5dd90a3498fb2c61200d9e0a62428f60eb1a4..9cd4d7bde38fae5b6da1a06073a0a86c619c075e 100755 --- a/src/common/backend/nodes/nodes.cpp +++ b/src/common/backend/nodes/nodes.cpp @@ -75,6 +75,7 @@ static const TagStr g_tagStrArr[] = {{T_Invalid, "Invalid"}, {T_HashJoin, "HashJoin"}, {T_Material, "Material"}, {T_Sort, "Sort"}, + {T_IncrementalSort, "IncrementalSort"}, {T_SortGroup, "SortGroup"}, {T_Group, "Group"}, {T_Agg, "Agg"}, @@ -150,6 +151,7 @@ static const TagStr g_tagStrArr[] = {{T_Invalid, "Invalid"}, {T_HashJoinState, "HashJoinState"}, {T_MaterialState, "MaterialState"}, {T_SortState, "SortState"}, + {T_IncrementalSortState, "IncrementalSortState"}, {T_SortGroupState, "SortGroupState"}, {T_GroupState, "GroupState"}, {T_AggState, "AggState"}, diff --git a/src/common/backend/nodes/outfuncs.cpp b/src/common/backend/nodes/outfuncs.cpp index dd5281378986040a675ae36dfd9b41b65ab4c6e0..59a6f2d25c9be8f7f038021fdfd5698a700be894 100755 --- a/src/common/backend/nodes/outfuncs.cpp +++ b/src/common/backend/nodes/outfuncs.cpp @@ -2098,6 +2098,44 @@ static void _outSort(StringInfo str, Sort* node) out_mem_info(str, &node->mem_info); } +static void _outIncrementalSort(StringInfo str, IncrementalSort* node) +{ + int i; + + WRITE_NODE_TYPE("INCREMENTALSORT"); + + _outPlanInfo(str, (Plan*)node); + + WRITE_INT_FIELD(sort.numCols); + + appendStringInfo(str, " :sortColIdx"); + for (i = 0; i < node->sort.numCols; i++) { + appendStringInfo(str, " %d", node->sort.sortColIdx[i]); + } + + WRITE_GRPOP_FIELD(sort.sortOperators, sort.numCols); + + appendStringInfo(str, " :collations"); + for (i = 0; i < node->sort.numCols; i++) { + appendStringInfo(str, " %u", node->sort.collations[i]); + } + + for (i = 0; i < node->sort.numCols; i++) { + if (node->sort.collations[i] >= FirstBootstrapObjectId && IsStatisfyUpdateCompatibility(node->sort.collations[i])) { + appendStringInfo(str, " :collname "); + _outToken(str, get_collation_name(node->sort.collations[i])); + } + } + + appendStringInfo(str, " :nullsFirst"); + for (i = 0; i < node->sort.numCols; i++) { + appendStringInfo(str, " %s", booltostr(node->sort.nullsFirst[i])); + } + out_mem_info(str, 
&node->sort.mem_info); + + WRITE_INT_FIELD(nPresortedCols); +} + static void _outSortGroup(StringInfo str, SortGroup* node) { int i; @@ -6779,6 +6817,9 @@ static void _outNode(StringInfo str, const void* obj) case T_Sort: _outSort(str, (Sort*)obj); break; + case T_IncrementalSort: + _outIncrementalSort(str, (IncrementalSort*)obj); + break; case T_SortGroup: _outSortGroup(str, (SortGroup*)obj); break; diff --git a/src/common/backend/parser/parse_hint.cpp b/src/common/backend/parser/parse_hint.cpp index efacffd1faa97dded7532fe2ca92317a18420395..0e54f47edeedacdc4029aff27a0b567c5d0b872d 100755 --- a/src/common/backend/parser/parse_hint.cpp +++ b/src/common/backend/parser/parse_hint.cpp @@ -3840,6 +3840,7 @@ const char* G_SET_HINT_WHITE_LIST[] = { (char*)"enable_functional_dependency", (char*)"enable_hashagg", (char*)"enable_hashjoin", + (char*)"enable_incremental_sort", (char*)"enable_index_nestloop", (char*)"enable_indexonlyscan", (char*)"enable_indexscan", diff --git a/src/common/backend/utils/misc/guc/guc_sql.cpp b/src/common/backend/utils/misc/guc/guc_sql.cpp index c638ac4a44b0af48f5f818ac655de74cb3b4478c..c594867115617c3424248e02a1841b8d5a349d47 100755 --- a/src/common/backend/utils/misc/guc/guc_sql.cpp +++ b/src/common/backend/utils/misc/guc/guc_sql.cpp @@ -789,6 +789,17 @@ static void InitSqlConfigureNamesBool() NULL, NULL, NULL}, + {{"enable_incremental_sort", + PGC_USERSET, + NODE_ALL, + QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of incremental sort steps."), + NULL}, + &u_sess->attr.attr_sql.enable_incremental_sort, + true, + NULL, + NULL, + NULL}, {{"enable_compress_spill", PGC_USERSET, diff --git a/src/common/backend/utils/mmgr/mcxt.cpp b/src/common/backend/utils/mmgr/mcxt.cpp index 9de135b1200bfbae9522de7601dcbdffdd31f15a..528423ebaefa8231a44a28cf2aef8d75435cfbb6 100644 --- a/src/common/backend/utils/mmgr/mcxt.cpp +++ b/src/common/backend/utils/mmgr/mcxt.cpp @@ -85,6 +85,7 @@ static McxtAllocationMethods StdMcxtAllocMtd = { static McxtOperationMethods StdMcxtOpMtd = { std_MemoryContextReset, + std_MemoryContextResetOnly, std_MemoryContextDelete, std_MemoryContextDeleteChildren, std_MemoryContextDestroyAtThreadExit, @@ -271,6 +272,37 @@ void std_MemoryContextReset(MemoryContext context) } } +/* + * std_MemoryContextResetOnly + * Release all space allocated within a context + * but don't delete the contexts themselves. + * + * The type-specific reset routine handles the context itself, but we + * have to do the recursion for the children. 
+ */ +void std_MemoryContextResetOnly(MemoryContext context) +{ + AssertArg(MemoryContextIsValid(context)); + +#ifdef MEMORY_CONTEXT_CHECKING + PreventActionOnSealedContext(context); +#endif + + /* Do not delete children */ + +#ifdef MEMORY_CONTEXT_CHECKING + /* Memory Context Checking */ + MemoryContextCheck(context, context->session_id > 0); +#endif + + /* Nothing to do if no pallocs since startup or last reset */ + if (!context->isReset) { + RemoveMemoryContextInfo(context); + (*context->methods->reset)(context); + context->isReset = true; + } +} + /* * MemoryContextResetChildren * Release all space allocated within a context's descendants, diff --git a/src/common/backend/utils/mmgr/opt_mcxt.cpp b/src/common/backend/utils/mmgr/opt_mcxt.cpp index 3fe3b547b22e20bac672dda0459435e86f3807c2..5b697cf75dc500f0d930600ab66ae027e078d6db 100644 --- a/src/common/backend/utils/mmgr/opt_mcxt.cpp +++ b/src/common/backend/utils/mmgr/opt_mcxt.cpp @@ -49,6 +49,7 @@ static McxtAllocationMethods OptMcxtAllocMtd = { static McxtOperationMethods OptMcxtOpMtd = { opt_MemoryContextReset, + opt_MemoryContextResetOnly, opt_MemoryContextDelete, opt_MemoryContextDeleteChildren, opt_MemoryContextDestroyAtThreadExit, @@ -405,6 +406,19 @@ void opt_MemoryContextReset(MemoryContext context) } } +void opt_MemoryContextResetOnly(MemoryContext context) +{ + AssertArg(IsOptAllocSetContext(context)); + + /* do not reset children */ + + /* Nothing to do if no pallocs since startup or last reset */ + if (!context->isReset) { + (*context->methods->reset)(context); + context->isReset = true; + } +} + void opt_MemoryContextDestroyAtThreadExit(MemoryContext context) { /* Delete all its decendents */ diff --git a/src/common/backend/utils/sort/batchsort.cpp b/src/common/backend/utils/sort/batchsort.cpp index 15d258174b152da44f0f82303665f477392ead7b..c24bd96d3a59fd777630731e5a9625889af85a67 100644 --- a/src/common/backend/utils/sort/batchsort.cpp +++ b/src/common/backend/utils/sort/batchsort.cpp @@ -679,31 +679,32 @@ void batchsort_restorepos(Batchsortstate* state) * printable summary information about how the sort was performed. * spaceUsed is measured in kilobytes. 
*/ -void batchsort_get_stats(Batchsortstate* state, int* sortMethodId, int* spaceTypeId, long* spaceUsed) +void batchsort_get_stats(Batchsortstate *state, TuplesortInstrumentation *stats) { if (state->m_tapeset) { - *spaceTypeId = SORT_IN_DISK; - *spaceUsed = LogicalTapeSetBlocks(state->m_tapeset) * (BLCKSZ / 1024); + stats->spaceType = SORT_SPACE_TYPE_DISK; + stats->spaceUsed = LogicalTapeSetBlocks(state->m_tapeset) * (BLCKSZ / 1024); } else { - *spaceTypeId = SORT_IN_MEMORY; - *spaceUsed = (state->m_allowedMem - state->m_availMem + 1023) / 1024; + stats->spaceType = SORT_SPACE_TYPE_MEMORY; + stats->spaceUsed = (state->m_allowedMem - state->m_availMem + 1023) / 1024; } switch (state->m_status) { case BS_SORTEDINMEM: - if (state->m_boundUsed) - *sortMethodId = (int)HEAPSORT; - else - *sortMethodId = (int)QUICKSORT; + if (state->m_boundUsed) { + stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT; + } else { + stats->sortMethod = SORT_TYPE_QUICKSORT; + } break; case BS_SORTEDONTAPE: - *sortMethodId = (int)EXTERNALSORT; + stats->sortMethod = SORT_TYPE_EXTERNAL_SORT; break; case BS_FINALMERGE: - *sortMethodId = (int)EXTERNALMERGE; + stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE; break; default: - *sortMethodId = (int)STILLINPROGRESS; + stats->sortMethod = SORT_TYPE_NONE; break; } } diff --git a/src/common/backend/utils/sort/tuplesort.cpp b/src/common/backend/utils/sort/tuplesort.cpp index ca3898c0584f78f23fd0efd78587292b406df55d..dbd358597aef213ed01ce0d79a56284aaaccbb3c 100644 --- a/src/common/backend/utils/sort/tuplesort.cpp +++ b/src/common/backend/utils/sort/tuplesort.cpp @@ -160,6 +160,8 @@ bool u_sess->attr.attr_sql.optimize_bounded_sort = true; #define MINIMAL_SLOTS_PER_TAPE 16 #define MINIMAL_MERGE_SORT_MEMORY 16384 // 16MB +#define INITIAL_MEMTUPSIZE 1024 + /* * The objects we actually sort are SortTuple structs. 
These contain * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple), @@ -248,6 +250,14 @@ struct Tuplesortstate { int64 allowedMem; /* total memory allowed, in bytes */ int maxTapes; /* number of tapes (Knuth's T) */ int tapeRange; /* maxTapes-1 (Knuth's P) */ + int64 maxSpace; /* maximum amount of space occupied among sort + * of groups, either in-memory or on-disk */ + bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk + * space, false when it's value for in-memory + * space */ + TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */ + MemoryContext maincontext; /* memory context for tuple sort metadata that + * persists across multiple batches */ MemoryContext sortcontext; /* memory context holding all sort data */ MemoryContext tuplecontext; /* memory context holding tuple data */ LogicalTapeSet* tapeset; /* logtape.c object for tapes in a temp file */ @@ -677,6 +687,7 @@ static bool AutoSpreadMem(Tuplesortstate* state, double* growRatio) } while (0) static Tuplesortstate* tuplesort_begin_common(int64 workMem, bool randomAccess, SortCoordinate coordinate = NULL); +static void tuplesort_begin_batch(Tuplesortstate *state); static void puttuple_common(Tuplesortstate* state, SortTuple* tuple); static bool consider_abort_common(Tuplesortstate* state); static void inittapes(Tuplesortstate* state, bool mergeruns); @@ -725,6 +736,8 @@ static void reversedirection_datum(Tuplesortstate* state); static void free_sort_tuple(Tuplesortstate* state, SortTuple* stup); static void dumpbatch(Tuplesortstate *state, bool alltuples); static void tuplesort_sort_memtuples(Tuplesortstate *state); +static void tuplesort_free(Tuplesortstate *state); +static void tuplesort_updatemax(Tuplesortstate *state); /* * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts @@ -797,34 +810,43 @@ void sort_count(Tuplesortstate* state) static Tuplesortstate* tuplesort_begin_common(int64 workMem, bool randomAccess, SortCoordinate coordinate) { Tuplesortstate* state = NULL; + MemoryContext maincontext; MemoryContext sortcontext; - MemoryContext tuplecontext; MemoryContext oldcontext; /* See leader_takeover_tapes() remarks on randomAccess support */ if (coordinate && randomAccess) elog(ERROR, "random access disallowed under parallel sort"); + /* + * Memory context surviving tuplesort_reset. This memory context holds + * data which is useful to keep while sorting multiple similar batches. + */ + maincontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleSort main", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE, + STANDARD_CONTEXT, + workMem * 1024L); + /* * Create a working memory context for this sort operation. All data * needed by the sort will live inside this context. */ - sortcontext = AllocSetContextCreate(CurrentMemoryContext, "TupleSort main", ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE, STANDARD_CONTEXT, workMem * 1024L); - - tuplecontext = AllocSetContextCreate(sortcontext, - "Caller tuples", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE, - STANDARD_CONTEXT, - workMem * 1024L); + sortcontext = AllocSetContextCreate(maincontext, + "TupleSort Sort", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE, + STANDARD_CONTEXT, + workMem * 1024L); /* - * Make the Tuplesortstate within the per-sort context. This way, we + * Make the Tuplesortstate within the per-sortstate context. 
This way, we * don't need a separate pfree_ext() operation for it at shutdown. */ - oldcontext = MemoryContextSwitchTo(sortcontext); + oldcontext = MemoryContextSwitchTo(maincontext); state = (Tuplesortstate*)palloc0(sizeof(Tuplesortstate)); @@ -833,38 +855,29 @@ static Tuplesortstate* tuplesort_begin_common(int64 workMem, bool randomAccess, pg_rusage_init(&state->ru_start); #endif - state->status = TSS_INITIAL; state->randomAccess = randomAccess; - state->bounded = false; + state->tuples = true; - state->boundUsed = false; + + /* + * workMem is forced to be at least 64KB, the current minimum valid value + * for the work_mem GUC. This is a defense against parallel sort callers + * that divide out memory among many workers in a way that leaves each + * with very little memory. + */ state->allowedMem = Max(workMem, 64) * (int64) 1024; - state->availMem = state->allowedMem; state->sortcontext = sortcontext; - state->tuplecontext = tuplecontext; - state->tapeset = NULL; - - state->memtupcount = 0; - state->memtupsize = 1024; /* initial guess */ - state->growmemtuples = true; - state->slabAllocatorUsed = false; - state->memtuples = (SortTuple*)palloc(state->memtupsize * sizeof(SortTuple)); - - USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + state->maincontext = maincontext; - /* workMem must be large enough for the minimal memtuples array */ - if (LACKMEM(state)) - ereport(ERROR, (errmodule(MOD_EXECUTOR), - (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("insufficient memory allowed for sort")))); - - state->currentRun = 0; + state->memtupsize = INITIAL_MEMTUPSIZE; + state->memtuples = NULL; /* - * maxTapes, tapeRange, and Algorithm D variables will be initialized by - * inittapes(), if needed + * After all of the other non-parallel-related state, we setup all of the + * state needed for each batch. */ + tuplesort_begin_batch(state); - state->result_tape = -1; /* flag that result tape has not been formed */ /* * Initialize parallel-related state based on coordination information * from caller @@ -893,6 +906,80 @@ static Tuplesortstate* tuplesort_begin_common(int64 workMem, bool randomAccess, return state; } +/* + * tuplesort_begin_batch + * + * Setup, or reset, all state need for processing a new set of tuples with this + * sort state. Called both from tuplesort_begin_common (the first time sorting + * with this sort state) and tuplesort_reset (for subsequent usages). + */ +static void tuplesort_begin_batch(Tuplesortstate *state) +{ + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + + /* + * Caller tuple (e.g. IndexTuple) memory context. + * + * A dedicated child context used exclusively for caller passed tuples + * eases memory management. Resetting at key points reduces + * fragmentation. Note that the memtuples array of SortTuples is allocated + * in the parent context, not this context, because there is no need to + * free memtuples early. + */ + state->tuplecontext = AllocSetContextCreate(state->sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE, + STANDARD_CONTEXT, + state->allowedMem); + + state->status = TSS_INITIAL; + state->bounded = false; + state->boundUsed = false; + + state->availMem = state->allowedMem; + + state->tapeset = NULL; + + state->memtupcount = 0; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). 
+ */ + state->growmemtuples = true; + state->slabAllocatorUsed = false; + if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE) { + pfree_ext(state->memtuples); + state->memtuples = NULL; + state->memtupsize = INITIAL_MEMTUPSIZE; + } + + if (state->memtuples == NULL) { + state->memtuples = (SortTuple *)palloc(state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + } + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) { + ereport(ERROR, (errmodule(MOD_EXECUTOR), + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("insufficient memory allowed for sort")))); + } + + state->currentRun = 0; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + state->result_tape = -1; /* flag that result tape has not been formed */ + + MemoryContextSwitchTo(oldcontext); +} + Tuplesortstate* tuplesort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber* attNums, Oid* sortOperators, Oid* sortCollations, const bool* nullsFirstFlags, int64 workMem, bool randomAccess, int64 maxMem, int planId, int dop, SortCoordinate coordinate) @@ -901,7 +988,7 @@ Tuplesortstate* tuplesort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber* a MemoryContext oldcontext; int i; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); AssertArg(nkeys > 0); @@ -989,7 +1076,7 @@ Tuplesortstate* tuplesort_begin_cluster( MemoryContext oldcontext; Assert(OID_IS_BTREE(indexRel->rd_rel->relam)); - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (u_sess->attr.attr_common.trace_sort) { @@ -1051,7 +1138,7 @@ Tuplesortstate* tuplesort_begin_index_btree( Tuplesortstate* state = tuplesort_begin_common(workMem, randomAccess, coordinate); MemoryContext oldcontext; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (u_sess->attr.attr_common.trace_sort) { @@ -1094,7 +1181,7 @@ Tuplesortstate* tuplesort_begin_index_hash( Tuplesortstate* state = tuplesort_begin_common(workMem, randomAccess); MemoryContext oldcontext; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (u_sess->attr.attr_common.trace_sort) { @@ -1140,7 +1227,7 @@ Tuplesortstate* tuplesort_begin_datum( int16 typlen; bool typbyval = false; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (u_sess->attr.attr_common.trace_sort) { @@ -1262,15 +1349,21 @@ void tuplesort_set_bound(Tuplesortstate* state, int64 bound) } /* - * tuplesort_end + * tuplesort_used_bound * - * Release resources and clean up. + * Allow callers to find out if the sort state was able to use a bound. + */ +bool tuplesort_used_bound(Tuplesortstate *state) +{ + return state->boundUsed; +} + +/* + * tuplesort_free * - * NOTE: after calling this, any pointers returned by tuplesort_getXXX are - * pointing to garbage. Be careful not to attempt to use or free such - * pointers afterwards! + * Internal routine for freeing resources of tuplesort. 
*/ -void tuplesort_end(Tuplesortstate* state) +static void tuplesort_free(Tuplesortstate *state) { /* context swap probably not needed, but let's be safe */ MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); @@ -1327,10 +1420,98 @@ void tuplesort_end(Tuplesortstate* state) (void)MemoryContextSwitchTo(oldcontext); /* - * Free the per-sort memory context, thereby releasing all working memory, - * including the Tuplesortstate struct itself. + * Free the per-sort memory context, thereby releasing all working memory. */ - MemoryContextDelete(state->sortcontext); + MemoryContextResetAndDeleteChildren(state->sortcontext); +} + +/* + * tuplesort_end + * + * Release resources and clean up. + * + * NOTE: after calling this, any pointers returned by tuplesort_getXXX are + * pointing to garbage. Be careful not to attempt to use or free such + * pointers afterwards! + */ +void tuplesort_end(Tuplesortstate* state) +{ + tuplesort_free(state); + + /* + * Free the main memory context, including the Tuplesortstate struct + * itself. + */ + MemoryContextDelete(state->maincontext); +} + +/* + * tuplesort_updatemax + * + * Update maximum resource usage statistics. + */ +static void tuplesort_updatemax(Tuplesortstate *state) +{ + int64 spaceUsed; + bool isSpaceDisk; + + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? + */ + if (state->tapeset) { + isSpaceDisk = true; + spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ; + } else { + isSpaceDisk = false; + spaceUsed = state->allowedMem - state->availMem; + } + + /* + * Sort evicts data to the disk when it wasn't able to fit that data into + * main memory. This is why we assume space used on the disk to be more + * important for tracking resource usage than space used in memory. Note + * that the amount of space occupied by some tupleset on the disk might be + * less than amount of space occupied by the same tupleset in memory due + * to more compact representation. + */ + if ((isSpaceDisk && !state->isMaxSpaceDisk) || + (isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace)) + { + state->maxSpace = spaceUsed; + state->isMaxSpaceDisk = isSpaceDisk; + state->maxSpaceStatus = state->status; + } +} + +/* + * tuplesort_reset + * + * Reset the tuplesort. Reset all the data in the tuplesort, but leave the + * meta-information in. After tuplesort_reset, tuplesort is ready to start + * a new sort. This allows avoiding recreation of tuple sort states (and + * save resources) when sorting multiple small batches. + */ +void tuplesort_reset(Tuplesortstate *state) +{ + tuplesort_updatemax(state); + tuplesort_free(state); + + /* + * After we've freed up per-batch memory, re-setup all of the state common + * to both the first batch and any subsequent batch. + */ + tuplesort_begin_batch(state); + + state->lastReturnedTuple = NULL; + state->slabMemoryBegin = NULL; + state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; } /* @@ -2608,8 +2789,7 @@ static void mergeruns(Tuplesortstate* state) * Reset tuple memory. We've freed all the tuples that we previously * allocated. 
We will use the slab allocator from now on. */ - MemoryContextDelete(state->tuplecontext); - state->tuplecontext = NULL; + MemoryContextResetOnly(state->tuplecontext); /* * We no longer need a large memtuples array. (We will allocate a smaller @@ -2678,7 +2858,8 @@ static void mergeruns(Tuplesortstate* state) * from each input tape. */ state->memtupsize = numInputTapes; - state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple)); + state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext, + numInputTapes * sizeof(SortTuple)); USEMEM(state, GetMemoryChunkSpace(state->memtuples)); /* End of step D2: rewind all output tapes to prepare for merging */ @@ -3118,7 +3299,7 @@ void tuplesort_restorepos(Tuplesortstate* state) * printable summary information about how the sort was performed. * spaceUsed is measured in kilobytes. */ -void tuplesort_get_stats(Tuplesortstate* state, int* sortMethodId, int* spaceTypeId, long* spaceUsed) +void tuplesort_get_stats(Tuplesortstate *state, TuplesortInstrumentation *stats) { /* * Note: it might seem we should provide both memory and disk usage for a @@ -3129,35 +3310,66 @@ void tuplesort_get_stats(Tuplesortstate* state, int* sortMethodId, int* spaceTyp * to fix. Is it worth creating an API for the memory context code to * tell us how much is actually used in sortcontext? */ + tuplesort_updatemax(state); - if (state->tapeset != NULL) { - *spaceTypeId = SORT_IN_DISK; - *spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); + if (state->isMaxSpaceDisk) { + stats->spaceType = SORT_SPACE_TYPE_DISK; } else { - *spaceTypeId = SORT_IN_MEMORY; - *spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; + stats->spaceType = SORT_SPACE_TYPE_MEMORY; } - switch (state->status) { + stats->spaceUsed = (state->maxSpace + 1023) / 1024; + + switch (state->maxSpaceStatus) { case TSS_SORTEDINMEM: if (state->boundUsed) { - *sortMethodId = (int)HEAPSORT; + stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT; } else { - *sortMethodId = (int)QUICKSORT; + stats->sortMethod = SORT_TYPE_QUICKSORT; } break; case TSS_SORTEDONTAPE: - *sortMethodId = (int)EXTERNALSORT; + stats->sortMethod = SORT_TYPE_EXTERNAL_SORT; break; case TSS_FINALMERGE: - *sortMethodId = (int)EXTERNALMERGE; + stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE; break; default: - *sortMethodId = (int)STILLINPROGRESS; + stats->sortMethod = SORT_TYPE_NONE; break; } } +/* + * Convert TuplesortMethod to a string. + */ +const char *tuplesort_method_name(TuplesortMethod m) +{ + switch (m) { + case SORT_TYPE_NONE: + return "none"; + case SORT_TYPE_TOP_N_HEAPSORT: + return "top-N heapsort"; + case SORT_TYPE_QUICKSORT: + return "quicksort"; + case SORT_TYPE_EXTERNAL_SORT: + return "external sort"; + case SORT_TYPE_EXTERNAL_MERGE: + return "external merge"; + } + + return "unknown"; +} + +/* + * Convert TuplesortSpaceType to a string. + */ +const char *tuplesort_space_type_name(TuplesortSpaceType t) +{ + Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY); + return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory"; +} + /* * Convert the existing unordered array of SortTuples to a bounded heap, * discarding all but the smallest "state->bound" tuples. 
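The tuplesort changes above split one-time setup (the new maincontext and tuplesort_begin_batch()) from per-batch state so that a single Tuplesortstate can be reused for many small sorted groups via tuplesort_reset(), and they replace the old sortMethodId/spaceTypeId outputs with a TuplesortInstrumentation struct. The sketch below is illustrative only and is not part of the patch: it shows the intended batch-reuse and stats-retrieval pattern, taking tuplesort_begin_heap(), tuplesort_reset(), tuplesort_get_stats(), tuplesort_method_name() and tuplesort_space_type_name() as they appear in this diff, while load_next_group()/drain_sorted_group() are hypothetical stand-ins for the caller's tuple flow.

/*
 * Illustrative sketch (not part of the patch): reuse one Tuplesortstate for
 * many small groups, resetting between them instead of rebuilding the state.
 */

/* Hypothetical helpers standing in for the caller's tuple flow. */
static bool load_next_group(Tuplesortstate *state);    /* feeds one group, e.g. via tuplesort_puttupleslot() */
static void drain_sorted_group(Tuplesortstate *state); /* reads the sorted group back out */

static void sort_groups_with_one_state(TupleDesc desc, int nkeys, AttrNumber *cols,
                                       Oid *ops, Oid *colls, const bool *nullsFirst)
{
    /* Create the sort state once; maincontext survives across batches. */
    Tuplesortstate *state = tuplesort_begin_heap(desc, nkeys, cols, ops, colls, nullsFirst,
                                                 u_sess->attr.attr_memory.work_mem,
                                                 false, /* no random access */
                                                 0, 0, 1, NULL);

    for (;;) {
        bool more = load_next_group(state);

        tuplesort_performsort(state);
        drain_sorted_group(state);

        /* Per-group instrumentation via the new TuplesortInstrumentation API. */
        TuplesortInstrumentation stats;
        tuplesort_get_stats(state, &stats);
        elog(DEBUG1, "group sorted: %s, %s: %ldkB",
             tuplesort_method_name(stats.sortMethod),
             tuplesort_space_type_name(stats.spaceType),
             (long)stats.spaceUsed);

        if (!more) {
            break;
        }

        /* Keep the sort metadata, drop per-batch memory, start the next group. */
        tuplesort_reset(state);
    }

    tuplesort_end(state);
}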
diff --git a/src/gausskernel/optimizer/commands/explain.cpp b/src/gausskernel/optimizer/commands/explain.cpp index ee73a9c7e8a36185f58f18590cb82a5950945536..d589ac133c7c881c361c11737c80fc3e50458a62 100755 --- a/src/gausskernel/optimizer/commands/explain.cpp +++ b/src/gausskernel/optimizer/commands/explain.cpp @@ -91,7 +91,6 @@ THR_LOCAL bool IsExplainPlanSelectForUpdateStmt = false; THR_LOCAL explain_get_index_name_hook_type explain_get_index_name_hook = NULL; extern TrackDesc trackdesc[]; -extern sortMessage sortmessage[]; extern DestReceiver* CreateDestReceiver(CommandDest dest); /* Array to record plan table column names, type, etc */ @@ -162,10 +161,12 @@ static void show_pushdown_qual(PlanState* planstate, List* ancestors, ExplainSta static void show_upper_qual(List* qual, const char* qlabel, PlanState* planstate, List* ancestors, ExplainState* es); static void show_groupby_keys(AggState* aggstate, List* ancestors, ExplainState* es); static void show_sort_keys(SortState* sortstate, List* ancestors, ExplainState* es); +static void show_incremental_sort_keys(IncrementalSortState *incrsortstate, List *ancestors, ExplainState *es); static void show_merge_append_keys(MergeAppendState* mstate, List* ancestors, ExplainState* es); static void show_merge_sort_keys(PlanState* state, List* ancestors, ExplainState* es); static void show_startwith_pseudo_entries(PlanState* state, List* ancestors, ExplainState* es); static void show_sort_info(SortState* sortstate, ExplainState* es); +static void show_incremental_sort_info(IncrementalSortState *incrsortstate, ExplainState *es); static void show_sort_group_info(SortGroupState *state, ExplainState *es); static void show_hash_info(HashState* hashstate, ExplainState* es); static void show_vechash_info(VecHashJoinState* hashstate, ExplainState* es); @@ -244,8 +245,9 @@ static bool get_execute_mode(const ExplainState* es, int idx); static void show_setop_info(SetOpState* setopstate, ExplainState* es); static void show_grouping_sets(PlanState* planstate, Agg* agg, List* ancestors, ExplainState* es); static void show_group_keys(GroupState* gstate, List* ancestors, ExplainState* es); -static void show_sort_group_keys(PlanState* planstate, const char* qlabel, int nkeys, const AttrNumber* keycols, - const Oid* sortOperators, const Oid* collations, const bool* nullsFirst, List* ancestors, ExplainState* es); +static void show_sort_group_keys(PlanState* planstate, const char* qlabel, int nkeys, int nPresortedKeys, + const AttrNumber* keycols, const Oid* sortOperators, const Oid* collations, const bool* nullsFirst, + List* ancestors, ExplainState* es); static void show_sortorder_options(StringInfo buf, const Node* sortexpr, Oid sortOperator, Oid collation, bool nullsFirst); static void show_grouping_set_keys(PlanState* planstate, Agg* aggnode, Sort* sortnode, List* context, bool useprefix, List* ancestors, ExplainState* es); @@ -3019,9 +3021,13 @@ static void ExplainNode( show_sort_info((SortState*)planstate, es); show_llvm_info(planstate, es); break; + case T_IncrementalSort: + show_incremental_sort_keys((IncrementalSortState*)planstate, ancestors, es); + show_incremental_sort_info((IncrementalSortState*)planstate, es); + break; case T_SortGroup: { SortGroup *plan = (SortGroup *)planstate->plan; - show_sort_group_keys(planstate, "Sorted Group Key", plan->numCols, plan->sortColIdx, plan->sortOperators, + show_sort_group_keys(planstate, "Sorted Group Key", plan->numCols, 0, plan->sortColIdx, plan->sortOperators, plan->collations, plan->nullsFirst, ancestors, es); 
show_sort_group_info(castNode(SortGroupState, planstate), es); break; @@ -3479,6 +3485,7 @@ static void CalculateProcessedRows( break; case T_Agg: case T_Sort: + case T_IncrementalSort: case T_SetOp: case T_VecSetOp: case T_VecAgg: @@ -3957,18 +3964,20 @@ static void show_groupby_keys(AggState* aggstate, List* ancestors, ExplainState* /* The key columns refer to the tlist of the child plan */ ancestors = lcons(aggstate, ancestors); - if (plan->groupingSets) + if (plan->groupingSets) { show_grouping_sets(outerPlanState(aggstate), plan, ancestors, es); - else + } else { show_sort_group_keys(outerPlanState(aggstate), - "Group By Key", - plan->numCols, - plan->grpColIdx, - NULL, - NULL, - NULL, - ancestors, - es); + "Group By Key", + plan->numCols, + 0, + plan->grpColIdx, + NULL, + NULL, + NULL, + ancestors, + es); + } ancestors = list_delete_first(ancestors); } @@ -4012,14 +4021,15 @@ static void show_grouping_set_keys(PlanState* planstate, Agg* aggnode, Sort* sor if (sortnode != NULL) { show_sort_group_keys(planstate, - "Sort Key", - sortnode->numCols, - sortnode->sortColIdx, - sortnode->sortOperators, - sortnode->collations, - sortnode->nullsFirst, - ancestors, - es); + "Sort Key", + sortnode->numCols, + 0, + sortnode->sortColIdx, + sortnode->sortOperators, + sortnode->collations, + sortnode->nullsFirst, + ancestors, + es); if (es->format == EXPLAIN_FORMAT_TEXT) es->indent++; } @@ -4071,8 +4081,16 @@ static void show_group_keys(GroupState* gstate, List* ancestors, ExplainState* e /* The key columns refer to the tlist of the child plan */ ancestors = lcons(gstate, ancestors); - show_sort_group_keys( - outerPlanState(gstate), "Group By Key", plan->numCols, plan->grpColIdx, NULL, NULL, NULL, ancestors, es); + show_sort_group_keys(outerPlanState(gstate), + "Group By Key", + plan->numCols, + 0, + plan->grpColIdx, + NULL, + NULL, + NULL, + ancestors, + es); ancestors = list_delete_first(ancestors); } @@ -4084,14 +4102,34 @@ static void show_sort_keys(SortState* sortstate, List* ancestors, ExplainState* Sort* plan = (Sort*)sortstate->ss.ps.plan; show_sort_group_keys((PlanState*)sortstate, - "Sort Key", - plan->numCols, - plan->sortColIdx, - plan->sortOperators, - plan->collations, - plan->nullsFirst, - ancestors, - es); + "Sort Key", + plan->numCols, + 0, + plan->sortColIdx, + plan->sortOperators, + plan->collations, + plan->nullsFirst, + ancestors, + es); +} + +/* + * Show the sort keys for a IncrementalSort node. 
+ */ +static void show_incremental_sort_keys(IncrementalSortState *incrsortstate, + List *ancestors, ExplainState *es) +{ + IncrementalSort *plan = (IncrementalSort*)incrsortstate->ss.ps.plan; + + show_sort_group_keys((PlanState*)incrsortstate, + "Sort Key", + plan->sort.numCols, + plan->nPresortedCols, + plan->sort.sortColIdx, + plan->sort.sortOperators, + plan->sort.collations, + plan->sort.nullsFirst, + ancestors, es); } /* @@ -4102,14 +4140,15 @@ static void show_merge_append_keys(MergeAppendState* mstate, List* ancestors, Ex MergeAppend* plan = (MergeAppend*)mstate->ps.plan; show_sort_group_keys((PlanState*)mstate, - "Sort Key", - plan->numCols, - plan->sortColIdx, - plan->sortOperators, - plan->collations, - plan->nullsFirst, - ancestors, - es); + "Sort Key", + plan->numCols, + 0, + plan->sortColIdx, + plan->sortOperators, + plan->collations, + plan->nullsFirst, + ancestors, + es); } /* @@ -4130,14 +4169,15 @@ static void show_merge_sort_keys(PlanState* state, List* ancestors, ExplainState return; show_sort_group_keys((PlanState*)state, - "Merge Sort Key", - sort->numCols, - sort->sortColIdx, - sort->sortOperators, - sort->sortCollations, - sort->nullsFirst, - ancestors, - es); + "Merge Sort Key", + sort->numCols, + 0, + sort->sortColIdx, + sort->sortOperators, + sort->sortCollations, + sort->nullsFirst, + ancestors, + es); } static void show_startwith_pseudo_entries(PlanState* state, List* ancestors, ExplainState* es) @@ -4370,8 +4410,6 @@ static void show_sort_info(SortState* sortstate, ExplainState* es) long min_diskUsed = MIN_DISK_USED; long max_memoryUsed = 0; long min_memoryUsed = u_sess->attr.attr_memory.work_mem; - int sortMethodId = 0; - int spaceTypeId = 0; if (es->detail) { for (int i = 0; i < u_sess->instr_cxt.global_instr->getInstruNodeNum(); i++) { @@ -4379,9 +4417,9 @@ static void show_sort_info(SortState* sortstate, ExplainState* es) /* * when this operator does not actual executed, we will not show sort info. 
*/ - if (instr == NULL || instr->sorthashinfo.sortMethodId < (int)HEAPSORT || - instr->sorthashinfo.sortMethodId > (int)STILLINPROGRESS) + if (instr == NULL || instr->sorthashinfo.sinstrument.sortMethod == SORT_TYPE_NONE) { continue; + } #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else @@ -4393,15 +4431,9 @@ static void show_sort_info(SortState* sortstate, ExplainState* es) sprintf(node_name, "%d", i); } #endif - - sortMethodId = instr->sorthashinfo.sortMethodId; - spaceTypeId = instr->sorthashinfo.spaceTypeId; - sortMethod = sortmessage[sortMethodId].sortName; - spaceUsed = instr->sorthashinfo.spaceUsed; - if (spaceTypeId == SORT_IN_DISK) - spaceType = "Disk"; - else - spaceType = "Memory"; + sortMethod = tuplesort_method_name(instr->sorthashinfo.sinstrument.sortMethod); + spaceType = tuplesort_space_type_name(instr->sorthashinfo.sinstrument.spaceType); + spaceUsed = instr->sorthashinfo.sinstrument.spaceUsed; if (es->planinfo != NULL && es->planinfo->m_runtimeinfo != NULL) { es->planinfo->m_runtimeinfo->put(i, 0, SORT_METHOD, PointerGetDatum(cstring_to_text(sortMethod))); @@ -4435,17 +4467,13 @@ static void show_sort_info(SortState* sortstate, ExplainState* es) } else { for (int i = 0; i < u_sess->instr_cxt.global_instr->getInstruNodeNum(); i++) { instr = u_sess->instr_cxt.global_instr->getInstrSlot(i, planstate->plan->plan_node_id); - if (instr == NULL || instr->sorthashinfo.sortMethodId < (int)HEAPSORT || - instr->sorthashinfo.sortMethodId > (int)STILLINPROGRESS) + if (instr == NULL || instr->sorthashinfo.sinstrument.sortMethod == SORT_TYPE_NONE) { continue; - - sortMethodId = instr->sorthashinfo.sortMethodId; - spaceTypeId = instr->sorthashinfo.spaceTypeId; - sortMethod = sortmessage[sortMethodId].sortName; - spaceUsed = instr->sorthashinfo.spaceUsed; - if (spaceTypeId == SORT_IN_DISK) { - spaceType = "Disk"; - + } + sortMethod = tuplesort_method_name(instr->sorthashinfo.sinstrument.sortMethod); + spaceType = tuplesort_space_type_name(instr->sorthashinfo.sinstrument.spaceType); + spaceUsed = instr->sorthashinfo.sinstrument.spaceUsed; + if (instr->sorthashinfo.sinstrument.spaceType == SORT_SPACE_TYPE_DISK) { max_diskUsed = rtl::max(spaceUsed, max_diskUsed); min_diskUsed = rtl::min(spaceUsed, min_diskUsed); if (min_diskUsed == MIN_DISK_USED) @@ -4471,13 +4499,13 @@ static void show_sort_info(SortState* sortstate, ExplainState* es) if (max_diskUsed != 0) { es->planinfo->m_staticInfo->set_plan_name(); + appendStringInfo(es->planinfo->m_staticInfo->info_str, + "Sort Method: %s %s: %ldkB ~ %ldkB\n", + sortMethod, + "Disk", + min_diskUsed, + max_diskUsed); } - appendStringInfo(es->planinfo->m_staticInfo->info_str, - "Sort Method: %s %s: %ldkB ~ %ldkB\n", - sortMethod, - "Disk", - min_diskUsed, - max_diskUsed); } else { if (max_memoryUsed != 0) { if (es->format == EXPLAIN_FORMAT_TEXT) @@ -4493,23 +4521,19 @@ static void show_sort_info(SortState* sortstate, ExplainState* es) } } } else { - char* sortMethod = NULL; - char* spaceType = NULL; - int spaceTypeId = 0; - if (es->analyze && sortstate->sort_Done && sortstate->sortMethodId >= (int)HEAPSORT && - sortstate->sortMethodId <= (int)STILLINPROGRESS && - (sortstate->spaceTypeId == SORT_IN_DISK || sortstate->spaceTypeId == SORT_IN_MEMORY)) { - sortMethod = sortmessage[sortstate->sortMethodId].sortName; - spaceTypeId = sortstate->spaceTypeId; - if (spaceTypeId == SORT_IN_DISK) - spaceType = "Disk"; - else - spaceType = "Memory"; + const char* sortMethod = NULL; + const char* spaceType = NULL; + 
if (es->analyze && sortstate->sort_Done && + sortstate->sinstrument.sortMethod != SORT_TYPE_NONE && + (sortstate->sinstrument.spaceType == SORT_SPACE_TYPE_DISK || + sortstate->sinstrument.spaceType == SORT_SPACE_TYPE_MEMORY)) { + sortMethod = tuplesort_method_name(sortstate->sinstrument.sortMethod); + spaceType = tuplesort_space_type_name(sortstate->sinstrument.spaceType); if (es->planinfo && es->planinfo->m_runtimeinfo) { es->planinfo->m_runtimeinfo->put(-1, -1, SORT_METHOD, PointerGetDatum(cstring_to_text(sortMethod))); es->planinfo->m_runtimeinfo->put(-1, -1, SORT_TYPE, PointerGetDatum(cstring_to_text(spaceType))); - es->planinfo->m_runtimeinfo->put(-1, -1, SORT_SPACE, Int64GetDatum(sortstate->spaceUsed)); + es->planinfo->m_runtimeinfo->put(-1, -1, SORT_SPACE, Int64GetDatum(sortstate->sinstrument.spaceUsed)); } if (es->detail == false && t_thrd.explain_cxt.explain_perf_mode != EXPLAIN_NORMAL && es->planinfo && @@ -4519,12 +4543,12 @@ static void show_sort_info(SortState* sortstate, ExplainState* es) "Sort Method: %s %s: %ldkB\n", sortMethod, spaceType, - sortstate->spaceUsed); + sortstate->sinstrument.spaceUsed); } else { if (es->format == EXPLAIN_FORMAT_TEXT) appendStringInfoSpaces(es->str, es->indent * 2); - show_detail_sortinfo(es, sortMethod, spaceType, sortstate->spaceUsed); + show_detail_sortinfo(es, sortMethod, spaceType, sortstate->sinstrument.spaceUsed); } } } @@ -4679,6 +4703,187 @@ static void show_llvm_info(const PlanState* planstate, ExplainState* es) } } +/* + * Incremental sort nodes sort in (a potentially very large number of) batches, + * so EXPLAIN ANALYZE needs to roll up the tuplesort stats from each batch into + * an intelligible summary. + */ +static void show_incremental_sort_group_info(IncrementalSortGroupInfo *groupInfo, + const char *groupLabel, bool indent, + ExplainState *es) +{ + ListCell *methodCell = NULL; + List *methodNames = NIL; + + /* Generate a list of sort methods used across all groups. 
*/ + for (int bit = 0; bit < NUM_TUPLESORTMETHODS; bit++) { + TuplesortMethod sortMethod = (TuplesortMethod)(1 << bit); + + if (groupInfo->sortMethods & sortMethod) { + const char *methodName = tuplesort_method_name(sortMethod); + methodNames = lappend(methodNames, (char *)(methodName)); + } + } + + if (es->format == EXPLAIN_FORMAT_TEXT) { + if (indent) { + appendStringInfoSpaces(es->str, es->indent * 2); + } + + appendStringInfo(es->str, "%s Groups: " INT64_FORMAT " Sort Method", groupLabel, + groupInfo->groupCount); + + /* plural/singular based on methodNames size */ + if (list_length(methodNames) > 1) { + appendStringInfo(es->str, "s: "); + } else { + appendStringInfo(es->str, ": "); + } + + foreach(methodCell, methodNames) { + appendStringInfo(es->str, "%s", (char *)methodCell->data.ptr_value); + if (methodCell->next != NULL) { + appendStringInfo(es->str, ", "); + } + } + + if (groupInfo->maxMemorySpaceUsed > 0) { + long avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount; + const char *spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY); + + appendStringInfo(es->str, " Average %s: %ldkB Peak %s: %ldkB", + spaceTypeName, avgSpace, + spaceTypeName, groupInfo->maxMemorySpaceUsed); + } + + if (groupInfo->maxDiskSpaceUsed > 0) { + long avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount; + const char *spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK); + + appendStringInfo(es->str, " Average %s: %ldkB Peak %s: %ldkB", + spaceTypeName, avgSpace, + spaceTypeName, groupInfo->maxDiskSpaceUsed); + } + + if (t_thrd.explain_cxt.explain_perf_mode != EXPLAIN_NORMAL && es->planinfo != NULL && + es->planinfo->m_staticInfo != NULL) { + if (indent) { + es->planinfo->m_staticInfo->set_plan_name(); + } + + appendStringInfo(es->planinfo->m_staticInfo->info_str, + "%s Groups: " INT64_FORMAT " Sort Method", + groupLabel, + groupInfo->groupCount); + + /* plural/singular based on methodNames size */ + if (list_length(methodNames) > 1) { + appendStringInfo(es->planinfo->m_staticInfo->info_str, "s: "); + } else { + appendStringInfo(es->planinfo->m_staticInfo->info_str, ": "); + } + + foreach(methodCell, methodNames) { + appendStringInfo(es->planinfo->m_staticInfo->info_str, + "%s", (char *)methodCell->data.ptr_value); + if (methodCell->next != NULL) { + appendStringInfo(es->planinfo->m_staticInfo->info_str, ", "); + } + } + + if (groupInfo->maxMemorySpaceUsed > 0) { + long avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount; + const char *spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY); + + appendStringInfo(es->planinfo->m_staticInfo->info_str, " %s: %ldkB ~ %ldkB", + spaceTypeName, avgSpace, + groupInfo->maxMemorySpaceUsed); + } + + if (groupInfo->maxDiskSpaceUsed > 0) { + long avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount; + const char *spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK); + + appendStringInfo(es->planinfo->m_staticInfo->info_str, " %s: %ldkB ~ %ldkB", + spaceTypeName, avgSpace, + groupInfo->maxDiskSpaceUsed); + } + appendStringInfo(es->planinfo->m_staticInfo->info_str, "\n"); + } + } else { + StringInfoData groupName; + + initStringInfo(&groupName); + appendStringInfo(&groupName, "%s Groups", groupLabel); + ExplainOpenGroup("Incremental Sort Groups", groupName.data, true, es); + ExplainPropertyInteger("Group Count", groupInfo->groupCount, es); + + ExplainPropertyList("Sort Methods Used", methodNames, es); + + if (groupInfo->maxMemorySpaceUsed > 0) { + long avgSpace = 
groupInfo->totalMemorySpaceUsed / groupInfo->groupCount; + const char *spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY); + StringInfoData memoryName; + + initStringInfo(&memoryName); + appendStringInfo(&memoryName, "Sort Space %s", spaceTypeName); + ExplainOpenGroup("Sort Space", memoryName.data, true, es); + + ExplainPropertyInteger("Average Sort Space Used", avgSpace, es); + ExplainPropertyInteger("Maximum Sort Space Used", + groupInfo->maxMemorySpaceUsed, es); + + ExplainCloseGroup("Sort Spaces", memoryName.data, true, es); + } + + if (groupInfo->maxDiskSpaceUsed > 0) { + long avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount; + const char *spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK); + StringInfoData diskName; + + initStringInfo(&diskName); + appendStringInfo(&diskName, "Sort Space %s", spaceTypeName); + ExplainOpenGroup("Sort Space", diskName.data, true, es); + + ExplainPropertyInteger("Average Sort Space Used", avgSpace, es); + ExplainPropertyInteger("Maximum Sort Space Used", + groupInfo->maxDiskSpaceUsed, es); + + ExplainCloseGroup("Sort Spaces", diskName.data, true, es); + } + + ExplainCloseGroup("Incremental Sort Groups", groupName.data, true, es); + } +} + +/* + * If it's EXPLAIN ANALYZE, show tuplesort stats for a incremental sort node + */ +static void show_incremental_sort_info(IncrementalSortState *incrsortstate, + ExplainState *es) +{ + IncrementalSortGroupInfo *fullsortGroupInfo; + IncrementalSortGroupInfo *prefixsortGroupInfo; + + fullsortGroupInfo = &incrsortstate->incsort_info.fullsortGroupInfo; + + if (!(es->analyze && fullsortGroupInfo->groupCount > 0)) { + return; + } + + show_incremental_sort_group_info(fullsortGroupInfo, "Full-sort", true, es); + prefixsortGroupInfo = &incrsortstate->incsort_info.prefixsortGroupInfo; + if (prefixsortGroupInfo->groupCount > 0) { + if (es->format == EXPLAIN_FORMAT_TEXT) { + appendStringInfo(es->str, "\n"); + } + show_incremental_sort_group_info(prefixsortGroupInfo, "Pre-sorted", true, es); + } + if (es->format == EXPLAIN_FORMAT_TEXT) { + appendStringInfo(es->str, "\n"); + } +} + /* * @Description: show datanode filenum and respill info * @in es: current explainstate @@ -5392,8 +5597,7 @@ static void show_vechash_info(VecHashJoinState* hashstate, ExplainState* es) sprintf(node_name, "%d", i); } #endif - - spaceUsed = (instr->sorthashinfo.spaceUsed + 1023) / 1024; + spaceUsed = (instr->sorthashinfo.sinstrument.spaceUsed + 1023) / 1024; spillSize = (instr->sorthashinfo.spill_size + 1023) / 1024; file_num = instr->sorthashinfo.hash_FileNum; spillNum = instr->sorthashinfo.hash_spillNum; @@ -5508,7 +5712,7 @@ static void show_vechash_info(VecHashJoinState* hashstate, ExplainState* es) if (!is_execute) is_execute = true; file_num = instr->sorthashinfo.hash_FileNum; - spaceUsed = (instr->sorthashinfo.spaceUsed + 1023) / 1024; + spaceUsed = (instr->sorthashinfo.sinstrument.spaceUsed + 1023) / 1024; max_file_num = rtl::max(max_file_num, file_num); min_file_num = rtl::min((int64)min_file_num, file_num); @@ -10877,11 +11081,14 @@ static inline void appendCSV(StringInfo buf, const char* data) * as arrays of targetlist indexes. If it's a sort key rather than a group * key, also pass sort operators/collations/nullsFirst arrays. 
*/ -static void show_sort_group_keys(PlanState* planstate, const char* qlabel, int nkeys, const AttrNumber* keycols, - const Oid* sortOperators, const Oid* collations, const bool* nullsFirst, List* ancestors, ExplainState* es) +static void show_sort_group_keys(PlanState* planstate, const char* qlabel, int nkeys, + int nPresortedKeys, const AttrNumber* keycols, + const Oid* sortOperators, const Oid* collations, const bool* nullsFirst, + List* ancestors, ExplainState* es) { Plan* plan = planstate->plan; List* context = NIL; + List* resultPresorted = NIL; List* result = NIL; StringInfoData sortkeybuf; bool useprefix = false; @@ -10925,6 +11132,9 @@ static void show_sort_group_keys(PlanState* planstate, const char* qlabel, int n &sortkeybuf, (Node*)target->expr, sortOperators[keyno], collations[keyno], nullsFirst[keyno]); /* Emit one property-list item per sort key */ result = lappend(result, pstrdup(sortkeybuf.data)); + if (keyno < nPresortedKeys) { + resultPresorted = lappend(resultPresorted, exprstr); + } } if (t_thrd.explain_cxt.explain_perf_mode != EXPLAIN_NORMAL && es->planinfo->m_verboseInfo) { @@ -10932,8 +11142,17 @@ static void show_sort_group_keys(PlanState* planstate, const char* qlabel, int n appendStringInfo(es->planinfo->m_verboseInfo->info_str, "%s: ", qlabel); ExplainPrettyList(result, es); + if (nPresortedKeys > 0) { + es->planinfo->m_verboseInfo->set_plan_name(); + + appendStringInfo(es->planinfo->m_verboseInfo->info_str, "%s: ", "Presorted Key"); + ExplainPrettyList(resultPresorted, es); + } } else { ExplainPropertyList(qlabel, result, es); + if (nPresortedKeys > 0) { + ExplainPropertyList("Presorted Key", resultPresorted, es); + } } } diff --git a/src/gausskernel/optimizer/path/costsize.cpp b/src/gausskernel/optimizer/path/costsize.cpp index 624ccdf475287312d7548b59bcd052ba523931cc..37db2b4cc657c13339904e52277a4bdda20e6454 100755 --- a/src/gausskernel/optimizer/path/costsize.cpp +++ b/src/gausskernel/optimizer/path/costsize.cpp @@ -2214,9 +2214,9 @@ void cost_recursive_union(Plan* runion, Plan* nrterm, Plan* rterm) } /* - * cost_sort - * Determines and returns the cost of sorting a relation, including - * the cost of reading the input data. + * cost_tuplesort + * Determines and returns the cost of sorting a relation using tuplesort, + * not including the cost of reading the input data. * * If the total volume of data to sort is less than sort_mem, we will do * an in-memory sort, which requires no I/O and about t*log2(t) tuple @@ -2225,11 +2225,11 @@ void cost_recursive_union(Plan* runion, Plan* nrterm, Plan* rterm) * If the total volume exceeds sort_mem, we switch to a tape-style merge * algorithm. There will still be about t*log2(t) tuple comparisons in * total, but we will also need to write and read each tuple once per - * merge pass. We expect about ceil(logM(r)) merge passes where r is the + * merge pass. We expect about ceil(logM(r)) merge passes where r is the * number of initial runs formed and M is the merge order used by tuplesort.c. 
- * Since the average initial run should be about twice sort_mem, we have - * disk traffic = 2 * relsize * ceil(logM(p / (2*sort_mem))) - * cpu = comparison_cost * t * log2(t) + * Since the average initial run should be about sort_mem, we have + * disk traffic = 2 * relsize * ceil(logM(p / sort_mem)) + * cpu = comparison_cost * t * log2(t) * * If the sort is bounded (i.e., only the first k result tuples are needed) * and k tuples can fit into sort_mem, we use a heap method that keeps only @@ -2239,41 +2239,26 @@ void cost_recursive_union(Plan* runion, Plan* nrterm, Plan* rterm) * accesses (XXX can't we refine that guess?) * * By default, we charge two operator evals per tuple comparison, which should - * be in the right ballpark in most cases. The caller can tweak this by + * be in the right ballpark in most cases. The caller can tweak this by * specifying nonzero comparison_cost; typically that's used for any extra * work that has to be done to prepare the inputs to the comparison operators. * - * 'pathkeys' is a list of sort keys - * 'input_cost' is the total cost for reading the input data * 'tuples' is the number of tuples in the relation * 'width' is the average tuple width in bytes * 'comparison_cost' is the extra cost per comparison, if any * 'sort_mem' is the number of kilobytes of work memory allowed for the sort * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound * 'mem_info' is operator max and min info used by memory control module - * - * NOTE: some callers currently pass NIL for pathkeys because they - * can't conveniently supply the sort keys. Since this routine doesn't - * currently do anything with pathkeys anyway, that doesn't matter... - * but if it ever does, it should react gracefully to lack of key data. - * (Actually, the thing we'd most likely be interested in is just the number - * of sort keys, which all callers *could* supply.) */ -void cost_sort(Path* path, List* pathkeys, Cost input_cost, double tuples, int width, Cost comparison_cost, - int sort_mem, double limit_tuples, bool col_store, int dop, OpMemInfo* mem_info, bool index_sort) +static void cost_tuplesort(Cost *startup_cost, Cost *run_cost, double tuples, int width, Cost comparison_cost, + int sort_mem, double limit_tuples, bool col_store, int dop, OpMemInfo *mem_info, + bool index_sort) { - Cost startup_cost = input_cost; - Cost run_cost = 0; double input_bytes = relation_byte_size(tuples, width, col_store, true, true, index_sort) / SET_DOP(dop); double output_bytes; double output_tuples; long sort_mem_bytes = sort_mem * 1024L / SET_DOP(dop); - dop = SET_DOP(dop); - - if (!u_sess->attr.attr_sql.enable_sort) - startup_cost += g_instance.cost_cxt.disable_cost; - /* * We want to be sure the cost of a sort is never estimated as zero, even * if passed-in tuple count is zero. Besides, mustn't do log(0)... @@ -2300,10 +2285,10 @@ void cost_sort(Path* path, List* pathkeys, Cost input_cost, double tuples, int w * * Assume about N log2 N comparisons */ - startup_cost += comparison_cost * tuples * LOG2(tuples); + *startup_cost = comparison_cost * tuples * LOG2(tuples); /* Disk costs */ - startup_cost += compute_sort_disk_cost(input_bytes, sort_mem_bytes); + *startup_cost += compute_sort_disk_cost(input_bytes, sort_mem_bytes); } else { if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes) { /* @@ -2312,14 +2297,16 @@ void cost_sort(Path* path, List* pathkeys, Cost input_cost, double tuples, int w * factor is a bit higher than for quicksort. 
Tweak it so that the * cost curve is continuous at the crossover point. */ - startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples); + *startup_cost = comparison_cost * tuples * LOG2(2.0 * output_tuples); } else { /* We'll use plain quicksort on all the input tuples */ - startup_cost += comparison_cost * tuples * LOG2(tuples); + *startup_cost = comparison_cost * tuples * LOG2(tuples); } } if (mem_info != NULL) { + dop = SET_DOP(dop); + mem_info->opMem = u_sess->opt_cxt.op_work_mem; mem_info->maxMem = output_bytes / 1024L * dop; mem_info->minMem = mem_info->maxMem / SORT_MAX_DISK_SIZE; @@ -2339,7 +2326,175 @@ void cost_sort(Path* path, List* pathkeys, Cost input_cost, double tuples, int w * here --- the upper LIMIT will pro-rate the run cost so we'd be double * counting the LIMIT otherwise. */ - run_cost += u_sess->attr.attr_sql.cpu_operator_cost * tuples; + *run_cost = u_sess->attr.attr_sql.cpu_operator_cost * tuples; +} + +/* + * cost_incremental_sort + * Determines and returns the cost of sorting a relation incrementally, when + * the input path is presorted by a prefix of the pathkeys. + * + * 'presorted_keys' is the number of leading pathkeys by which the input path + * is sorted. + * + * We estimate the number of groups into which the relation is divided by the + * leading pathkeys, and then calculate the cost of sorting a single group + * with tuplesort using cost_tuplesort(). + */ +void cost_incremental_sort(Path *path, PlannerInfo *root, List *pathkeys, int presorted_keys, Cost input_startup_cost, + Cost input_total_cost, double input_tuples, int width, Cost comparison_cost, int sort_mem, double limit_tuples) +{ + Cost startup_cost; + Cost run_cost; + Cost input_run_cost = input_total_cost - input_startup_cost; + double group_tuples; + double input_groups; + Cost group_startup_cost = 0; + Cost group_run_cost = 0; + Cost group_input_run_cost = 0; + List *presortedExprs = NIL; + ListCell *l; + int i = 0; + bool unknown_varno = false; + unsigned int num_datanodes = ng_get_dest_num_data_nodes(path); + + Assert(presorted_keys > 0 && presorted_keys < list_length(pathkeys)); + + /* + * We want to be sure the cost of a sort is never estimated as zero, even + * if passed-in tuple count is zero. Besides, mustn't do log(0)... + */ + if (input_tuples < 2.0) + input_tuples = 2.0; + + /* Default estimate of number of groups, capped to one group per row. */ + input_groups = Min(input_tuples, DEFAULT_NUM_DISTINCT); + + /* + * Extract presorted keys as list of expressions. + * + * We need to be careful about Vars containing "varno 0" which might have + * been introduced by generate_append_tlist, which would confuse + * estimate_num_groups (in fact it'd fail for such expressions). See + * recurse_set_operations which has to deal with the same issue. + * + * Unlike recurse_set_operations we can't access the original target list + * here, and even if we could it's not very clear how useful would that be + * for a set operation combining multiple tables. So we simply detect if + * there are any expressions with "varno 0" and use the default + * DEFAULT_NUM_DISTINCT in that case. + * + * We might also use either 1.0 (a single group) or input_tuples (each row + * being a separate group), pretty much the worst and best case for + * incremental sort. But those are extreme cases and using something in + * between seems reasonable. 
Furthermore, generate_append_tlist is used + * for set operations, which are likely to produce mostly unique output + * anyway - from that standpoint the DEFAULT_NUM_DISTINCT is defensive + * while maintaining lower startup cost. + */ + foreach(l, pathkeys) { + PathKey *key = (PathKey *) lfirst(l); + EquivalenceMember *member = linitial_node(EquivalenceMember, + key->pk_eclass->ec_members); + + /* + * Check if the expression contains Var with "varno 0" so that we + * don't call estimate_num_groups in that case. + */ + if (bms_is_member(0, pull_varnos((Node *) member->em_expr))) { + unknown_varno = true; + break; + } + + /* expression not containing any Vars with "varno 0" */ + presortedExprs = lappend(presortedExprs, member->em_expr); + + i++; + if (i >= presorted_keys) { + break; + } + } + + /* Estimate the number of groups with equal presorted keys. */ + if (!unknown_varno) { + input_groups = estimate_num_groups(root, presortedExprs, input_tuples, + num_datanodes); + } + + group_tuples = input_tuples / input_groups; + group_input_run_cost = input_run_cost / input_groups; + + /* + * Estimate the average cost of sorting of one group where presorted keys + * are equal. + */ + cost_tuplesort(&group_startup_cost, &group_run_cost, + group_tuples, width, comparison_cost, sort_mem, + limit_tuples, root->glob->vectorized, + 1, NULL, false); + + /* + * Startup cost of incremental sort is the startup cost of its first group + * plus the cost of its input. + */ + startup_cost = group_startup_cost + input_startup_cost + group_input_run_cost; + + /* + * After we started producing tuples from the first group, the cost of + * producing all the tuples is given by the cost to finish processing this + * group, plus the total cost to process the remaining groups, plus the + * remaining cost of input. + */ + run_cost = group_run_cost + (group_run_cost + group_startup_cost) * + (input_groups - 1) + group_input_run_cost * (input_groups - 1); + + /* + * Incremental sort adds some overhead by itself. Firstly, it has to + * detect the sort groups. This is roughly equal to one extra copy and + * comparison per tuple. + */ + run_cost += (u_sess->attr.attr_sql.cpu_tuple_cost + comparison_cost) * input_tuples; + + /* + * Additionally, we charge double cpu_tuple_cost for each input group to + * account for the tuplesort_reset that's performed after each group. + */ + run_cost += 2.0 * u_sess->attr.attr_sql.cpu_tuple_cost * input_groups; + + path->rows = input_tuples; + path->startup_cost = startup_cost; + path->total_cost = startup_cost + run_cost; +} + +/* + * cost_sort + * Determines and returns the cost of sorting a relation, including + * the cost of reading the input data. + * + * NOTE: some callers currently pass NIL for pathkeys because they + * can't conveniently supply the sort keys. Since this routine doesn't + * currently do anything with pathkeys anyway, that doesn't matter... + * but if it ever does, it should react gracefully to lack of key data. + * (Actually, the thing we'd most likely be interested in is just the number + * of sort keys, which all callers *could* supply.) 
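cost_incremental_sort above splits the input into groups that share the presorted prefix and charges one small tuplesort per group; the startup cost covers only the first group, while the remaining groups, the per-tuple group-boundary detection, and the per-group tuplesort_reset are charged to run cost. A minimal numeric sketch of that decomposition, assuming default cost constants and a simplified quicksort-only per-group estimate (not the kernel code):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    struct Cost2 {
        double startup;
        double run;
    };

    static const double kCpuOperatorCost = 0.0025;   /* assumed defaults */
    static const double kCpuTupleCost = 0.01;
    static const double kComparisonCost = 2.0 * kCpuOperatorCost;

    static Cost2 sketch_incremental_sort_cost(double input_tuples, double input_groups,
                                              double input_startup, double input_total)
    {
        double group_tuples = input_tuples / input_groups;
        double group_input_run = (input_total - input_startup) / input_groups;

        /* per-group tuplesort estimate: ~ n * log2(n) comparisons, one op per output tuple */
        double group_startup = kComparisonCost * group_tuples * std::log2(std::max(group_tuples, 2.0));
        double group_run = kCpuOperatorCost * group_tuples;

        Cost2 c;
        /* startup: read and sort only the first group */
        c.startup = input_startup + group_input_run + group_startup;
        /* run: finish the first group, then process the remaining groups */
        c.run = group_run +
                (group_startup + group_run) * (input_groups - 1.0) +
                group_input_run * (input_groups - 1.0);
        /* overheads: group detection per tuple, tuplesort_reset per group */
        c.run += (kCpuTupleCost + kComparisonCost) * input_tuples;
        c.run += 2.0 * kCpuTupleCost * input_groups;
        return c;
    }

    int main()
    {
        Cost2 c = sketch_incremental_sort_cost(1e6, 1000.0, 0.0, 25000.0);
        std::printf("startup=%.1f total=%.1f\n", c.startup, c.startup + c.run);
        return 0;
    }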
+ */ +void cost_sort(Path* path, List* pathkeys, Cost input_cost, double tuples, int width, Cost comparison_cost, + int sort_mem, double limit_tuples, bool col_store, int dop, OpMemInfo* mem_info, bool index_sort) +{ + Cost startup_cost; + Cost run_cost; + + cost_tuplesort(&startup_cost, &run_cost, + tuples, width, + comparison_cost, sort_mem, + limit_tuples, + col_store, dop, + mem_info, index_sort); + + if (!u_sess->attr.attr_sql.enable_sort) + startup_cost += g_instance.cost_cxt.disable_cost; + + startup_cost += input_cost; path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; diff --git a/src/gausskernel/optimizer/path/pathkeys.cpp b/src/gausskernel/optimizer/path/pathkeys.cpp index 2c67b3b988de0fccf057715aad1c0539fefa99ce..29ba51c5547dd2fe50e22e5c215ecb3948d48034 100644 --- a/src/gausskernel/optimizer/path/pathkeys.cpp +++ b/src/gausskernel/optimizer/path/pathkeys.cpp @@ -438,6 +438,65 @@ bool pathkeys_contained_in(List* keys1, List* keys2) return false; } +/* + * pathkeys_count_contained_in + * Same as pathkeys_contained_in, but also sets length of longest + * common prefix of keys1 and keys2. + */ +bool pathkeys_count_contained_in(List *keys1, List *keys2, int *n_common) +{ + int n = 0; + ListCell *key1; + ListCell *key2; + + /* + * See if we can avoiding looping through both lists. This optimization + * gains us several percent in planning time in a worst-case test. + */ + if (keys1 == keys2) { + *n_common = list_length(keys1); + return true; + } else if (keys1 == NIL) { + *n_common = 0; + return true; + } else if (keys2 == NIL) { + *n_common = 0; + return false; + } + + /* + * If both lists are non-empty, iterate through both to find out how many + * items are shared. + */ + forboth(key1, keys1, key2, keys2) { + PathKey *pathkey1 = (PathKey *)lfirst(key1); + PathKey *pathkey2 = (PathKey *)lfirst(key2); + + if (pathkey1 == NULL && pathkey2 != NULL) { + /* no need to keep looking */ + *n_common = n; + return false; + } + if (pathkey1 != NULL && pathkey2 == NULL) { + /* no need to keep looking */ + *n_common = n; + return false; + } + if (pathkey1->type != pathkey2->type || !OpFamilyEquals(pathkey1->pk_opfamily, pathkey2->pk_opfamily) || + pathkey1->pk_eclass != pathkey2->pk_eclass || pathkey1->pk_strategy != pathkey2->pk_strategy || + pathkey1->pk_nulls_first != pathkey2->pk_nulls_first) { + /* no need to keep looking */ + *n_common = n; + return false; + } + n++; + } + + /* If we ended with a null value, then we've processed the whole list. */ + *n_common = n; + return (key1 == NULL); +} + /* * get_cheapest_path_for_pathkeys * Find the cheapest path (according to the specified criterion) that @@ -495,8 +554,13 @@ Path* get_cheapest_fractional_path_for_pathkeys(List* paths, List* pathkeys, Rel /* * Since cost comparison is a lot cheaper than pathkey comparison, do * that first. (XXX is that still true?) + * + * NOTE: we should prioritize hint value comparison like what we did in + * generate_cheapest_and_sorted_path. 
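pathkeys_count_contained_in above turns the boolean containment test into a longest-common-prefix count, which is what later lets the planner consider an incremental sort when only the leading keys match. The same walk, sketched over two plain lists with integer identifiers standing in for canonical PathKey pointers:

    #include <cstdio>
    #include <vector>

    static bool count_contained_in(const std::vector<int>& keys1,
                                   const std::vector<int>& keys2,
                                   int* n_common)
    {
        size_t n = 0;
        while (n < keys1.size() && n < keys2.size() && keys1[n] == keys2[n]) {
            n++;
        }
        *n_common = static_cast<int>(n);
        /* contained iff every key of keys1 was matched */
        return n == keys1.size();
    }

    int main()
    {
        int n_common = 0;
        /* ORDER BY (a, b) against a path sorted by (a) alone: not contained,
         * but one presorted key, which is the case incremental sort targets. */
        bool contained = count_contained_in({1, 2}, {1}, &n_common);
        std::printf("contained=%d presorted_keys=%d\n", contained, n_common);
        return 0;
    }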
*/ - if (matched_path != NULL && compare_fractional_path_costs(matched_path, path, fraction) <= 0) + if (matched_path != NULL && + (matched_path->hint_value >= path->hint_value || + compare_fractional_path_costs(matched_path, path, fraction) <= 0)) continue; if (pathkeys_contained_in(pathkeys, path->pathkeys) && bms_is_subset(PATH_REQ_OUTER(path), required_outer)) @@ -1526,24 +1590,24 @@ static bool right_merge_direction(PlannerInfo* root, PathKey* pathkey) * Count the number of pathkeys that are useful for meeting the * query's requested output ordering. * - * Unlike merge pathkeys, this is an all-or-nothing affair: it does us - * no good to order by just the first key(s) of the requested ordering. - * So the result is always either 0 or list_length(root->query_pathkeys). + * Because we the have the possibility of incremental sort, a prefix list of + * keys is potentially useful for improving the performance of the requested + * ordering. Thus we return 0, if no valuable keys are found, or the number + * of leading keys shared by the list and the requested ordering.. */ static int pathkeys_useful_for_ordering(PlannerInfo* root, List* pathkeys) { + int n_common_pathkeys; + if (root->query_pathkeys == NIL) return 0; /* no special ordering requested */ if (pathkeys == NIL) return 0; /* unordered path */ - if (pathkeys_contained_in(root->query_pathkeys, pathkeys)) { - /* It's useful ... or at least the first N keys are */ - return list_length(root->query_pathkeys); - } + (void)pathkeys_count_contained_in(root->query_pathkeys, pathkeys, &n_common_pathkeys); - return 0; /* path ordering not useful */ + return n_common_pathkeys; /* path ordering not useful */ } /* diff --git a/src/gausskernel/optimizer/plan/createplan.cpp b/src/gausskernel/optimizer/plan/createplan.cpp index 982af13e193ea7b2d425948af0c7f85852c76f0d..3071386439b4731ac4880cb511f3d0519018eebf 100755 --- a/src/gausskernel/optimizer/plan/createplan.cpp +++ b/src/gausskernel/optimizer/plan/createplan.cpp @@ -1684,14 +1684,11 @@ static Material* create_material_plan(PlannerInfo* root, MaterialPath* best_path plan = make_material(subplan, best_path->materialize_all); plan->plan.hasUniqueResults = subplan->hasUniqueResults; + plan->plan.ispwj = root->isPartIteratorPlanning; copy_path_costsize(&plan->plan, (Path*)best_path); copy_mem_info(&plan->mem_info, &best_path->mem_info); - if (root->isPartIteratorPlanning) { - plan->plan.ispwj = true; - } - return plan; } @@ -3549,6 +3546,7 @@ static void ModifyWorktableWtParam(Node* planNode, int oldWtParam, int newWtPara case T_PartIterator: case T_SetOp: case T_Sort: + case T_IncrementalSort: case T_Stream: case T_Unique: case T_WindowAgg: { @@ -4773,6 +4771,7 @@ static void search_var_and_mark_bloomfilter(PlannerInfo* root, Expr* expr, Plan* } case T_Material: case T_Sort: + case T_IncrementalSort: case T_Unique: case T_SetOp: case T_Limit: @@ -7314,15 +7313,59 @@ Sort* make_sort(PlannerInfo* root, Plan* lefttree, int numCols, AttrNumber* sort plan->righttree = NULL; plan->hasUniqueResults = lefttree->hasUniqueResults; plan->dop = lefttree->dop; + plan->ispwj = root->isPartIteratorPlanning; node->numCols = numCols; node->sortColIdx = sortColIdx; node->sortOperators = sortOperators; node->collations = collations; node->nullsFirst = nullsFirst; - if (root->isPartIteratorPlanning) { - node->plan.ispwj = true; - } + return node; +} + +/* + * make_incrementalsort --- basic routine to build an IncrementalSort plan node + * + * Caller must have built the sortColIdx, sortOperators, collations, and + * 
nullsFirst arrays already. + */ +IncrementalSort *make_incrementalsort(PlannerInfo* root, Plan *lefttree, List *pathkeys, + int numCols, int nPresortedCols, AttrNumber *sortColIdx, + Oid *sortOperators, Oid *collations, bool *nullsFirst, + double limit_tuples) +{ + IncrementalSort *node; + Plan *plan; + Path sort_path; /* dummy for result of cost_sort */ + int width = get_plan_actual_total_width(lefttree, + root->glob->vectorized, + OP_SORT); + + node = makeNode(IncrementalSort); + plan = &node->sort.plan; + + /* For stream plan node info */ + inherit_plan_locator_info((Plan*)node, lefttree); + + copy_plan_costsize(plan, lefttree); /* only care about copying size */ + cost_incremental_sort(&sort_path, root, pathkeys, nPresortedCols, + lefttree->startup_cost, lefttree->total_cost, + PLAN_LOCAL_ROWS(lefttree), width, 0.0, + u_sess->opt_cxt.op_work_mem, limit_tuples); + + plan->startup_cost = sort_path.startup_cost; + plan->total_cost = sort_path.total_cost; + plan->targetlist = lefttree->targetlist; + plan->qual = NIL; + plan->lefttree = lefttree; + plan->righttree = NULL; + plan->ispwj = root->isPartIteratorPlanning; + node->nPresortedCols = nPresortedCols; + node->sort.numCols = numCols; + node->sort.sortColIdx = sortColIdx; + node->sort.sortOperators = sortOperators; + node->sort.collations = collations; + node->sort.nullsFirst = nullsFirst; return node; } @@ -7715,6 +7758,41 @@ Sort* make_sort_from_pathkeys(PlannerInfo* root, Plan* lefttree, List* pathkeys, return make_sort(root, lefttree, numsortkeys, sortColIdx, sortOperators, collations, nullsFirst, limit_tuples); } +/* + * make_incrementalsort_from_pathkeys + * Create sort plan to sort according to given pathkeys + * + * 'lefttree' is the node which yields input tuples + * 'pathkeys' is the list of pathkeys by which the result is to be sorted + * 'relids' is the set of relations required by prepare_sort_from_pathkeys() + * 'nPresortedCols' is the number of presorted columns in input tuples + */ +IncrementalSort *make_incrementalsort_from_pathkeys(PlannerInfo* root, Plan *lefttree, List *pathkeys, + int nPresortedCols, double limit_tuples) +{ + int numsortkeys; + AttrNumber *sortColIdx; + Oid *sortOperators; + Oid *collations; + bool *nullsFirst; + + /* Compute sort column info, and adjust lefttree as needed */ + lefttree = prepare_sort_from_pathkeys(root, lefttree, pathkeys, + NULL, + NULL, + false, + &numsortkeys, + &sortColIdx, + &sortOperators, + &collations, + &nullsFirst); + + /* Now build the Sort node */ + return make_incrementalsort(root, lefttree, pathkeys, numsortkeys, + nPresortedCols, sortColIdx, sortOperators, + collations, nullsFirst, limit_tuples); +} + /* * make_sort_from_sortclauses * Create sort plan to sort according to given sortclauses @@ -7855,6 +7933,66 @@ SortGroup* make_sort_group_from_groupcols(PlannerInfo* root, List* groupcls, Att return make_sortgroup(root, lefttree, numsortkeys, sortColIdx, sortOperators, collations, nullsFirst, dNumGroup); } +/* + * make_incrementalsort_from_groupcols + * Create incrementalsort plan to sort based on grouping columns + * + * 'groupcls' is the list of SortGroupClauses + * 'grpColIdx' gives the column numbers to use + * + * This might look like it could be merged with make_sort_from_sortclauses, + * but presently we *must* use the grpColIdx[] array to locate sort columns, + * because the child plan's tlist is not marked with ressortgroupref info + * appropriate to the grouping node. So, only the sort ordering info + * is used from the SortGroupClause entries. 
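make_incrementalsort only records numCols and nPresortedCols; what those two numbers imply at execution time is easiest to see on a toy example, where rows already ordered by the first key are cut into equal-prefix groups and only the remaining key is sorted inside each group. This illustrates the idea only, not the IncrementalSort executor node:

    #include <algorithm>
    #include <cstdio>
    #include <utility>
    #include <vector>

    using Row = std::pair<int, int>;

    /* Assumes the input is already sorted by .first (one presorted column);
     * only .second still needs ordering within each equal-prefix group. */
    static void incremental_sort_by_second(std::vector<Row>& rows)
    {
        size_t group_start = 0;
        for (size_t i = 1; i <= rows.size(); i++) {
            /* group boundary: prefix value changes (or input exhausted) */
            if (i == rows.size() || rows[i].first != rows[group_start].first) {
                std::sort(rows.begin() + group_start, rows.begin() + i,
                          [](const Row& a, const Row& b) { return a.second < b.second; });
                group_start = i;
            }
        }
    }

    int main()
    {
        std::vector<Row> rows = {{1, 9}, {1, 3}, {2, 7}, {2, 1}, {2, 4}};
        incremental_sort_by_second(rows);
        for (const Row& r : rows) {
            std::printf("(%d,%d) ", r.first, r.second);
        }
        std::printf("\n");
        return 0;
    }

Because each group is sorted on its own, only one group has to be buffered before the first row can be returned, which is where the lower startup cost in cost_incremental_sort comes from.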
+ */ +IncrementalSort* make_incrementalsort_from_groupcols(PlannerInfo* root, + List* groupcls, + AttrNumber* grpColIdx, + Plan* lefttree, + List* pathkeys, + int nPresortedCols) +{ + List* sub_tlist = lefttree->targetlist; + ListCell* l = NULL; + int numsortkeys; + AttrNumber* sortColIdx = NULL; + Oid* sortOperators = NULL; + Oid* collations = NULL; + bool* nullsFirst = NULL; + + /* Convert list-ish representation to arrays wanted by executor */ + numsortkeys = list_length(groupcls); + sortColIdx = (AttrNumber*)palloc(numsortkeys * sizeof(AttrNumber)); + sortOperators = (Oid*)palloc(numsortkeys * sizeof(Oid)); + collations = (Oid*)palloc(numsortkeys * sizeof(Oid)); + nullsFirst = (bool*)palloc(numsortkeys * sizeof(bool)); + + numsortkeys = 0; + foreach (l, groupcls) { + SortGroupClause* grpcl = (SortGroupClause*)lfirst(l); + TargetEntry* tle = get_tle_by_resno(sub_tlist, grpColIdx[numsortkeys]); + + if (tle == NULL) { + /* just break if we cannot find TargetEntry for SortGroupClause */ + ereport(ERROR, + (errmodule(MOD_OPT), + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("fail to find TargetEntry referenced by SortGroupClause")))); + } + + sortColIdx[numsortkeys] = tle->resno; + sortOperators[numsortkeys] = grpcl->sortop; + collations[numsortkeys] = exprCollation((Node*)tle->expr); + nullsFirst[numsortkeys] = grpcl->nulls_first; + numsortkeys++; + } + + return make_incrementalsort(root, lefttree, pathkeys, numsortkeys, + nPresortedCols, sortColIdx, sortOperators, + collations, nullsFirst, -1.0); +} + /* * make_sort_from_targetlist * Create sort plan to sort based on input plan's targetlist @@ -8726,7 +8864,7 @@ static Plan* parallel_limit_sort( * When sort + limit is parallelized, we need to add another sort * to make sure the data we send to CN is sorted. */ - if (root->sort_pathkeys && (IsA(lefttree, Sort) || IsA(lefttree, VecSort))) { + if (root->sort_pathkeys && (IsA(lefttree, Sort) || IsA(lefttree, IncrementalSort) || IsA(lefttree, VecSort))) { plan = (Plan*)make_limit(root, lefttree, limitOffset, limitCount, offset_est, count_est, false); plan = create_local_gather(plan); plan = (Plan*)make_sort_from_pathkeys(root, plan, root->sort_pathkeys, -1.0); @@ -9702,6 +9840,7 @@ bool is_projection_capable_plan(Plan* plan) case T_Hash: case T_Material: case T_Sort: + case T_IncrementalSort: case T_SortGroup: case T_Unique: case T_SetOp: diff --git a/src/gausskernel/optimizer/plan/planagg.cpp b/src/gausskernel/optimizer/plan/planagg.cpp index 4cfa9741f24e64fe1b54b529a3bab949b2252674..f24c52ec08171ffb01fda5321752d8684c2a75df 100644 --- a/src/gausskernel/optimizer/plan/planagg.cpp +++ b/src/gausskernel/optimizer/plan/planagg.cpp @@ -223,6 +223,8 @@ Plan* optimize_minmax_aggregates(PlannerInfo* root, List* tlist, const AggClause Plan* sub_plan = NULL; Node* hqual = NULL; ListCell* lc = NULL; + Distribution* distribution; + int total_hint_value; errno_t rc; /* Nothing to do if preprocess_minmax_aggs rejected the query */ @@ -239,15 +241,23 @@ Plan* optimize_minmax_aggregates(PlannerInfo* root, List* tlist, const AggClause * Note that we don't include evaluation cost of the tlist here; this is * OK since it isn't included in best_path's cost either, and should be * the same in either case. + * + * And, like all other comparison functions, we need to make sure the hint + * value got considered first. 
*/ + total_hint_value = 0; total_cost = 0; foreach (lc, root->minmax_aggs) { MinMaxAggInfo* mminfo = (MinMaxAggInfo*)lfirst(lc); + total_hint_value += mminfo->path->hint_value; total_cost += mminfo->pathcost; } - Distribution* distribution = ng_get_dest_distribution(best_path); + if (total_hint_value < best_path->hint_value) + return NULL; /* hinted less (but we do prefer minmax paths) */ + + distribution = ng_get_dest_distribution(best_path); ng_copy_distribution(&agg_path.distribution, distribution); cost_agg(&agg_path, root, @@ -498,9 +508,7 @@ static bool build_minmax_path(PlannerInfo* root, MinMaxAggInfo* mminfo, Oid eqop TargetEntry* tle = NULL; NullTest* ntest = NULL; SortGroupClause* sortcl = NULL; - Path* cheapest_path = NULL; Path* sorted_path = NULL; - double dNumGroups[2] = {1, 1}; Cost path_cost; double path_fraction; errno_t errorno = EOK; @@ -575,8 +583,6 @@ static bool build_minmax_path(PlannerInfo* root, MinMaxAggInfo* mminfo, Oid eqop * Generate the best paths for this query, telling query_planner that we * have LIMIT 1. */ - - /* Make tuple_fraction, limit_tuples accessible to lower-level routines */ subroot->tuple_fraction = 1.0; subroot->limit_tuples = 1.0; @@ -587,37 +593,28 @@ static bool build_minmax_path(PlannerInfo* root, MinMaxAggInfo* mminfo, Oid eqop final_rel = query_planner(subroot, parse->targetList, standard_qp_callback, &qp_extra); - /* - * In the following, generate the best unsorted and presorted paths for - * this Query (but note there may not be any presorted path). + /* + * For partitioned index scans, try keeping the pathkey by adding a + * Sort node on top of the PartIterator. */ - bool has_groupby = true; - - /* First of all, estimate the number of groups in the query. */ - has_groupby = get_number_of_groups(subroot, - final_rel, - dNumGroups); - - /* Then update the tuple_fraction by the number of groups in the query. */ - update_tuple_fraction(subroot, - final_rel, - dNumGroups); - - /* Partition table optimization */ if (subroot->sort_pathkeys) { get_pathkeys_for_partiteratorpath(final_rel, (Expr*)mminfo->target); } - /* - * Finally, generate the best unsorted and presorted paths for - * this Query. + /* + * Get the best presorted path, that being the one that's cheapest for + * fetching just one row. If there's no such path, fail. */ - generate_cheapest_and_sorted_path(subroot, - final_rel, - &cheapest_path, - &sorted_path, - dNumGroups, - has_groupby); + if (RELOPTINFO_LOCAL_FIELD(subroot, final_rel, rows) > 1.0) + path_fraction = 1.0 / RELOPTINFO_LOCAL_FIELD(subroot, final_rel, rows); + else + path_fraction = 1.0; + + sorted_path = + get_cheapest_fractional_path_for_pathkeys(final_rel->pathlist, + subroot->query_pathkeys, + NULL, + path_fraction); /* @@ -627,24 +624,17 @@ static bool build_minmax_path(PlannerInfo* root, MinMaxAggInfo* mminfo, Oid eqop * simplifies life for grouping_planner, so leave it be.) */ if (sorted_path == NULL) { - if (cheapest_path && pathkeys_contained_in(subroot->sort_pathkeys, cheapest_path->pathkeys)) - sorted_path = cheapest_path; - else - return false; + return false; } /* * Determine cost to get just the first row of the presorted path. * - * Note: cost calculation here should - * match compare_fractional_path_costs(). + * Note: cost calculation here should match + * compare_fractional_path_costs(). 
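The first-row costing used here, and by compare_fractional_path_costs, interpolates between a path's startup and total cost. A minimal sketch with an assumed two-field struct (not the planner's Path) shows why a low-startup path wins for MIN/MAX even when its total cost is large:

    #include <cstdio>

    struct PathCosts {
        double startup_cost;
        double total_cost;
    };

    /* cost of fetching a fraction of the output: startup plus that
     * fraction of the remaining run cost */
    static double fractional_cost(const PathCosts& p, double fraction)
    {
        return p.startup_cost + fraction * (p.total_cost - p.startup_cost);
    }

    int main()
    {
        PathCosts sorted_seqscan = {120.0, 150.0};  /* pays the sort up front */
        PathCosts ordered_index  = {0.5, 400.0};    /* streams rows cheaply */

        double f = 1.0 / 1000.0;                    /* LIMIT 1 over ~1000 rows */
        std::printf("sorted=%.2f index=%.2f\n",
                    fractional_cost(sorted_seqscan, f),
                    fractional_cost(ordered_index, f));
        return 0;
    }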
*/ - if (RELOPTINFO_LOCAL_FIELD(subroot, sorted_path->parent, rows) > 1.0) - path_fraction = 1.0 / RELOPTINFO_LOCAL_FIELD(subroot, sorted_path->parent, rows); - else - path_fraction = 1.0; - - path_cost = sorted_path->startup_cost + path_fraction * (sorted_path->total_cost - sorted_path->startup_cost); + path_cost = sorted_path->startup_cost + + path_fraction * (sorted_path->total_cost - sorted_path->startup_cost); /* Save state for further processing */ mminfo->subroot = subroot; diff --git a/src/gausskernel/optimizer/plan/planmain.cpp b/src/gausskernel/optimizer/plan/planmain.cpp index 6c06838b9b698fbbb4b52820cd936f89dbcf371d..3572896ff45fa652bbcd7720aed85bbc7668c6df 100755 --- a/src/gausskernel/optimizer/plan/planmain.cpp +++ b/src/gausskernel/optimizer/plan/planmain.cpp @@ -36,9 +36,6 @@ #include "optimizer/orclauses.h" #include "utils/selfuncs.h" -/* Local functions */ -static void debug_print_log(PlannerInfo* root, Path* sortedpath, int debug_log_level); - /* * query_planner * Generate a path (that is, a simplified plan) for a basic query, @@ -444,8 +441,8 @@ bool get_number_of_groups(PlannerInfo* root, * whether a bounded sort can be used at runtime. */ void update_tuple_fraction(PlannerInfo* root, - RelOptInfo* final_rel, - double* num_groups) + RelOptInfo* final_rel, + double* num_groups) { Query* parse = root->parse; double tuple_fraction = root->tuple_fraction; @@ -540,6 +537,40 @@ void update_tuple_fraction(PlannerInfo* root, root->limit_tuples = limit_tuples; } +/* + * Debugging message macros for generate_cheapest_and_sorted_path() + * + * XXX these wrappers are here to reduce the code complexity for + * logging upper paths in general. + */ +#define debug1_accept_upper(_kind_, root, path) \ + do { \ + if (log_min_messages <= DEBUG1) { \ + ereport(DEBUG1, (errmodule(MOD_OPT), \ + errmsg("A new %s path is accepted with " \ + "cost = %lf .. %lf", \ + _kind_, \ + path->startup_cost, \ + path->total_cost))); \ + debug1_print_new_path(root, path, false); \ + } \ + } while(0) + +/* not accepted */ +#define debug1_reject_upper(_kind_, root, path) \ + do { \ + if (log_min_messages <= DEBUG1) { \ + ereport(DEBUG1, (errmodule(MOD_OPT), \ + errmsg("A new %s path is not accepted with " \ + "cost = %lf .. %lf", \ + _kind_, \ + path->startup_cost, \ + path->total_cost))); \ + debug1_print_new_path(root, path, false); \ + } \ + } while(0) + + /* * generate_cheapest_and_sorted_path * Generate the best unsorted and presorted paths for this Query. @@ -554,19 +585,24 @@ void update_tuple_fraction(PlannerInfo* root, * Output parameters: * *cheapest_path receives the overall-cheapest path for the query * *sorted_path receives the cheapest presorted path for the query, - * if any (NULL if there is no useful presorted path). + * if any (NULL if there is no useful presorted path). + * *partial_sorted_path receives the cheapest partial sorted path + * for the query, if any (NULL if there is no useful + * partial sorted path). 
*/ -void generate_cheapest_and_sorted_path(PlannerInfo* root, - RelOptInfo* final_rel, - Path** cheapest_path, - Path** sorted_path, - double* num_groups, - bool has_groupby) +void generate_cheapest_and_sorted_path(PlannerInfo* root, + RelOptInfo* final_rel, + Path** cheapest_path, + Path** sorted_path, + Path** partial_sorted_path, + double* num_groups, + bool has_groupby) { - Path* cheapestpath = NULL; - Path* sortedpath = NULL; - double tuple_fraction = root->tuple_fraction; - double limit_tuples = root->limit_tuples; + Path* final_cheapest_path = NULL; + Path* presorted_path = NULL; + Path* postsorted_path = NULL; + Path dummy_path; + ListCell* l; /* * Pick out the cheapest-total path and the cheapest presorted path for @@ -582,72 +618,166 @@ void generate_cheapest_and_sorted_path(PlannerInfo* root, * decides to use hashed aggregation, so we return it separately even if * this routine thinks the presorted path is the winner. */ - cheapestpath = get_cheapest_path(root, final_rel, num_groups, has_groupby); + final_cheapest_path = get_cheapest_path(root, final_rel, num_groups, has_groupby); /* * For these cases, we don't need sorted path: * (1) random plan; (2) subplan; (3) global path */ - if (OPTIMIZE_PLAN != u_sess->attr.attr_sql.plan_mode_seed || - (root->parent_root != NULL && root->parent_root->plan_params != NIL) || - cheapestpath != linitial(final_rel->cheapest_total_path)) - sortedpath = NULL; - else - sortedpath = - get_cheapest_fractional_path_for_pathkeys(final_rel->pathlist, root->query_pathkeys, NULL, tuple_fraction); - - /* Don't return same path in both guises; just wastes effort */ - if (sortedpath == NULL || sortedpath == cheapestpath || sortedpath->hint_value < cheapestpath->hint_value) { - sortedpath = NULL; + if (u_sess->attr.attr_sql.plan_mode_seed != OPTIMIZE_PLAN || + (root->parent_root && root->parent_root->plan_params) || + final_cheapest_path != linitial(final_rel->cheapest_total_path)) { + *cheapest_path = final_cheapest_path; + *sorted_path = NULL; + *partial_sorted_path = NULL; + return; } + /* Setup dummy path */ + dummy_path.startup_cost = final_cheapest_path->startup_cost; + dummy_path.total_cost = final_cheapest_path->total_cost; + /* - * Forget about the presorted path if it would be cheaper to sort the - * cheapest-total path. Here we need consider only the behavior at the - * tuple fraction point. + * Try add explicit sorts on top of generated paths. 
     */
-    if (sortedpath != NULL) {
-        Path sort_path; /* dummy for result of cost_sort */
+    foreach (l, final_rel->pathlist) {
+        Path* input_path = (Path*)lfirst(l);
+        Path local_dummy_path; /* dummy for sort compare */
+        int presorted_keys;
+        bool is_sorted;
+
+        /* Skip path if parameterization is not satisfied */
+        if (!bms_is_subset(PATH_REQ_OUTER(input_path), NULL)) {
+            continue;
+        }
+
+        is_sorted = pathkeys_count_contained_in(root->query_pathkeys,
+                                                input_path->pathkeys,
+                                                &presorted_keys);
-        if (root->query_pathkeys == NIL || pathkeys_contained_in(root->query_pathkeys, cheapestpath->pathkeys)) {
-            /* No sort needed for cheapest path */
-            sort_path.startup_cost = cheapestpath->startup_cost;
-            sort_path.total_cost = cheapestpath->total_cost;
+        if (is_sorted) {
+            /* If the path is already sorted, just find the cheapest */
+            if (presorted_path == NULL ||
+                compare_fractional_path_costs(presorted_path,
+                                              input_path,
+                                              root->tuple_fraction) > 0) {
+                presorted_path = input_path;
+
+                debug1_accept_upper("presorted", root, presorted_path);
+            }
         } else {
-            /* Figure cost for sorting */
-            cost_sort(&sort_path,
-                root->query_pathkeys,
-                cheapestpath->total_cost,
-                RELOPTINFO_LOCAL_FIELD(root, final_rel, rows),
-                final_rel->reltarget->width,
-                0.0,
-                u_sess->opt_cxt.op_work_mem,
-                limit_tuples,
-                root->glob->vectorized);
-        }
+            bool try_incremental_sort = false;
+
+            if (presorted_keys > 0 &&
+                input_path->dop <= 1 &&
+                u_sess->attr.attr_sql.enable_incremental_sort)
+                try_incremental_sort = true;
+
+            /*
+             * Try at least sorting the cheapest path and also try
+             * incrementally sorting any path which is partially sorted
+             * already (no need to deal with paths which have presorted keys
+             * when incremental sort is disabled unless it's the cheapest
+             * input path).
+             */
+            if (input_path != final_cheapest_path && !try_incremental_sort)
+                continue;
+
+            /*
+             * Consider regular sort for any path that's not presorted or if
+             * incremental sort is disabled. We've no need to consider both
+             * sort and incremental sort on the same path. We assume that
+             * incremental sort is always faster when there are presorted
+             * keys.
+             *
+             * This is not redundant with the gather paths created in
+             * generate_gather_paths, because that doesn't generate ordered
+             * output. Here we add an explicit sort to match the useful
+             * ordering.
+             */
+            if (!try_incremental_sort) {
+                cost_sort(&local_dummy_path,
+                    root->query_pathkeys,
+                    final_cheapest_path->total_cost,
+                    RELOPTINFO_LOCAL_FIELD(root, final_rel, rows),
+                    final_rel->reltarget->width,
+                    0.0,
+                    u_sess->opt_cxt.op_work_mem,
+                    root->limit_tuples,
+                    root->glob->vectorized);
+            } else {
+                cost_incremental_sort(&local_dummy_path,
+                    root, root->query_pathkeys,
+                    presorted_keys,
+                    input_path->startup_cost,
+                    input_path->total_cost,
+                    RELOPTINFO_LOCAL_FIELD(root, final_rel, rows),
+                    final_rel->reltarget->width,
+                    0.0,
+                    u_sess->opt_cxt.op_work_mem,
+                    root->limit_tuples);
+            }
-        if (compare_fractional_path_costs(sortedpath, &sort_path, tuple_fraction) > 0) {
-            /* Presorted path is a loser */
-            debug_print_log(root, sortedpath, DEBUG2);
-            sortedpath = NULL;
+            /*
+             * Find the best unsorted or partially sorted input to sort
+             * explicitly; the winner becomes the postsorted path. Note that
+             * we still need to consider the hint value.
+             */
+            if (postsorted_path == NULL ||
+                postsorted_path->hint_value < input_path->hint_value ||
+                compare_fractional_path_costs(&dummy_path,
+                                              &local_dummy_path,
+                                              root->tuple_fraction) > 0)
+            {
+                /* save the new best postsorted path */
+                postsorted_path = input_path;
+                dummy_path.startup_cost = local_dummy_path.startup_cost;
+                dummy_path.total_cost = local_dummy_path.total_cost;
+
+                debug1_accept_upper("sortable", root, postsorted_path);
+            }
         }
     }
-    *cheapest_path = cheapestpath;
-    *sorted_path = sortedpath;
-}
-
-static void debug_print_log(PlannerInfo* root, Path* sortedpath, int debug_log_level)
-{
-    if (log_min_messages > debug_log_level)
-        return;
+    /*
+     * If a presorted path exists, compare it with the newly built postsorted
+     * path. Note that we do not care how either of them compares with the
+     * real cheapest path here, since all we need is an ordered path.
+     */
+    if (presorted_path && postsorted_path) {
+        /* Note that the presorted path cannot be the postsorted path */
+        if (presorted_path->hint_value < postsorted_path->hint_value ||
+            compare_fractional_path_costs(presorted_path,
+                                          &dummy_path,
+                                          root->tuple_fraction) > 0)
+        {
+            /* Presorted path is less beneficial than postsorted path */
+            debug1_reject_upper("presorted", root, presorted_path);
+            presorted_path = NULL;
+        } else {
+            /* Postsorted path is less beneficial than presorted path */
+            debug1_reject_upper("sortable", root, postsorted_path);
+            postsorted_path = NULL;
+        }
+    }
-    ereport(debug_log_level,
-        (errmodule(MOD_OPT),
-            (errmsg("Presorted path is not accepted with cost = %lf .. %lf",
-                sortedpath->startup_cost,
-                sortedpath->total_cost))));
+    /*
+     * If the winning ordered path is simply the cheapest path itself, drop
+     * it: the caller already receives it as *cheapest_path.
+     */
+    if (presorted_path == final_cheapest_path) {
+        debug1_reject_upper("presorted", root, presorted_path);
+        presorted_path = NULL;
+    } else if (postsorted_path == final_cheapest_path) {
+        /*
+         * If post-sorting the path is more beneficial, we need to keep
+         * it unless it is already the cheapest.
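The final presorted-versus-postsorted decision above can be restated compactly: a hinted postsorted candidate wins outright, otherwise the fractional cost at the expected fetch fraction decides. The struct and the numbers below are assumptions for illustration only, not the planner's Path:

    #include <cstdio>

    struct Cand {
        double startup_cost;
        double total_cost;
        int    hint_value;
    };

    static double frac_cost(const Cand& c, double fraction)
    {
        return c.startup_cost + fraction * (c.total_cost - c.startup_cost);
    }

    /* true: keep the already-ordered (presorted) path;
     * false: the input-plus-explicit-sort (postsorted) candidate wins */
    static bool keep_presorted(const Cand& presorted, const Cand& postsorted, double fraction)
    {
        /* a hinted postsorted candidate beats a less-hinted presorted one outright */
        if (presorted.hint_value < postsorted.hint_value) {
            return false;
        }
        /* otherwise compare fractional costs at the expected fetch fraction */
        return frac_cost(presorted, fraction) <= frac_cost(postsorted, fraction);
    }

    int main()
    {
        Cand presorted  = {1.0, 900.0, 0};    /* e.g. ordered index scan */
        Cand postsorted = {300.0, 360.0, 0};  /* cheapest input + (incremental) sort */
        std::printf("keep presorted: %d\n", keep_presorted(presorted, postsorted, 0.01));
        return 0;
    }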
+ */ + debug1_reject_upper("sortable", root, postsorted_path); + postsorted_path = NULL; + } - /* print more details */ - debug1_print_new_path(root, sortedpath, false); + *cheapest_path = final_cheapest_path; + *sorted_path = presorted_path; + *partial_sorted_path = postsorted_path; } diff --git a/src/gausskernel/optimizer/plan/planner.cpp b/src/gausskernel/optimizer/plan/planner.cpp index f2fadf7c5b6a56f8c8462b2c10bf5513cd99e68e..5b6f57f67929d8382fb20b923f079fc2234e49ba 100755 --- a/src/gausskernel/optimizer/plan/planner.cpp +++ b/src/gausskernel/optimizer/plan/planner.cpp @@ -153,14 +153,14 @@ static void process_rowMarks(Query* parse, Plan** resultPlan, PlannerInfo* root, static bool grouping_is_can_hash(Query* parse, AggClauseCosts* agg_costs); static Size compute_hash_entry_size(bool vectorized, Path* cheapest_path, int path_width, AggClauseCosts* agg_costs); static bool choose_hashed_grouping(PlannerInfo* root, double tuple_fraction, double limit_tuples, int path_width, - Path* cheapest_path, Path* sorted_path, const double* dNumGroups, AggClauseCosts* agg_costs, Size* hash_entry_size); -static void compute_distinct_sorted_path_cost(Path* sorted_p, List* sorted_pathkeys, Query* parse, PlannerInfo* root, - int numDistinctCols, Cost sorted_startup_cost, Cost sorted_total_cost, double path_rows, - Distribution* sorted_distribution, int path_width, double dNumDistinctRows, double limit_tuples); + Path* cheapest_path, Path* sorted_path, Path* partial_sorted_path, + const double* dNumGroups, AggClauseCosts* agg_costs, Size* hash_entry_size); +static void compute_distinct_sorted_path_cost(PlannerInfo* root, Path* sorted_path, Path* partial_sorted_path, + Path* sorted_p, int numDistinctCols, double path_rows, int path_width, double dNumDistinctRows, + double limit_tuples); static bool choose_hashed_distinct(PlannerInfo* root, double tuple_fraction, double limit_tuples, double path_rows, - int path_width, Cost cheapest_startup_cost, Cost cheapest_total_cost, Distribution* cheapest_distribution, - Cost sorted_startup_cost, Cost sorted_total_cost, Distribution* sorted_distribution, List* sorted_pathkeys, - double dNumDistinctRows, Size hashentrysize); + int path_width, Path* cheapest_path, Path* sorted_path, Path* partial_sorted_path, double dNumDistinctRows, + Size hashentrysize); static List* make_subplanTargetList(PlannerInfo* root, List* tlist, AttrNumber** groupColIdx, bool* need_tlist_eval, Oid** gruopCollations); static void locate_grouping_columns(PlannerInfo* root, List* tlist, List* sub_tlist, AttrNumber* groupColIdx); @@ -2825,30 +2825,36 @@ static void rebuild_pathkey_for_groupingSet( } } -static inline Path* choose_best_path(bool use_cheapest_path, PlannerInfo* root, Path* cheapest_path, Path* sorted_path) +FORCE_INLINE +Path* choose_best_path(bool prefer_cheapest_path, PlannerInfo* root, + Path* cheapest_path, Path* sorted_path, Path* partial_sorted_path) { Path* best_path; - - if (sorted_path != NULL && cheapest_path->hint_value > sorted_path->hint_value) { - ereport(DEBUG2, (errmodule(MOD_OPT), (errmsg("Use cheapest path for hint.")))); - return cheapest_path; - } else if (sorted_path != NULL && sorted_path->hint_value > cheapest_path->hint_value) { - ereport(DEBUG2, (errmodule(MOD_OPT), (errmsg("Use presorted path for hint.")))); - if (log_min_messages <= DEBUG2) { - debug1_print_new_path(root, sorted_path, false); - } - return sorted_path; + bool prefer_cheapest = prefer_cheapest_path; + + /* choose via hint value */ + if (sorted_path) { + if (cheapest_path->hint_value > 
sorted_path->hint_value) + return cheapest_path; + else if (sorted_path->hint_value > cheapest_path->hint_value) + return sorted_path; + } else if (partial_sorted_path) { + if (cheapest_path->hint_value > partial_sorted_path->hint_value) + return cheapest_path; + else if (partial_sorted_path->hint_value > cheapest_path->hint_value) + return partial_sorted_path; + } else { + /* no sorted/partial sorted paths */ + prefer_cheapest = true; } - if (use_cheapest_path) { + /* final thought */ + if (prefer_cheapest) best_path = cheapest_path; - } else { + else if (partial_sorted_path) + best_path = partial_sorted_path; + else best_path = sorted_path; - ereport(DEBUG2, (errmodule(MOD_OPT), (errmsg("Use presorted path instead of cheapest path.")))); - /* print more details */ - if (log_min_messages <= DEBUG2) - debug1_print_new_path(root, best_path, false); - } return best_path; } @@ -2877,6 +2883,10 @@ static void process_sort(Query* parse, PlannerInfo* root, PlannerTargets* planne * the right order, add an explicit sort step. */ if (parse->sortClause) { + int presorted_keys = 0; + bool is_sorted = false; + bool parallel_need_sort = (*resultPlan)->dop > 1; + if (parse->is_flt_frame && parse->hasTargetSRFs) { tlist = build_plan_tlist(root, plannerTargets->final_target); (*resultPlan)->targetlist = tlist; @@ -2889,10 +2899,28 @@ static void process_sort(Query* parse, PlannerInfo* root, PlannerTargets* planne */ rebuild_pathkey_for_groupingSet(root, tlist, NULL, collectiveGroupExpr); + is_sorted = pathkeys_count_contained_in(root->sort_pathkeys, *currentPathKeys, &presorted_keys); + /* we also need to add sort if the sub node is parallized. */ - if (!pathkeys_contained_in(root->sort_pathkeys, *currentPathKeys) || - ((*resultPlan)->dop > 1 && root->sort_pathkeys)) { - *resultPlan = (Plan*)make_sort_from_pathkeys(root, *resultPlan, root->sort_pathkeys, limitTuples); + if (root->sort_pathkeys && (!is_sorted || parallel_need_sort)) { + /* + * Add a sort operator on top of the existing plan if the cheapest + * path is used. Optionally, we can use incremental sort if the plan + * is presorted (num of presorted keys is greater than zero). + */ + if (!is_sorted && presorted_keys > 0 && (*resultPlan)->dop == 1 && + u_sess->attr.attr_sql.enable_incremental_sort) { + *resultPlan = (Plan*)make_incrementalsort_from_pathkeys(root, + *resultPlan, + root->sort_pathkeys, + presorted_keys, + limitTuples); + } else { + *resultPlan = (Plan*)make_sort_from_pathkeys(root, + *resultPlan, + root->sort_pathkeys, + limitTuples); + } #ifdef STREAMPLAN if (IS_STREAM_PLAN && check_sort_for_upsert(root)) *resultPlan = make_stream_sort(root, *resultPlan); @@ -2979,6 +3007,7 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) Plan* result_plan = NULL; List* current_pathkeys = NIL; double dNumGroups[2] = {1, 1}; /* dNumGroups[0] is local distinct, dNumGroups[1] is global distinct. 
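The precedence implemented by the reworked choose_best_path is easy to lose in the diff, so here it is restated over bare illustrative structs (the struct and names are assumptions): the hint value decides first, then the caller's prefer-cheapest flag, and the partially sorted path is taken ahead of the fully sorted one when both are supplied:

    #include <cstdio>

    struct P {
        int hint_value;
        const char* name;
    };

    static const char* choose(bool prefer_cheapest, const P* cheapest,
                              const P* sorted, const P* partial_sorted)
    {
        const P* ordered = sorted ? sorted : partial_sorted;

        /* hint comparison comes first */
        if (ordered) {
            if (cheapest->hint_value > ordered->hint_value) return cheapest->name;
            if (ordered->hint_value > cheapest->hint_value) return ordered->name;
        } else {
            prefer_cheapest = true;   /* nothing ordered to pick */
        }

        if (prefer_cheapest) return cheapest->name;
        return partial_sorted ? partial_sorted->name : sorted->name;
    }

    int main()
    {
        P cheapest = {0, "cheapest"}, sorted = {0, "sorted"}, partial = {0, "partial"};
        std::printf("%s\n", choose(false, &cheapest, &sorted, nullptr));   /* sorted */
        std::printf("%s\n", choose(false, &cheapest, nullptr, &partial));  /* partial */
        std::printf("%s\n", choose(true, &cheapest, &sorted, nullptr));    /* cheapest */
        return 0;
    }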
*/ + bool prefer_cheapest_path = true; bool use_hashed_distinct = false; bool tested_hashed_distinct = false; bool needs_stream = false; @@ -3105,13 +3134,13 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) } else { /* No set operations, do regular planning */ List* sub_tlist = NIL; - double sub_limit_tuples; AttrNumber* groupColIdx = NULL; Oid* groupCollation = NULL; bool need_try_fdw_plan = false; bool need_tlist_eval = true; Path* cheapest_path = NULL; Path* sorted_path = NULL; + Path* partial_sorted_path = NULL; Path* best_path = NULL; double numGroups[2] = {1, 1}; long localNumGroup = 1; @@ -3126,6 +3155,7 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) List* rollup_lists = NIL; List* rollup_groupclauses = NIL; bool needSecondLevelAgg = true; /* For olap function*/ + bool has_groupby = true; List* superset_key = root->dis_keys.superset_keys; Size hash_entry_size = 0; char PathContextName[NAMEDATALEN] = {0}; @@ -3319,13 +3349,12 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) (parse->groupClause || parse->groupingSets || parse->distinctClause || parse->hasAggs || parse->hasWindowFuncs || root->hasHavingQual || (parse->is_flt_frame && parse->hasTargetSRFs))) - sub_limit_tuples = -1.0; + root->limit_tuples = -1.0; else - sub_limit_tuples = limit_tuples; + root->limit_tuples = limit_tuples; /* Make tuple_fraction, limit_tuples accessible to lower-level routines */ root->tuple_fraction = tuple_fraction; - root->limit_tuples = sub_limit_tuples; /* Set up data needed by standard_qp_callback */ standard_qp_init(root, (void*)&qp_extra, tlist, @@ -3334,9 +3363,12 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) (rollup_groupclauses ? (List*)llast(rollup_groupclauses) : NIL) : parse->groupClause); - /* - * Generate pathlist by query_planner for final_rel and canonicalize - * all the pathkeys. + /* + * Generate the best unsorted and presorted paths for the scan/join + * portion of this Query, ie the processing represented by the + * FROM/WHERE clauses. (Note there may not be any presorted paths.) + * We also generate (in standard_qp_callback) pathkey representations + * of the query's sort clause, distinct clause, etc. */ final_rel = query_planner(root, sub_tlist, standard_qp_callback, &qp_extra); @@ -3392,35 +3424,33 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) planner_targets->scanjoin_targets_contain_srfs); } - /* - * In the following, generate the best unsorted and presorted paths for - * this Query (but note there may not be any presorted path). - */ - bool has_groupby = true; - /* First of all, estimate the number of groups in the query. */ - has_groupby = get_number_of_groups(root, - final_rel, + has_groupby = get_number_of_groups(root, + final_rel, dNumGroups, - rollup_groupclauses, + rollup_groupclauses, rollup_lists); /* Then update the tuple_fraction by the number of groups in the query. */ - update_tuple_fraction(root, - final_rel, + update_tuple_fraction(root, + final_rel, dNumGroups); /* * Finally, generate the best unsorted and presorted paths for * this Query. 
*/ - generate_cheapest_and_sorted_path(root, + generate_cheapest_and_sorted_path(root, final_rel, - &cheapest_path, - &sorted_path, - dNumGroups, + &cheapest_path, + &sorted_path, + &partial_sorted_path, + dNumGroups, has_groupby); + + /* for plugin upper paths */ for_plugin_rel = final_rel; + /* restore superset keys */ root->dis_keys.superset_keys = superset_key; @@ -3462,6 +3492,7 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) path_width, cheapest_path, sorted_path, + partial_sorted_path, dNumGroups, &agg_costs, &hash_entry_size); @@ -3469,7 +3500,8 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) numGroups[0] = dNumGroups[0]; numGroups[1] = dNumGroups[1]; localNumGroup = (long)Min(dNumGroups[0], (double)LONG_MAX); - } else if (parse->distinctClause && sorted_path && !root->hasHavingQual && !parse->hasAggs && + } else if (parse->distinctClause && (sorted_path || partial_sorted_path) && + !root->hasHavingQual && !parse->hasAggs && (wflists == NULL || !wflists->activeWindows)) { Size hashentrysize; @@ -3492,13 +3524,9 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) limit_tuples, path_rows, path_width, - cheapest_path->startup_cost, - cheapest_path->total_cost, - ng_get_dest_distribution(cheapest_path), - sorted_path->startup_cost, - sorted_path->total_cost, - ng_get_dest_distribution(sorted_path), - sorted_path->pathkeys, + cheapest_path, + sorted_path, + partial_sorted_path, dNumGroups[0], hashentrysize); tested_hashed_distinct = true; @@ -3510,9 +3538,15 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) * then use the cheapest-total path. * Otherwise, trust query_planner's decision about which to use. */ - best_path = choose_best_path((use_hashed_grouping || use_hashed_distinct || - sorted_path == NULL || permit_gather(root)), - root, cheapest_path, sorted_path); + prefer_cheapest_path = use_hashed_grouping || + use_hashed_distinct || + permit_gather(root); + + best_path = choose_best_path(prefer_cheapest_path, + root, + cheapest_path, + sorted_path, + partial_sorted_path); (void)MemoryContextSwitchTo(PlanGenerateContext); @@ -3539,6 +3573,8 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) * results. */ bool need_sort_for_grouping = false; + bool is_sorted = false; + int presorted_keys = 0; /* TOD for some cases, we need backup the original subtlist, e.g. 
we will add */ root->origin_tlist = sub_tlist; @@ -3608,11 +3644,15 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) result_plan = create_plan(root, best_path); } } + current_pathkeys = best_path->pathkeys; + is_sorted = pathkeys_count_contained_in(root->group_pathkeys, + current_pathkeys, + &presorted_keys); + /* Detect if we'll need an explicit sort for grouping */ - if (parse->groupClause && !use_hashed_grouping && - !pathkeys_contained_in(root->group_pathkeys, current_pathkeys)) { + if (parse->groupClause && !use_hashed_grouping && !is_sorted) { need_sort_for_grouping = true; /* @@ -3880,7 +3920,7 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) planner_targets->grouping_targets, planner_targets->grouping_targets_contain_srfs); } - } else if (use_hashed_grouping) { + } else if (use_hashed_grouping) { /* Hashed aggregate plan --- no sort needed */ if (IS_STREAM_PLAN && is_execute_on_datanodes(result_plan) && @@ -4147,13 +4187,33 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) result_plan = create_local_redistribute(root, result_plan, result_plan->distributed_keys, 0); } + if (IsA(result_plan, Stream)) { + /* Stream plan is not sorted which destroys pathkeys and presorted keys */ + presorted_keys = 0; + } + if (need_sort_for_grouping && partial_plan == NULL && (IS_STREAM_PLAN || parse->groupingSets == NULL)) { if (root->consider_sortgroup_agg) { result_plan = (Plan*) make_sort_group_from_groupcols(root, parse->groupClause, groupColIdx, result_plan, dNumGroups[0]); } else { - result_plan = - (Plan*)make_sort_from_groupcols(root, parse->groupClause, groupColIdx, result_plan); + /* + * Add a sort operator based on the grouping attributes if the cheapest + * path is used. Optionally, we can use incremental sort if the plan + * is presorted (num of presorted keys is greater than zero). + */ + if (!is_sorted && presorted_keys > 0 && result_plan->dop <= 1 && + u_sess->attr.attr_sql.enable_incremental_sort) { + result_plan = (Plan*)make_incrementalsort_from_groupcols(root, parse->groupClause, + groupColIdx, result_plan, + root->group_pathkeys, + presorted_keys); + } else { + result_plan = (Plan*)make_sort_from_groupcols(root, + parse->groupClause, + groupColIdx, + result_plan); + } } current_pathkeys = root->group_pathkeys; } @@ -4331,9 +4391,29 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) if (!needs_stream && result_plan->dop > 1) result_plan = create_local_redistribute(root, result_plan, result_plan->distributed_keys, 0); + if (IsA(result_plan, Stream)) { + /* Stream plan is not sorted which destroys pathkeys and presorted keys */ + presorted_keys = 0; + } + if (need_sort_for_grouping) { - result_plan = - (Plan*)make_sort_from_groupcols(root, parse->groupClause, groupColIdx, result_plan); + /* + * Add a sort operator on top of the existing plan if the cheapest + * path is used. Optionally, we can use incremental sort if the plan + * is presorted (num of presorted keys is greater than zero). 
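The guard repeated at these call sites before emitting an IncrementalSort instead of a plain Sort boils down to one condition; a minimal sketch, assuming the enable_incremental_sort GUC value is handed in as a plain bool:

    #include <cstdio>

    static bool use_incremental_sort(bool is_sorted, int presorted_keys,
                                     int dop, bool enable_incremental_sort)
    {
        /* already fully ordered: incremental sort is pointless (a plain Sort
         * may still be added elsewhere, e.g. after a parallel redistribution) */
        if (is_sorted) {
            return false;
        }
        /* need at least one presorted key, a serial plan, and the GUC on */
        return presorted_keys > 0 && dop <= 1 && enable_incremental_sort;
    }

    int main()
    {
        std::printf("%d\n", use_incremental_sort(false, 1, 1, true));  /* 1: incremental */
        std::printf("%d\n", use_incremental_sort(false, 0, 1, true));  /* 0: plain Sort */
        std::printf("%d\n", use_incremental_sort(false, 1, 4, true));  /* 0: parallel, plain Sort */
        return 0;
    }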
+ */ + if (!is_sorted && presorted_keys > 0 && result_plan->dop == 1 && + u_sess->attr.attr_sql.enable_incremental_sort) { + result_plan = (Plan*)make_incrementalsort_from_groupcols(root, parse->groupClause, + groupColIdx, result_plan, + root->group_pathkeys, + presorted_keys); + } else { + result_plan = (Plan*)make_sort_from_groupcols(root, + parse->groupClause, + groupColIdx, + result_plan); + } current_pathkeys = root->group_pathkeys; } @@ -4535,14 +4615,26 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) */ if (window_pathkeys != NIL) { Sort* sort_plan = NULL; + int presorted_keys = 0; + bool is_sorted = false; - /* - * If the window func has 'partitin by', - * then we can parallelize it. - */ - sort_plan = - make_sort_from_pathkeys(root, result_plan, window_pathkeys, -1.0, (wc->partitionClause != NIL)); - if (!pathkeys_contained_in(window_pathkeys, current_pathkeys)) { + is_sorted = pathkeys_count_contained_in(window_pathkeys, current_pathkeys, &presorted_keys); + + if (!is_sorted && presorted_keys > 0 && result_plan->dop == 1 && + u_sess->attr.attr_sql.enable_incremental_sort) { + sort_plan = (Sort*)make_incrementalsort_from_pathkeys(root, result_plan, window_pathkeys, + presorted_keys, -1.0); + } else { + /* + * If the window func has 'partitin by', + * then we can parallelize it. + */ + sort_plan = make_sort_from_pathkeys(root, result_plan, + window_pathkeys, -1.0, + (wc->partitionClause != NIL)); + } + + if (!is_sorted) { /* we do indeed need to sort */ result_plan = (Plan*)sort_plan; current_pathkeys = window_pathkeys; @@ -4648,6 +4740,8 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) /* Choose implementation method if we didn't already */ if (!tested_hashed_distinct) { Size hashentrysize; + Path dummy_path; + errno_t rc = EOK; /* * Don't do it if it doesn't look like the hashtable will fit into @@ -4658,6 +4752,15 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) else hashentrysize = get_hash_entry_size(result_plan->plan_width); + /* Setup dummy paths for choose_hashed_distinct */ + rc = memset_s(&dummy_path, sizeof(dummy_path), 0, sizeof(dummy_path)); + securec_check(rc, "\0", "\0"); + + dummy_path.startup_cost = result_plan->startup_cost; + dummy_path.total_cost = result_plan->total_cost; + dummy_path.pathkeys = current_pathkeys; + ng_copy_distribution(&dummy_path.distribution, ng_get_dest_distribution(result_plan)); + /* * At this point, either hashed or sorted grouping will have to * work from result_plan, so we pass that as both "cheapest" and @@ -4668,13 +4771,9 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) limit_tuples, PLAN_LOCAL_ROWS(result_plan), result_plan->plan_width, - result_plan->startup_cost, - result_plan->total_cost, - ng_get_dest_distribution(result_plan), - result_plan->startup_cost, - result_plan->total_cost, - ng_get_dest_distribution(result_plan), - current_pathkeys, + &dummy_path, + &dummy_path, + &dummy_path, dNumDistinctRows[0], hashentrysize); } @@ -4754,7 +4853,6 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) } else /* Hashed aggregation produces randomly-ordered results */ current_pathkeys = NIL; } else { - /* * Set group_set and again build pathkeys, data's value can be altered groupingSet after, * so equal expr can not be deleted from pathkeys. 
Rebuild pathkey EquivalenceClass's ec_group_set @@ -4763,20 +4861,21 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) rebuild_pathkey_for_groupingSet(root, tlist, NULL, collectiveGroupExpr); rebuild_pathkey_for_groupingSet(root, tlist, NULL, collectiveGroupExpr); - if (likely(parse->hasDistinctOn || - pathkeys_contained_in(root->sort_pathkeys, root->distinct_pathkeys))) { + if (likely(parse->hasDistinctOn || pathkeys_contained_in(root->sort_pathkeys, root->distinct_pathkeys))) { /* - * Use a Unique node to implement DISTINCT. Add an explicit sort - * if we couldn't make the path come out the way the Unique node - * needs it. If we do have to sort, always sort by the more - * rigorous of DISTINCT and ORDER BY, to avoid a second sort - * below. However, for regular DISTINCT, don't sort now if we - * don't have to --- sorting afterwards will likely be cheaper, - * and also has the possibility of optimizing via LIMIT. But for - * DISTINCT ON, we *must* force the final sort now, else it won't - * have the desired behavior. - */ + * Use a Unique node to implement DISTINCT. Add an explicit sort + * if we couldn't make the path come out the way the Unique node + * needs it. If we do have to sort, always sort by the more + * rigorous of DISTINCT and ORDER BY, to avoid a second sort + * below. However, for regular DISTINCT, don't sort now if we + * don't have to --- sorting afterwards will likely be cheaper, + * and also has the possibility of optimizing via LIMIT. But for + * DISTINCT ON, we *must* force the final sort now, else it won't + * have the desired behavior. + */ List* needed_pathkeys = NIL; + int presorted_keys = 0; + bool is_sorted = false; if (parse->hasDistinctOn && list_length(root->distinct_pathkeys) < list_length(root->sort_pathkeys)) { needed_pathkeys = root->sort_pathkeys; @@ -4784,9 +4883,10 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) needed_pathkeys = root->distinct_pathkeys; } + is_sorted = pathkeys_count_contained_in(needed_pathkeys, current_pathkeys, &presorted_keys); + /* we also need to add sort if the sub node is parallized. 
*/ - if (!pathkeys_contained_in(needed_pathkeys, current_pathkeys) || - (result_plan->dop > 1 && needed_pathkeys)) { + if (!is_sorted || (result_plan->dop > 1 && needed_pathkeys)) { if (list_length(root->distinct_pathkeys) >= list_length(root->sort_pathkeys)) { current_pathkeys = root->distinct_pathkeys; } else { @@ -4797,23 +4897,36 @@ static Plan* internal_grouping_planner(PlannerInfo* root, double tuple_fraction) "the parser does not mess up when adding sort for pathkeys."); } - result_plan = (Plan*)make_sort_from_pathkeys(root, result_plan, current_pathkeys, -1.0); + if (!is_sorted && presorted_keys > 0 && result_plan->dop == 1 && + u_sess->attr.attr_sql.enable_incremental_sort) { + result_plan = (Plan*)make_incrementalsort_from_pathkeys(root, result_plan, current_pathkeys, + presorted_keys, -1.0); + } else { + result_plan = (Plan*)make_sort_from_pathkeys(root, result_plan, current_pathkeys, -1.0); + } } result_plan = (Plan*)make_unique(result_plan, parse->distinctClause); } else { - if (!pathkeys_contained_in(root->distinct_pathkeys, current_pathkeys) || - (result_plan->dop > 1 && root->distinct_pathkeys)) { - result_plan = (Plan*)make_sort_from_pathkeys(root, result_plan, root->distinct_pathkeys, -1.0); + int presorted_keys = 0; + bool is_sorted = false; + + is_sorted = pathkeys_count_contained_in(root->distinct_pathkeys, current_pathkeys, &presorted_keys); + + if (!is_sorted || (result_plan->dop > 1 && root->distinct_pathkeys)) { current_pathkeys = root->distinct_pathkeys; - } - result_plan = (Plan*)make_unique(result_plan, parse->distinctClause); - if (!pathkeys_contained_in(root->sort_pathkeys, current_pathkeys) || - (result_plan->dop > 1 && root->sort_pathkeys)) { - result_plan = (Plan*)make_sort_from_pathkeys(root, result_plan, root->sort_pathkeys, -1.0); - current_pathkeys = root->sort_pathkeys; + if (!is_sorted && presorted_keys > 0 && result_plan->dop == 1 && + u_sess->attr.attr_sql.enable_incremental_sort) { + result_plan = (Plan*)make_incrementalsort_from_pathkeys(root, result_plan, current_pathkeys, + presorted_keys, -1.0); + } else { + result_plan = (Plan*)make_sort_from_pathkeys(root, result_plan, current_pathkeys, -1.0); + } + + current_pathkeys = root->distinct_pathkeys; } + result_plan = (Plan*)make_unique(result_plan, parse->distinctClause); } set_plan_rows( @@ -6626,13 +6739,14 @@ static void compute_hashed_path_cost(PlannerInfo* root, double limit_tuples, int * @in hashentrysize: hash entry size include space for per tuple width, space for pass-by-ref transition values, * the per-hash-entry overhead * @in numGroupCols: how many cols in group by clause + * @in presorted_keys: number of presorted keys in partial sorted path * @in/out sorted_p: result sort path with total cost * * Returns: void */ static void get_optimal_sorted_path(PlannerInfo* root, Path* sorted_p, int path_width, AggClauseCosts* agg_costs, int numGroupCols, const double* dNumGroups, Size hashentrysize, double limit_tuples, bool needs_stream, - bool need_sort_for_grouping) + bool need_sort_for_grouping, int presorted_keys) { Query* parse = root->parse; Node* distinct_node = NULL; @@ -6795,15 +6909,30 @@ static void get_optimal_sorted_path(PlannerInfo* root, Path* sorted_p, int path_ /* compute groupagg path. 
*/ if (need_sort_for_grouping) { - cost_sort(sorted_p, - root->group_pathkeys, - top_level_path->total_cost, - PATH_LOCAL_ROWS(top_level_path), - path_width, - 0.0, - u_sess->opt_cxt.op_work_mem, - -1.0, - root->glob->vectorized); + if (presorted_keys > 0 && !has_stream && !has_local_stream && + u_sess->attr.attr_sql.enable_incremental_sort) { + cost_incremental_sort(sorted_p, + root, + root->group_pathkeys, + presorted_keys, + top_level_path->startup_cost, + top_level_path->total_cost, + PATH_LOCAL_ROWS(top_level_path), + path_width, + 0.0, + u_sess->opt_cxt.op_work_mem, + -1.0); + } else { + cost_sort(sorted_p, + root->group_pathkeys, + top_level_path->total_cost, + PATH_LOCAL_ROWS(top_level_path), + path_width, + 0.0, + u_sess->opt_cxt.op_work_mem, + -1.0, + root->glob->vectorized); + } copy_path_costsize(top_level_path, sorted_p); } @@ -6882,7 +7011,8 @@ static void get_optimal_sorted_path(PlannerInfo* root, Path* sorted_p, int path_ * @in path_rows: the parent's rows of cheapest path * @in path_width: the parent's width of cheapest path * @in cheapest_path: the cheapest path - * @in sorted_path: the initial sort path + * @in sorted_path: the sorted path + * @in partial_sorted_path: the partially sorted path * @in dNumGroups: the distinct for group by clause * @in agg_costs: the execution costs of the aggregates' input expressions * @in hashentrysize: hash entry size include space for per tuple width, space for pass-by-ref transition values, @@ -6893,40 +7023,44 @@ static void get_optimal_sorted_path(PlannerInfo* root, Path* sorted_p, int path_ * Returns: void */ static void compute_sorted_path_cost(PlannerInfo* root, double limit_tuples, int path_width, Path* cheapest_path, - Path* sorted_path, const double* dNumGroups, AggClauseCosts* agg_costs, Size hashentrysize, List* target_pathkeys, - Path* sorted_p) + Path* sorted_path, Path* partial_sorted_path, const double* dNumGroups, AggClauseCosts* agg_costs, + Size hashentrysize, List* target_pathkeys, Path* sorted_p) { Query* parse = root->parse; + Path* choosed_path = NULL; int numGroupCols = list_length(parse->groupClause); List* current_pathkeys = NIL; StreamPath stream_p; bool needs_stream = false; bool need_sort_for_grouping = false; + bool is_replicate = false; + int presorted_keys = 0; errno_t rc = EOK; rc = memset_s(&stream_p, sizeof(stream_p), 0, sizeof(stream_p)); securec_check(rc, "\0", "\0"); - /* use sorted path if it exists, other wise we use cheapest path. */ + /* use sorted or partial sorted path if it exists, other wise we use cheapest path. 
*/ if (sorted_path != NULL) { - copy_path_costsize(sorted_p, sorted_path); - sorted_p->distribute_keys = sorted_path->distribute_keys; - current_pathkeys = sorted_path->pathkeys; - Distribution* distribution = ng_get_dest_distribution(sorted_path); - ng_copy_distribution(&sorted_p->distribution, distribution); - sorted_p->dop = sorted_path->dop; - sorted_p->locator_type = sorted_path->locator_type; + choosed_path = sorted_path; + } else if (partial_sorted_path != NULL) { + choosed_path = partial_sorted_path; } else { - copy_path_costsize(sorted_p, cheapest_path); - sorted_p->distribute_keys = cheapest_path->distribute_keys; - current_pathkeys = cheapest_path->pathkeys; - Distribution* distribution = ng_get_dest_distribution(cheapest_path); - ng_copy_distribution(&sorted_p->distribution, distribution); - sorted_p->dop = cheapest_path->dop; - sorted_p->locator_type = cheapest_path->locator_type; + choosed_path = cheapest_path; } - if (!pathkeys_contained_in(root->group_pathkeys, current_pathkeys)) { + copy_path_costsize(sorted_p, choosed_path); + sorted_p->distribute_keys = choosed_path->distribute_keys; + current_pathkeys = choosed_path->pathkeys; + ng_copy_distribution(&sorted_p->distribution, ng_get_dest_distribution(choosed_path)); + sorted_p->dop = choosed_path->dop; + sorted_p->locator_type = choosed_path->locator_type; + + if (!IS_STREAM_PLAN || sorted_p->dop <= 1 || sorted_p->locator_type == LOCATOR_TYPE_REPLICATED) { + is_replicate = true; + } + + if (!pathkeys_count_contained_in(root->group_pathkeys, current_pathkeys, &presorted_keys)) { current_pathkeys = root->group_pathkeys; need_sort_for_grouping = true; @@ -6939,21 +7073,32 @@ static void compute_sorted_path_cost(PlannerInfo* root, double limit_tuples, int root->consider_sortgroup_agg = false; } - bool is_replicate = (!IS_STREAM_PLAN || - sorted_p->dop <= 1 || - sorted_p->locator_type == LOCATOR_TYPE_REPLICATED); - if (is_replicate || !parse->hasAggs) { if (need_sort_for_grouping) { - cost_sort(sorted_p, - root->group_pathkeys, - sorted_p->total_cost, - PATH_LOCAL_ROWS(sorted_p), - path_width, - 0.0, - u_sess->opt_cxt.op_work_mem, - -1.0, - root->glob->vectorized); + if (presorted_keys > 0 && choosed_path->dop <= 1 && + u_sess->attr.attr_sql.enable_incremental_sort) { + cost_incremental_sort(sorted_p, + root, + root->group_pathkeys, + presorted_keys, + sorted_p->startup_cost, + sorted_p->total_cost, + PATH_LOCAL_ROWS(sorted_p), + path_width, + 0.0, + u_sess->opt_cxt.op_work_mem, + -1.0); + } else { + cost_sort(sorted_p, + root->group_pathkeys, + sorted_p->total_cost, + PATH_LOCAL_ROWS(sorted_p), + path_width, + 0.0, + u_sess->opt_cxt.op_work_mem, + -1.0, + root->glob->vectorized); + } } } @@ -6984,7 +7129,8 @@ static void compute_sorted_path_cost(PlannerInfo* root, double limit_tuples, int hashentrysize, limit_tuples, needs_stream, - need_sort_for_grouping); + need_sort_for_grouping, + presorted_keys); } else { cost_group(sorted_p, root, @@ -7029,16 +7175,32 @@ static void compute_sorted_path_cost(PlannerInfo* root, double limit_tuples, int } /* The Agg or Group node will preserve ordering */ - if (target_pathkeys && !pathkeys_contained_in(target_pathkeys, current_pathkeys)) - cost_sort(sorted_p, - target_pathkeys, - sorted_p->total_cost, - dNumGroups[0], - path_width, - 0.0, - u_sess->opt_cxt.op_work_mem, - limit_tuples, - root->glob->vectorized); + if (target_pathkeys && !pathkeys_count_contained_in(target_pathkeys, current_pathkeys, &presorted_keys)) { + if (presorted_keys > 0 && choosed_path->dop <= 1 && + 
u_sess->attr.attr_sql.enable_incremental_sort) { + cost_incremental_sort(sorted_p, + root, + target_pathkeys, + presorted_keys, + sorted_p->startup_cost, + sorted_p->total_cost, + dNumGroups[0], + path_width, + 0.0, + u_sess->opt_cxt.op_work_mem, + limit_tuples); + } else { + cost_sort(sorted_p, + target_pathkeys, + sorted_p->total_cost, + dNumGroups[0], + path_width, + 0.0, + u_sess->opt_cxt.op_work_mem, + limit_tuples, + root->glob->vectorized); + } + } ereport(DEBUG1, (errmodule(MOD_OPT_AGG), (errmsg("[final sorted path total cost]: %lf", sorted_p->total_cost)))); } @@ -7138,7 +7300,8 @@ static Size compute_hash_entry_size(bool vectorized, Path* cheapest_path, int pa * Returns TRUE to select hashing, FALSE to select sorting. */ static bool choose_hashed_grouping(PlannerInfo* root, double tuple_fraction, double limit_tuples, int path_width, - Path* cheapest_path, Path* sorted_path, const double* dNumGroups, AggClauseCosts* agg_costs, Size* hash_entry_size) + Path* cheapest_path, Path* sorted_path, Path* partial_sorted_path, const double* dNumGroups, + AggClauseCosts* agg_costs, Size* hash_entry_size) { Query* parse = root->parse; bool can_hash = false; @@ -7255,6 +7418,7 @@ static bool choose_hashed_grouping(PlannerInfo* root, double tuple_fraction, dou path_width, cheapest_path, sorted_path, + partial_sorted_path, dNumGroups, agg_costs, hashentrysize, @@ -7292,50 +7456,99 @@ static bool choose_hashed_grouping(PlannerInfo* root, double tuple_fraction, dou return false; } -static void compute_distinct_sorted_path_cost(Path* sorted_p, List* sorted_pathkeys, Query* parse, PlannerInfo* root, - int numDistinctCols, Cost sorted_startup_cost, Cost sorted_total_cost, double path_rows, - Distribution* sorted_distribution, int path_width, double dNumDistinctRows, double limit_tuples) +static void compute_distinct_sorted_path_cost(PlannerInfo* root, Path* sorted_path, Path* partial_sorted_path, + Path* sorted_p, int numDistinctCols, double path_rows, int path_width, double dNumDistinctRows, + double limit_tuples) { + Query* parse = root->parse; + Path* choosed_path; List* current_pathkeys = NIL; List* needed_pathkeys = NIL; + bool is_sorted = false; + int presorted_keys = 0; + + if (sorted_path) + choosed_path = sorted_path; + else + choosed_path = partial_sorted_path; + + sorted_p->startup_cost = choosed_path->startup_cost; + sorted_p->total_cost = choosed_path->total_cost; + ng_copy_distribution(&sorted_p->distribution, ng_get_dest_distribution(choosed_path)); + current_pathkeys = choosed_path->pathkeys; - sorted_p->startup_cost = sorted_startup_cost; - sorted_p->total_cost = sorted_total_cost; - ng_copy_distribution(&sorted_p->distribution, sorted_distribution); - current_pathkeys = sorted_pathkeys; if (parse->hasDistinctOn && list_length(root->distinct_pathkeys) < list_length(root->sort_pathkeys)) { needed_pathkeys = root->sort_pathkeys; } else { needed_pathkeys = root->distinct_pathkeys; } - if (!pathkeys_contained_in(needed_pathkeys, current_pathkeys)) { + + is_sorted = pathkeys_count_contained_in(needed_pathkeys, current_pathkeys, &presorted_keys); + + if (!is_sorted) { if (list_length(root->distinct_pathkeys) >= list_length(root->sort_pathkeys)) { current_pathkeys = root->distinct_pathkeys; } else { current_pathkeys = root->sort_pathkeys; } - cost_sort(sorted_p, - current_pathkeys, - sorted_p->total_cost, - path_rows, - path_width, - 0.0, - u_sess->opt_cxt.op_work_mem, - -1.0, - root->glob->vectorized); + + if (presorted_keys > 0 && u_sess->attr.attr_sql.enable_incremental_sort) { + 
cost_incremental_sort(sorted_p, + root, + current_pathkeys, + presorted_keys, + sorted_p->startup_cost, + sorted_p->total_cost, + path_rows, + path_width, + 0.0, + u_sess->opt_cxt.op_work_mem, + -1.0); + } else { + cost_sort(sorted_p, + current_pathkeys, + sorted_p->total_cost, + path_rows, + path_width, + 0.0, + u_sess->opt_cxt.op_work_mem, + -1.0, + root->glob->vectorized); + } } - cost_group( - sorted_p, root, numDistinctCols, dNumDistinctRows, sorted_p->startup_cost, sorted_p->total_cost, path_rows); - if (parse->sortClause && !pathkeys_contained_in(root->sort_pathkeys, current_pathkeys)) { - cost_sort(sorted_p, - root->sort_pathkeys, - sorted_p->total_cost, - dNumDistinctRows, - path_width, - 0.0, - u_sess->opt_cxt.op_work_mem, - limit_tuples, - root->glob->vectorized); + + cost_group(sorted_p, + root, + numDistinctCols, + dNumDistinctRows, + sorted_p->startup_cost, + sorted_p->total_cost, + path_rows); + + if (parse->sortClause && !pathkeys_count_contained_in(root->sort_pathkeys, current_pathkeys, &presorted_keys)) { + if (presorted_keys > 0 && u_sess->attr.attr_sql.enable_incremental_sort) { + cost_incremental_sort(sorted_p, + root, + current_pathkeys, + presorted_keys, + sorted_p->startup_cost, + sorted_p->total_cost, + dNumDistinctRows, + path_width, + 0.0, + u_sess->opt_cxt.op_work_mem, + -1.0); + } else { + cost_sort(sorted_p, + root->sort_pathkeys, + sorted_p->total_cost, + dNumDistinctRows, + path_width, + 0.0, + u_sess->opt_cxt.op_work_mem, + limit_tuples, + root->glob->vectorized); + } } } @@ -7359,9 +7572,8 @@ static void compute_distinct_sorted_path_cost(Path* sorted_p, List* sorted_pathk * Returns TRUE to select hashing, FALSE to select sorting. */ static bool choose_hashed_distinct(PlannerInfo* root, double tuple_fraction, double limit_tuples, double path_rows, - int path_width, Cost cheapest_startup_cost, Cost cheapest_total_cost, Distribution* cheapest_distribution, - Cost sorted_startup_cost, Cost sorted_total_cost, Distribution* sorted_distribution, List* sorted_pathkeys, - double dNumDistinctRows, Size hashentrysize) + int path_width, Path* cheapest_path, Path* sorted_path, Path* partial_sorted_path, double dNumDistinctRows, + Size hashentrysize) { Query* parse = root->parse; int numDistinctCols = list_length(parse->distinctClause); @@ -7381,9 +7593,9 @@ static bool choose_hashed_distinct(PlannerInfo* root, double tuple_fraction, dou * enforces the expected behavior of DISTINCT ON. */ can_sort = grouping_is_sortable(parse->distinctClause); - if (can_sort && parse->hasDistinctOn) + if (can_sort && parse->hasDistinctOn) { return false; - + } can_hash = grouping_is_hashable(parse->distinctClause); /* Quick out if only one choice is workable */ @@ -7428,15 +7640,15 @@ static bool choose_hashed_distinct(PlannerInfo* root, double tuple_fraction, dou * These path variables are dummies that just hold cost fields; we don't * make actual Paths for these steps. */ - ng_copy_distribution(&hashed_p.distribution, cheapest_distribution); + ng_copy_distribution(&hashed_p.distribution, ng_get_dest_distribution(cheapest_path)); cost_agg(&hashed_p, root, AGG_HASHED, NULL, numDistinctCols, dNumDistinctRows, - cheapest_startup_cost, - cheapest_total_cost, + cheapest_path->startup_cost, + cheapest_path->total_cost, path_rows, path_width, hashentrysize); @@ -7460,15 +7672,12 @@ static bool choose_hashed_distinct(PlannerInfo* root, double tuple_fraction, dou * Now for the GROUP case. See comments in grouping_planner about the * sorting choices here --- this code should match that code. 
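As a rough intuition for why the costing branches above can prefer the incremental variant (a back-of-envelope model only, not the formula cost_incremental_sort implements): with G equal-prefix groups the work is roughly G sorts of N/G rows instead of one sort of N rows, and the first group can be returned before the rest is sorted.

    #include <cmath>
    #include <cstdio>

    /* Back-of-envelope comparison counts; not the planner's cost formula. */
    static double FullSortCompares(double rows)
    {
        return rows * std::log2(rows < 2.0 ? 2.0 : rows);
    }

    static double IncrementalSortCompares(double rows, double numGroups)
    {
        double perGroup = rows / numGroups;
        return numGroups * perGroup * std::log2(perGroup < 2.0 ? 2.0 : perGroup);
    }

    int main()
    {
        double rows = 1000000.0;
        double groups = 1000.0; /* assumed number of distinct prefix-key groups */
        std::printf("full sort: %.0f compares, incremental: %.0f compares\n",
                    FullSortCompares(rows), IncrementalSortCompares(rows, groups));
        return 0;
    }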
*/ - compute_distinct_sorted_path_cost(&sorted_p, - sorted_pathkeys, - parse, - root, + compute_distinct_sorted_path_cost(root, + sorted_path, + partial_sorted_path, + &sorted_p, numDistinctCols, - sorted_startup_cost, - sorted_total_cost, path_rows, - sorted_distribution, path_width, dNumDistinctRows, limit_tuples); @@ -10083,6 +10292,7 @@ static bool vector_engine_walker_internal(Plan* result_plan, bool check_rescan, case T_Material: case T_Hash: case T_Sort: + case T_IncrementalSort: if (vector_engine_walker_internal(result_plan->lefttree, check_rescan, planContext)) return true; break; @@ -10329,6 +10539,7 @@ static Plan* fallback_plan(Plan* result_plan) case T_ProjectSet: case T_Sort: case T_SortGroup: + case T_IncrementalSort: case T_Stream: case T_Material: case T_StartWithOp: @@ -10516,6 +10727,13 @@ Plan* vectorize_plan(Plan* result_plan, bool ignore_remotequery, bool forceVecto return make_rowtove_plan(result_plan); } break; + case T_IncrementalSort: { + result_plan->lefttree = vectorize_plan(result_plan->lefttree, ignore_remotequery, forceVectorEngine); + if (result_plan->lefttree && IsVecOutput(result_plan->lefttree)) { + result_plan->lefttree = (Plan*)make_vectorow(result_plan->lefttree); + } + return result_plan; + } break; case T_SortGroup: { result_plan->lefttree = vectorize_plan(result_plan->lefttree, ignore_remotequery, forceVectorEngine); @@ -13042,7 +13260,7 @@ static void copy_path_costsize(Path* dest, Path* src) dest->total_cost = src->total_cost; dest->locator_type = src->locator_type; } -#endif +#endif /* STREAMPLAN */ static ExecNodes* initExecNodes() { diff --git a/src/gausskernel/optimizer/plan/planstartwith.cpp b/src/gausskernel/optimizer/plan/planstartwith.cpp index f350a7473f13dec79cf765ab6c6a6c5e214a14a1..ff1da78071e5d1d0cf4ca76827e62ef552ed0979 100644 --- a/src/gausskernel/optimizer/plan/planstartwith.cpp +++ b/src/gausskernel/optimizer/plan/planstartwith.cpp @@ -1970,6 +1970,7 @@ static void BindPlanNodePseudoEntries(PlannerInfo *root, Plan *plan, case T_NestLoop: case T_MergeJoin: case T_Sort: + case T_IncrementalSort: case T_Agg: case T_BaseResult: case T_Material: diff --git a/src/gausskernel/optimizer/plan/setrefs.cpp b/src/gausskernel/optimizer/plan/setrefs.cpp index 79fc52214f1b8c1d7a99d437a06ec58843afc9a3..1ef7d9c8ad73cc98e7ba9f0cfbf7eeb9cafe8300 100644 --- a/src/gausskernel/optimizer/plan/setrefs.cpp +++ b/src/gausskernel/optimizer/plan/setrefs.cpp @@ -581,6 +581,7 @@ static Plan* set_plan_refs(PlannerInfo* root, Plan* plan, int rtoffset) case T_Material: case T_VecMaterial: case T_Sort: + case T_IncrementalSort: case T_SortGroup: case T_VecSort: case T_Unique: diff --git a/src/gausskernel/optimizer/plan/subselect.cpp b/src/gausskernel/optimizer/plan/subselect.cpp index 78a6d7cf38e7b8443dd6f7b97d1083f47d2adbe2..a184abf383eb9af5bc28173b63d5de5af5c442da 100644 --- a/src/gausskernel/optimizer/plan/subselect.cpp +++ b/src/gausskernel/optimizer/plan/subselect.cpp @@ -3342,6 +3342,7 @@ static Bitmapset* finalize_plan(PlannerInfo* root, Plan* plan, Bitmapset* valid_ case T_Hash: case T_Material: case T_Sort: + case T_IncrementalSort: case T_SortGroup: case T_Unique: case T_SetOp: diff --git a/src/gausskernel/optimizer/util/optcommon.cpp b/src/gausskernel/optimizer/util/optcommon.cpp index a81a937bd8dabcfc6070e0f59c23507722e2817c..73cf16875f5edf36c92d609c20c49e4a078ba2e3 100755 --- a/src/gausskernel/optimizer/util/optcommon.cpp +++ b/src/gausskernel/optimizer/util/optcommon.cpp @@ -404,6 +404,9 @@ void GetPlanNodePlainText( case T_VecSort: *pname = *sname = 
*pt_operation = "Vector Sort"; break; + case T_IncrementalSort: + *pname = *sname = *pt_operation = "Incremental Sort"; + break; case T_Group: *pname = *sname = *pt_operation = "Group"; break; diff --git a/src/gausskernel/optimizer/util/planmem_walker.cpp b/src/gausskernel/optimizer/util/planmem_walker.cpp index e84eee481369930c865453295710ebed8b65065b..cde371eb0159102d056295b81fe5aee87a147ef1 100644 --- a/src/gausskernel/optimizer/util/planmem_walker.cpp +++ b/src/gausskernel/optimizer/util/planmem_walker.cpp @@ -444,6 +444,7 @@ bool plan_tree_walker(Node* node, MethodWalker walker, void* context) case T_VecSort: case T_Sort: + case T_IncrementalSort: case T_SortGroup: if (walk_plan_node_fields((Plan*)node, walker, context)) return true; diff --git a/src/gausskernel/runtime/executor/Makefile b/src/gausskernel/runtime/executor/Makefile index 23d369995ebd7312fe0943d866e670e08b4fc3be..12abf70d31c42bcc1c5ddab0d7f552802214d1f8 100644 --- a/src/gausskernel/runtime/executor/Makefile +++ b/src/gausskernel/runtime/executor/Makefile @@ -39,7 +39,7 @@ OBJS = execAmi.o execCurrent.o execGrouping.o execJunk.o execMain.o \ execUtils.o functions.o instrument.o nodeAppend.o nodeAgg.o \ nodeBitmapAnd.o nodeBitmapOr.o \ nodeBitmapHeapscan.o nodeBitmapIndexscan.o nodeHash.o \ - nodeHashjoin.o nodeIndexscan.o nodeIndexonlyscan.o nodeAnnIndexscan.o\ + nodeHashjoin.o nodeIncrementalSort.o nodeIndexscan.o nodeIndexonlyscan.o nodeAnnIndexscan.o\ nodeLimit.o nodeLockRows.o \ nodeMaterial.o nodeMergeAppend.o nodeMergejoin.o nodeModifyTable.o \ nodeNestloop.o nodeFunctionscan.o nodeRecursiveunion.o nodeResult.o \ diff --git a/src/gausskernel/runtime/executor/execAmi.cpp b/src/gausskernel/runtime/executor/execAmi.cpp index 17826af40b06a0cf7707f1105a4211b41319f6a5..33809982bad97a7f679bfefd74f77dc64b5954fc 100755 --- a/src/gausskernel/runtime/executor/execAmi.cpp +++ b/src/gausskernel/runtime/executor/execAmi.cpp @@ -32,6 +32,7 @@ #include "executor/node/nodeGroup.h" #include "executor/node/nodeHash.h" #include "executor/node/nodeHashjoin.h" +#include "executor/node/nodeIncrementalSort.h" #include "executor/node/nodeIndexonlyscan.h" #include "executor/node/nodeIndexscan.h" #include "executor/node/nodeLimit.h" @@ -258,6 +259,10 @@ void ExecReScanByType(PlanState* node) ExecReScanSortGroup((SortGroupState*)node); break; + case T_IncrementalSortState: + ExecReScanIncrementalSort((IncrementalSortState*) node); + break; + case T_GroupState: ExecReScanGroup((GroupState*)node); break; @@ -644,6 +649,13 @@ bool ExecSupportsBackwardScan(Plan* node) /* these don't evaluate tlist */ return true; + case T_IncrementalSort: + /* + * Unlike full sort, incremental sort keeps only a single group of + * tuples in memory, so it can't scan backwards. 
+ */ + return false; + case T_LockRows: case T_Limit: /* these don't evaluate tlist */ return true; diff --git a/src/gausskernel/runtime/executor/execProcnode.cpp b/src/gausskernel/runtime/executor/execProcnode.cpp index 1ae41a8540e9b680be6721624bfa3a7984926790..fea1cf9da4513e1501fb48c6b987ae15bf4ad9e7 100755 --- a/src/gausskernel/runtime/executor/execProcnode.cpp +++ b/src/gausskernel/runtime/executor/execProcnode.cpp @@ -89,6 +89,7 @@ #include "executor/node/nodeGroup.h" #include "executor/node/nodeHash.h" #include "executor/node/nodeHashjoin.h" +#include "executor/node/nodeIncrementalSort.h" #include "executor/node/nodeIndexonlyscan.h" #include "executor/node/nodeIndexscan.h" #include "executor/node/nodeLimit.h" @@ -368,6 +369,8 @@ PlanState* ExecInitNodeByType(Plan* node, EState* estate, int eflags) return (PlanState*)ExecInitMaterial((Material*)node, estate, eflags); case T_Sort: return (PlanState*)ExecInitSort((Sort*)node, estate, eflags); + case T_IncrementalSort: + return (PlanState*)ExecInitIncrementalSort((IncrementalSort*)node, estate, eflags); case T_SortGroup: return (PlanState*)ExecInitSortGroup((SortGroup*)node, estate, eflags); case T_Group: @@ -1251,6 +1254,10 @@ static void ExecEndNodeByType(PlanState* node) case T_SortState: ExecEndSort((SortState*)node); break; + + case T_IncrementalSortState: + ExecEndIncrementalSort((IncrementalSortState*)node); + break; case T_SortGroupState: ExecEndSortGroup((SortGroupState*)node); diff --git a/src/gausskernel/runtime/executor/instrument.cpp b/src/gausskernel/runtime/executor/instrument.cpp index a04e788bcfc71336dc2968722ede1af825d2012c..9d6b575e2a4934ccd4de762f577b03cecf580444 100644 --- a/src/gausskernel/runtime/executor/instrument.cpp +++ b/src/gausskernel/runtime/executor/instrument.cpp @@ -107,14 +107,6 @@ static inline uint64 rdtsc(void) } #endif -sortMessage sortmessage[] = { - {HEAPSORT, "top-N heapsort"}, - {QUICKSORT, "quicksort"}, - {EXTERNALSORT, "external sort"}, - {EXTERNALMERGE, "external merge"}, - {STILLINPROGRESS, "still in progress"}, -}; - TrackDesc trackdesc[] = { {RECV_PLAN, "begin query", false, TRACK_TIMESTAMP}, @@ -1295,6 +1287,10 @@ Instrumentation* ThreadInstrumentation::allocInstrSlot(int plan_node_id, int par pname = "Vector Sort"; plan_type = SORT_OP; break; + case T_IncrementalSort: + pname = "Incremental Sort"; + plan_type = SORT_OP; + break; case T_Group: pname = "Group"; plan_type = UTILITY_OP; break; diff --git a/src/gausskernel/runtime/executor/nodeIncrementalSort.cpp b/src/gausskernel/runtime/executor/nodeIncrementalSort.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f3c04555108c536adb816d6e6629f7afc659fa47 --- /dev/null +++ b/src/gausskernel/runtime/executor/nodeIncrementalSort.cpp @@ -0,0 +1,1157 @@ +/*------------------------------------------------------------------------- + * + * nodeIncrementalSort.cpp + * Routines to handle incremental sorting of relations. + * + * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd. + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeIncrementalSort.cpp + * + * DESCRIPTION + * + * Incremental sort is an optimized variant of multikey sort for cases + * when the input is already sorted by a prefix of the sort keys. For + * example when a sort by (key1, key2 ... keyN) is requested, and the + * input is already sorted by (key1, key2 ... keyM), M < N, we can + * divide the input into groups where keys (key1, ...
keyM) are equal, + * and only sort on the remaining columns. + * + * Consider the following example. We have input tuples consisting of + * two integers (X, Y) already presorted by X, while it's required to + * sort them by both X and Y. Let input tuples be following. + * + * (1, 5) + * (1, 2) + * (2, 9) + * (2, 1) + * (2, 5) + * (3, 3) + * (3, 7) + * + * An incremental sort algorithm would split the input into the following + * groups, which have equal X, and then sort them by Y individually: + * + * (1, 5) (1, 2) + * (2, 9) (2, 1) (2, 5) + * (3, 3) (3, 7) + * + * After sorting these groups and putting them altogether, we would get + * the following result which is sorted by X and Y, as requested: + * + * (1, 2) + * (1, 5) + * (2, 1) + * (2, 5) + * (2, 9) + * (3, 3) + * (3, 7) + * + * Incremental sort may be more efficient than plain sort, particularly + * on large datasets, as it reduces the amount of data to sort at once, + * making it more likely it fits into work_mem (eliminating the need to + * spill to disk). But the main advantage of incremental sort is that + * it can start producing rows early, before sorting the whole dataset, + * which is a significant benefit especially for queries with LIMIT. + * + * The algorithm we've implemented here is modified from the theoretical + * base described above by operating in two different modes: + * - Fetching a minimum number of tuples without checking prefix key + * group membership and sorting on all columns when safe. + * - Fetching all tuples for a single prefix key group and sorting on + * solely the unsorted columns. + * We always begin in the first mode, and employ a heuristic to switch + * into the second mode if we believe it's beneficial. + * + * Sorting incrementally can potentially use less memory, avoid fetching + * and sorting all tuples in the dataset, and begin returning tuples before + * the entire result set is available. + * + * The hybrid mode approach allows us to optimize for both very small + * groups (where the overhead of a new tuplesort is high) and very large + * groups (where we can lower cost by not having to sort on already sorted + * columns), albeit at some extra cost while switching between modes. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/tableam.h" +#include "executor/exec/execdebug.h" +#include "executor/node/nodeIncrementalSort.h" +#include "miscadmin.h" +#include "utils/lsyscache.h" +#include "utils/tuplesort.h" + +/* + * We need to store the instrumentation information in local node's sort + * info. This macro expands to choose the proper sort state and group info. + * + * Arguments: + * - node: type IncrementalSortState * + * - groupName: the token fullsort or prefixsort + */ +#define INSTRUMENT_SORT_GROUP(node, groupName) \ + do { \ + if ((node)->ss.ps.instrument != NULL) \ + { \ + instrumentSortedGroup(&(node)->incsort_info.groupName##GroupInfo, \ + (node)->groupName##_state); \ + } \ + } while (0) + +/* ---------------------------------------------------------------- + * instrumentSortedGroup + * + * Because incremental sort processes (potentially many) sort batches, we need + * to capture tuplesort stats each time we finalize a sort state. This summary + * data is later used for EXPLAIN ANALYZE output. 
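The DESCRIPTION above boils down to a small standalone routine (illustrative container types, unrelated to tuplesort): with rows already ordered by the first column, ordering by both columns only requires sorting each run of equal first-column values on the second column.

    #include <algorithm>
    #include <cstddef>
    #include <utility>
    #include <vector>

    /* rows must already be ordered by .first (the presorted prefix column). */
    static void SortByBothColumns(std::vector<std::pair<int, int>>& rows)
    {
        std::size_t groupStart = 0;
        for (std::size_t i = 1; i <= rows.size(); i++) {
            /* A group ends at end-of-input or where the prefix value changes. */
            if (i == rows.size() || rows[i].first != rows[groupStart].first) {
                std::sort(rows.begin() + groupStart, rows.begin() + i,
                          [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
                              return a.second < b.second;
                          });
                groupStart = i;
            }
        }
    }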
+ * ---------------------------------------------------------------- + */ +static void instrumentSortedGroup(IncrementalSortGroupInfo *groupInfo, Tuplesortstate *sortState) +{ + TuplesortInstrumentation sort_instr; + + groupInfo->groupCount++; + + tuplesort_get_stats(sortState, &sort_instr); + + /* Calculate total and maximum memory and disk space used. */ + switch (sort_instr.spaceType) { + case SORT_SPACE_TYPE_DISK: + groupInfo->totalDiskSpaceUsed += sort_instr.spaceUsed; + if (sort_instr.spaceUsed > groupInfo->maxDiskSpaceUsed) { + groupInfo->maxDiskSpaceUsed = sort_instr.spaceUsed; + } + + break; + case SORT_SPACE_TYPE_MEMORY: + groupInfo->totalMemorySpaceUsed += sort_instr.spaceUsed; + if (sort_instr.spaceUsed > groupInfo->maxMemorySpaceUsed) { + groupInfo->maxMemorySpaceUsed = sort_instr.spaceUsed; + } + + break; + } + + /* Track each sort method we've used. */ + groupInfo->sortMethods |= sort_instr.sortMethod; +} + +/* ---------------------------------------------------------------- + * preparePresortedCols + * + * Prepare information for presorted_keys comparisons. + * ---------------------------------------------------------------- + */ +static void preparePresortedCols(IncrementalSortState *node) +{ + IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan); + + node->presorted_keys = + (PresortedKeyData *) palloc(plannode->nPresortedCols * + sizeof(PresortedKeyData)); + + /* Pre-cache comparison functions for each pre-sorted key. */ + for (int i = 0; i < plannode->nPresortedCols; i++) { + Oid equalityOp; + Oid equalityFunc; + PresortedKeyData *key; + + key = &node->presorted_keys[i]; + key->attno = plannode->sort.sortColIdx[i]; + + equalityOp = get_equality_op_for_ordering_op(plannode->sort.sortOperators[i], + NULL); + if (!OidIsValid(equalityOp)) { + elog(ERROR, "missing equality operator for ordering operator %u", + plannode->sort.sortOperators[i]); + } + + equalityFunc = get_opcode(equalityOp); + if (!OidIsValid(equalityFunc)) { + elog(ERROR, "missing function for operator %u", equalityOp); + } + + /* Lookup the comparison function */ + fmgr_info_cxt(equalityFunc, &key->flinfo, CurrentMemoryContext); + + /* We can initialize the callinfo just once and re-use it */ + key->fcinfo = (FunctionCallInfo)palloc0(sizeof(FunctionCallInfoData)); + InitFunctionCallInfoData(*key->fcinfo, &key->flinfo, 2, + plannode->sort.collations[i], NULL, NULL); + key->fcinfo->argnull[0] = false; + key->fcinfo->argnull[1] = false; + } +} + +/* ---------------------------------------------------------------- + * isCurrentGroup + * + * Check whether a given tuple belongs to the current sort group by comparing + * the presorted column values to the pivot tuple of the current group. + * ---------------------------------------------------------------- + */ +static bool isCurrentGroup(IncrementalSortState *node, TupleTableSlot *pivot, TupleTableSlot *tuple) +{ + int nPresortedCols = castNode(IncrementalSort, node->ss.ps.plan)->nPresortedCols; + + /* + * That the input is sorted by keys * (0, ... n) implies that the tail + * keys are more likely to change. Therefore we do our comparison starting + * from the last pre-sorted column to optimize for early detection of + * inequality and minimizing the number of function calls.. 
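A minimal sketch of the comparison order described above (plain integer rows instead of tuple slots): because the input is ordered on the prefix, the last presorted column is the one most likely to differ, so comparing back-to-front usually detects a group boundary with a single comparison.

    #include <vector>

    /* Returns true when rows a and b agree on the first nPresorted columns.
     * Columns are checked back-to-front so a change in the least-significant
     * presorted column is found early. */
    static bool SamePrefixGroup(const std::vector<int>& a, const std::vector<int>& b, int nPresorted)
    {
        for (int i = nPresorted - 1; i >= 0; i--) {
            if (a[i] != b[i]) {
                return false;
            }
        }
        return true;
    }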
+ */ + for (int i = nPresortedCols - 1; i >= 0; i--) { + Datum datumA; + Datum datumB; + Datum result; + bool isnullA; + bool isnullB; + AttrNumber attno = node->presorted_keys[i].attno; + PresortedKeyData *key; + + datumA = tableam_tslot_getattr(pivot, attno, &isnullA); + datumB = tableam_tslot_getattr(tuple, attno, &isnullB); + + /* Special case for NULL-vs-NULL, else use standard comparison */ + if (isnullA || isnullB) { + if (isnullA == isnullB) { + continue; + } else { + return false; + } + } + + key = &node->presorted_keys[i]; + + key->fcinfo->arg[0] = datumA; + key->fcinfo->arg[1] = datumB; + + /* just for paranoia's sake, we reset isnull each time */ + key->fcinfo->isnull = false; + + result = FunctionCallInvoke(key->fcinfo); + + /* Check for null result, since caller is clearly not expecting one */ + if (key->fcinfo->isnull) { + ereport(ERROR, (errmsg("function %u returned NULL", key->flinfo.fn_oid))); + } + + if (!DatumGetBool(result)) { + return false; + } + } + return true; +} + +/* ---------------------------------------------------------------- + * switchToPresortedPrefixMode + * + * When we determine that we've likely encountered a large batch of tuples all + * having the same presorted prefix values, we want to optimize tuplesort by + * only sorting on unsorted suffix keys. + * + * The problem is that we've already accumulated several tuples in another + * tuplesort configured to sort by all columns (assuming that there may be + * more than one prefix key group). So to switch to presorted prefix mode we + * have to go back and look at all the tuples we've already accumulated to + * verify they're all part of the same prefix key group before sorting them + * solely by unsorted suffix keys. + * + * While it's likely that all tuples already fetched are all part of a single + * prefix group, we also have to handle the possibility that there is at least + * one different prefix key group before the large prefix key group. + * ---------------------------------------------------------------- + */ +static void switchToPresortedPrefixMode(IncrementalSortState *node) +{ + ScanDirection dir; + int64 nTuples; + TupleDesc tupDesc; + PlanState *outerNode; + IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan); + int64 work_mem = SET_NODEMEM(plannode->sort.plan.operatorMemKB[0], plannode->sort.plan.dop); + int64 max_mem = (plannode->sort.plan.operatorMaxMem > 0) + ? SET_NODEMEM(plannode->sort.plan.operatorMaxMem, plannode->sort.plan.dop) : 0; + + dir = node->ss.ps.state->es_direction; + outerNode = outerPlanState(node); + tupDesc = ExecGetResultType(outerNode); + + /* Configure the prefix sort state the first time around. */ + if (node->prefixsort_state == NULL) { + Tuplesortstate *prefixsort_state = NULL; + int nPresortedCols = plannode->nPresortedCols; + + /* + * Optimize the sort by assuming the prefix columns are all equal and + * thus we only need to sort by any remaining columns. 
+ */ + prefixsort_state = tuplesort_begin_heap(tupDesc, + plannode->sort.numCols - nPresortedCols, + &(plannode->sort.sortColIdx[nPresortedCols]), + &(plannode->sort.sortOperators[nPresortedCols]), + &(plannode->sort.collations[nPresortedCols]), + &(plannode->sort.nullsFirst[nPresortedCols]), + work_mem, + false, + max_mem, + plannode->sort.plan.plan_node_id, + SET_DOP(plannode->sort.plan.dop)); + node->prefixsort_state = prefixsort_state; + } else { + /* Next group of presorted data */ + tuplesort_reset(node->prefixsort_state); + } + + /* + * If the current node has a bound, then it's reasonably likely that a + * large prefix key group will benefit from bounded sort, so configure the + * tuplesort to allow for that optimization. + */ + if (node->bounded) { + SO1_printf("Setting bound on presorted prefix tuplesort to: " INT64_FORMAT "\n", + node->bound - node->bound_Done); + tuplesort_set_bound(node->prefixsort_state, + node->bound - node->bound_Done); + } + + /* + * Copy as many tuples as we can (i.e., in the same prefix key group) from + * the full sort state to the prefix sort state. + */ + for (nTuples = 0; nTuples < node->n_fullsort_remaining; nTuples++) { + /* + * When we encounter multiple prefix key groups inside the full sort + * tuplesort we have to carry over the last read tuple into the next + * batch. + */ + if (nTuples == 0 && !TupIsNull(node->transfer_tuple)) { + tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple); + /* The carried over tuple is our new group pivot tuple. */ + ExecCopySlot(node->group_pivot, node->transfer_tuple); + } else { + tuplesort_gettupleslot(node->fullsort_state, + ScanDirectionIsForward(dir), + node->transfer_tuple, NULL); + + /* + * If this is our first time through the loop, then we need to + * save the first tuple we get as our new group pivot. + */ + if (TupIsNull(node->group_pivot)) { + ExecCopySlot(node->group_pivot, node->transfer_tuple); + } + + if (isCurrentGroup(node, node->group_pivot, node->transfer_tuple)) { + tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple); + } else { + /* + * The tuple isn't part of the current batch so we need to + * carry it over into the next batch of tuples we transfer out + * of the full sort tuplesort into the presorted prefix + * tuplesort. We don't actually have to do anything special to + * save the tuple since we've already loaded it into the + * node->transfer_tuple slot, and, even though that slot + * points to memory inside the full sort tuplesort, we can't + * reset that tuplesort anyway until we've fully transferred + * out its tuples, so this reference is safe. We do need to + * reset the group pivot tuple though since we've finished the + * current prefix key group. + */ + ExecClearTuple(node->group_pivot); + + /* Break out of for-loop early */ + break; + } + } + } + + /* + * Track how many tuples remain in the full sort batch so that we know if + * we need to sort multiple prefix key groups before processing tuples + * remaining in the large single prefix key group we think we've + * encountered. + */ + SO1_printf("Moving " INT64_FORMAT " tuples to presorted prefix tuplesort\n", nTuples); + node->n_fullsort_remaining -= nTuples; + SO1_printf("Setting n_fullsort_remaining to " INT64_FORMAT "\n", node->n_fullsort_remaining); + + if (node->n_fullsort_remaining == 0) { + /* + * We've found that all tuples remaining in the full sort batch are in + * the same prefix key group and moved all of those tuples into the + * presorted prefix tuplesort. 
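The transfer loop above can be pictured with plain containers (a sketch only; the single-column prefix and the container types are stand-ins for the two tuplesorts): drain the buffered batch into the prefix-only sorter while the prefix still matches the pivot, and carry the first non-matching row over as the start of the next group.

    #include <deque>
    #include <optional>
    #include <vector>

    /* Moves the leading run of rows whose first column equals 'pivot' from
     * 'buffered' into 'target'; returns the first row of the next group, if any. */
    static std::optional<std::vector<int>> TransferCurrentGroup(std::deque<std::vector<int>>& buffered,
                                                                std::vector<std::vector<int>>& target,
                                                                int pivot)
    {
        while (!buffered.empty()) {
            std::vector<int> row = buffered.front();
            buffered.pop_front();
            if (row[0] == pivot) {
                target.push_back(row);  /* same prefix group: keep transferring */
            } else {
                return row;             /* carry-over row starts the next group */
            }
        }
        return std::nullopt;            /* batch exhausted inside one group */
    }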
We don't know that we've yet found the + * last tuple in the current prefix key group, so save our pivot + * comparison tuple and continue fetching tuples from the outer + * execution node to load into the presorted prefix tuplesort. + */ + ExecCopySlot(node->group_pivot, node->transfer_tuple); + SO_printf("Setting execution_status to INCSORT_LOADPREFIXSORT (switchToPresortedPrefixMode)\n"); + node->execution_status = INCSORT_LOADPREFIXSORT; + + /* + * Make sure we clear the transfer tuple slot so that next time we + * encounter a large prefix key group we don't incorrectly assume we + * have a tuple carried over from the previous group. + */ + ExecClearTuple(node->transfer_tuple); + } else { + /* + * We finished a group but didn't consume all of the tuples from the + * full sort state, so we'll sort this batch, let the outer node read + * out all of those tuples, and then come back around to find another + * batch. + */ + SO1_printf("Sorting presorted prefix tuplesort with " INT64_FORMAT " tuples\n", nTuples); + tuplesort_performsort(node->prefixsort_state); + + INSTRUMENT_SORT_GROUP(node, prefixsort); + + if (node->bounded) { + /* + * If the current node has a bound and we've already sorted n + * tuples, then the functional bound remaining is (original bound + * - n), so store the current number of processed tuples for use + * in configuring sorting bound. + */ + SO2_printf("Changing bound_Done from " INT64_FORMAT " to " INT64_FORMAT "\n", + Min(node->bound, node->bound_Done + nTuples), node->bound_Done); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + + SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (switchToPresortedPrefixMode)\n"); + node->execution_status = INCSORT_READPREFIXSORT; + } +} + +/* + * Sorting many small groups with tuplesort is inefficient. In order to + * cope with this problem we don't start a new group until the current one + * contains at least DEFAULT_MIN_GROUP_SIZE tuples (unfortunately this also + * means we can't assume small groups of tuples all have the same prefix keys.) + * When we have a bound that's less than DEFAULT_MIN_GROUP_SIZE we start looking + * for the new group as soon as we've met our bound to avoid fetching more + * tuples than we absolutely have to fetch. + */ +#define DEFAULT_MIN_GROUP_SIZE 32 + +/* + * While we've optimized for small prefix key groups by not starting our prefix + * key comparisons until we've reached a minimum number of tuples, we don't want + * that optimization to cause us to lose out on the benefits of being able to + * assume a large group of tuples is fully presorted by its prefix keys. + * Therefore we use the DEFAULT_MAX_FULL_SORT_GROUP_SIZE cutoff as a heuristic + * for determining when we believe we've encountered a large group, and, if we + * get to that point without finding a new prefix key group we transition to + * presorted prefix key mode. + */ +#define DEFAULT_MAX_FULL_SORT_GROUP_SIZE (2 * DEFAULT_MIN_GROUP_SIZE) + +/* ---------------------------------------------------------------- + * ExecIncrementalSort + * + * Assuming that outer subtree returns tuple presorted by some prefix + * of target sort columns, performs incremental sort. + * + * Conditions: + * -- none. + * + * Initial States: + * -- the outer child is prepared to return the first tuple. 
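The two thresholds defined above drive a small decision procedure; the sketch below assumes the defaults named here (32 and 2 * 32) and ignores the bounded-sort clamp of the minimum group size.

    /* Sketch of the hybrid-mode heuristic driven by the two thresholds above. */
    enum class LoadDecision {
        KeepFullSorting,     /* group still too small to bother checking prefixes */
        CheckPrefixBoundary, /* compare against the pivot to spot a new group     */
        SwitchToPrefixMode   /* group looks large: sort the suffix keys only      */
    };

    static LoadDecision DecideNextStep(long tuplesInGroup, long minGroupSize, long maxFullSortGroupSize)
    {
        if (tuplesInGroup < minGroupSize) {
            return LoadDecision::KeepFullSorting;
        }
        if (tuplesInGroup > maxFullSortGroupSize) {
            return LoadDecision::SwitchToPrefixMode;
        }
        return LoadDecision::CheckPrefixBoundary;
    }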
+ * ---------------------------------------------------------------- + */ +TupleTableSlot *ExecIncrementalSort(PlanState *pstate) +{ + IncrementalSortState *node = castNode(IncrementalSortState, pstate); + EState *estate; + ScanDirection dir; + Tuplesortstate *read_sortstate; + Tuplesortstate *fullsort_state; + TupleTableSlot *slot; + IncrementalSort *plannode = (IncrementalSort *)node->ss.ps.plan; + PlanState *outerNode; + TupleDesc tupDesc; + int64 nTuples = 0; + int64 minGroupSize; + int64 work_mem = SET_NODEMEM(plannode->sort.plan.operatorMemKB[0], plannode->sort.plan.dop); + int64 max_mem = (plannode->sort.plan.operatorMaxMem > 0) + ? SET_NODEMEM(plannode->sort.plan.operatorMaxMem, plannode->sort.plan.dop) : 0; + + CHECK_FOR_INTERRUPTS(); + + estate = node->ss.ps.state; + dir = estate->es_direction; + fullsort_state = node->fullsort_state; + + /* + * If a previous iteration has sorted a batch, then we need to check to + * see if there are any remaining tuples in that batch that we can return + * before moving on to other execution states. + */ + if (node->execution_status == INCSORT_READFULLSORT || + node->execution_status == INCSORT_READPREFIXSORT) + { + /* + * Return next tuple from the current sorted group set if available. + */ + read_sortstate = node->execution_status == INCSORT_READFULLSORT ? + fullsort_state : node->prefixsort_state; + slot = node->ss.ps.ps_ResultTupleSlot; + + /* + * We have to populate the slot from the tuplesort before checking + * outerNodeDone because it will set the slot to NULL if no more + * tuples remain. If the tuplesort is empty, but we don't have any + * more tuples available for sort from the outer node, then + * outerNodeDone will have been set so we'll return that now-empty + * slot to the caller. + */ + if (tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir), + slot, NULL) || node->outerNodeDone) + { + + /* + * Note: there isn't a good test case for the node->outerNodeDone + * check directly, but we need it for any plan where the outer + * node will fail when trying to fetch too many tuples. + */ + return slot; + } else if (node->n_fullsort_remaining > 0) { + /* + * When we transition to presorted prefix mode, we might have + * accumulated at least one additional prefix key group in the + * full sort tuplesort. The first call to + * switchToPresortedPrefixMode() will have pulled the first one of + * those groups out, and we've returned those tuples to the parent + * node, but if at this point we still have tuples remaining in + * the full sort state (i.e., n_fullsort_remaining > 0), then we + * need to re-execute the prefix mode transition function to pull + * out the next prefix key group. + */ + SO1_printf("Re-calling switchToPresortedPrefixMode() because n_fullsort_remaining is > 0 (" INT64_FORMAT ")\n", + node->n_fullsort_remaining); + switchToPresortedPrefixMode(node); + } else { + /* + * If we don't have any sorted tuples to read and we're not + * currently transitioning into presorted prefix sort mode, then + * it's time to start the process all over again by building a new + * group in the full sort state. + */ + SO_printf("Setting execution_status to INCSORT_LOADFULLSORT (n_fullsort_remaining > 0)\n"); + node->execution_status = INCSORT_LOADFULLSORT; + } + } + + /* + * Scan the subplan in the forward direction while creating the sorted + * data. + */ + estate->es_direction = ForwardScanDirection; + + outerNode = outerPlanState(node); + tupDesc = ExecGetResultType(outerNode); + + /* Load tuples into the full sort state. 
*/ + if (node->execution_status == INCSORT_LOADFULLSORT) { + /* + * Initialize sorting structures. + */ + if (fullsort_state == NULL) { + /* + * Initialize presorted column support structures for + * isCurrentGroup(). It's correct to do this along with the + * initial initialization for the full sort state (and not for the + * prefix sort state) since we always load the full sort state + * first. + */ + preparePresortedCols(node); + + /* + * Since we optimize small prefix key groups by accumulating a + * minimum number of tuples before sorting, we can't assume that a + * group of tuples all have the same prefix key values. Hence we + * setup the full sort tuplesort to sort by all requested sort + * keys. + */ + fullsort_state = tuplesort_begin_heap(tupDesc, + plannode->sort.numCols, + plannode->sort.sortColIdx, + plannode->sort.sortOperators, + plannode->sort.collations, + plannode->sort.nullsFirst, + work_mem, + false, + max_mem, + plannode->sort.plan.plan_node_id, + SET_DOP(plannode->sort.plan.dop)); + node->fullsort_state = fullsort_state; + } else { + /* Reset sort for the next batch. */ + tuplesort_reset(fullsort_state); + node->fullsort_state = fullsort_state; + } + + /* + * Calculate the remaining tuples left if bounded and configure both + * bounded sort and the minimum group size accordingly. + */ + if (node->bounded) { + int64 currentBound = node->bound - node->bound_Done; + + /* + * Bounded sort isn't likely to be a useful optimization for full + * sort mode since we limit full sort mode to a relatively small + * number of tuples and tuplesort doesn't switch over to top-n + * heap sort anyway unless it hits (2 * bound) tuples. + */ + if (currentBound < DEFAULT_MIN_GROUP_SIZE) + tuplesort_set_bound(fullsort_state, currentBound); + + minGroupSize = Min(DEFAULT_MIN_GROUP_SIZE, currentBound); + } else { + minGroupSize = DEFAULT_MIN_GROUP_SIZE; + } + + /* + * Because we have to read the next tuple to find out that we've + * encountered a new prefix key group, on subsequent groups we have to + * carry over that extra tuple and add it to the new group's sort here + * before we read any new tuples from the outer node. + */ + if (!TupIsNull(node->group_pivot)) { + tuplesort_puttupleslot(fullsort_state, node->group_pivot); + nTuples++; + + /* + * We're in full sort mode accumulating a minimum number of tuples + * and not checking for prefix key equality yet, so we can't + * assume the group pivot tuple will remain the same -- unless + * we're using a minimum group size of 1, in which case the pivot + * is obviously still the pivot. + */ + if (nTuples != minGroupSize) { + ExecClearTuple(node->group_pivot); + } + } + + + /* + * Pull as many tuples from the outer node as possible given our + * current operating mode. + */ + for (;;) { + slot = ExecProcNode(outerNode); + + /* + * If the outer node can't provide us any more tuples, then we can + * sort the current group and return those tuples. + */ + if (TupIsNull(slot)) { + /* + * We need to know later if the outer node has completed to be + * able to distinguish between being done with a batch and + * being done with the whole node. + */ + node->outerNodeDone = true; + + SO1_printf("Sorting fullsort with " INT64_FORMAT " tuples\n", nTuples); + tuplesort_performsort(fullsort_state); + + INSTRUMENT_SORT_GROUP(node, fullsort); + + SO_printf("Setting execution_status to INCSORT_READFULLSORT (final tuple)\n"); + node->execution_status = INCSORT_READFULLSORT; + break; + } + + /* Accumulate the next group of presorted tuples. 
*/ + if (nTuples < minGroupSize) { + /* + * If we haven't yet hit our target minimum group size, then + * we don't need to bother checking for inclusion in the + * current prefix group since at this point we'll assume that + * we'll full sort this batch to avoid a large number of very + * tiny (and thus inefficient) sorts. + */ + tuplesort_puttupleslot(fullsort_state, slot); + nTuples++; + + /* + * If we've reached our minimum group size, then we need to + * store the most recent tuple as a pivot. + */ + if (nTuples == minGroupSize) { + ExecCopySlot(node->group_pivot, slot); + } + } else { + /* + * If we've already accumulated enough tuples to reach our + * minimum group size, then we need to compare any additional + * tuples to our pivot tuple to see if we reach the end of + * that prefix key group. Only after we find changed prefix + * keys can we guarantee sort stability of the tuples we've + * already accumulated. + */ + if (isCurrentGroup(node, node->group_pivot, slot)) { + /* + * As long as the prefix keys match the pivot tuple then + * load the tuple into the tuplesort. + */ + tuplesort_puttupleslot(fullsort_state, slot); + nTuples++; + } else { + /* + * Since the tuple we fetched isn't part of the current + * prefix key group we don't want to sort it as part of + * the current batch. Instead we use the group_pivot slot + * to carry it over to the next batch (even though we + * won't actually treat it as a group pivot). + */ + ExecCopySlot(node->group_pivot, slot); + + if (node->bounded) { + /* + * If the current node has a bound, and we've already + * sorted n tuples, then the functional bound + * remaining is (original bound - n), so store the + * current number of processed tuples for later use + * configuring the sort state's bound. + */ + SO2_printf("Changing bound_Done from " INT64_FORMAT " to " INT64_FORMAT "\n", + node->bound_Done, + Min(node->bound, node->bound_Done + nTuples)); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + + /* + * Once we find changed prefix keys we can complete the + * sort and transition modes to reading out the sorted + * tuples. + */ + SO1_printf("Sorting fullsort tuplesort with " INT64_FORMAT " tuples\n", + nTuples); + tuplesort_performsort(fullsort_state); + + INSTRUMENT_SORT_GROUP(node, fullsort); + + SO_printf("Setting execution_status to INCSORT_READFULLSORT (found end of group)\n"); + node->execution_status = INCSORT_READFULLSORT; + break; + } + } + + /* + * Unless we've already transitioned modes to reading from the + * full sort state, then we assume that having read at least + * DEFAULT_MAX_FULL_SORT_GROUP_SIZE tuples means it's likely we're + * processing a large group of tuples all having equal prefix keys + * (but haven't yet found the final tuple in that prefix key + * group), so we need to transition into presorted prefix mode. + */ + if (nTuples > DEFAULT_MAX_FULL_SORT_GROUP_SIZE && + node->execution_status != INCSORT_READFULLSORT) + { + /* + * The group pivot we have stored has already been put into + * the tuplesort; we don't want to carry it over. Since we + * haven't yet found the end of the prefix key group, it might + * seem like we should keep this, but we don't actually know + * how many prefix key groups might be represented in the full + * sort state, so we'll let the mode transition function + * manage this state for us. 
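The bound bookkeeping in both modes follows the same arithmetic; a minimal sketch (bound and boundDone mirror the fields used above): after a group of n tuples is finished, boundDone is clamped to the original bound and the remainder is what the next per-group sort still needs to produce.

    #include <algorithm>
    #include <cstdint>

    struct BoundState {
        int64_t bound;     /* total tuples the parent needs (LIMIT + OFFSET) */
        int64_t boundDone; /* tuples already accounted for by finished groups */
    };

    /* Account for a finished group of nTuples; returns the bound to give the
     * next per-group sort. */
    static int64_t AdvanceBound(BoundState& s, int64_t nTuples)
    {
        s.boundDone = std::min(s.bound, s.boundDone + nTuples);
        return s.bound - s.boundDone; /* remaining tuples still worth sorting */
    }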
+ */ + ExecClearTuple(node->group_pivot); + + /* + * Unfortunately the tuplesort API doesn't include a way to + * retrieve tuples unless a sort has been performed, so we + * perform the sort even though we could just as easily rely + * on FIFO retrieval semantics when transferring them to the + * presorted prefix tuplesort. + */ + SO1_printf("Sorting fullsort tuplesort with " INT64_FORMAT " tuples\n", nTuples); + tuplesort_performsort(fullsort_state); + + INSTRUMENT_SORT_GROUP(node, fullsort); + + /* + * If the full sort tuplesort happened to switch into top-n + * heapsort mode then we will only be able to retrieve + * currentBound tuples (since the tuplesort will have only + * retained the top-n tuples). This is safe even though we + * haven't yet completed fetching the current prefix key group + * because the tuples we've "lost" already sorted "below" the + * retained ones, and we're already contractually guaranteed + * to not need any more than the currentBound tuples. + */ + if (tuplesort_used_bound(node->fullsort_state)) { + int64 currentBound = node->bound - node->bound_Done; + + SO2_printf("Read " INT64_FORMAT " tuples, but setting to " INT64_FORMAT + " because we used bounded sort\n", nTuples, Min(currentBound, nTuples)); + nTuples = Min(currentBound, nTuples); + } + + SO1_printf("Setting n_fullsort_remaining to " INT64_FORMAT + " and calling switchToPresortedPrefixMode()\n", nTuples); + + /* + * We might have multiple prefix key groups in the full sort + * state, so the mode transition function needs to know that + * it needs to move from the fullsort to presorted prefix + * sort. + */ + node->n_fullsort_remaining = nTuples; + + /* Transition the tuples to the presorted prefix tuplesort. */ + switchToPresortedPrefixMode(node); + + /* + * Since we know we had tuples to move to the presorted prefix + * tuplesort, we know that unless that transition has verified + * that all tuples belonged to the same prefix key group (in + * which case we can go straight to continuing to load tuples + * into that tuplesort), we should have a tuple to return + * here. + * + * Either way, the appropriate execution status should have + * been set by switchToPresortedPrefixMode(), so we can drop + * out of the loop here and let the appropriate path kick in. + */ + break; + } + } + } + + if (node->execution_status == INCSORT_LOADPREFIXSORT) { + /* + * We only enter this state after the mode transition function has + * confirmed all remaining tuples from the full sort state have the + * same prefix and moved those tuples to the prefix sort state. That + * function has also set a group pivot tuple (which doesn't need to be + * carried over; it's already been put into the prefix sort state). + */ + Assert(!TupIsNull(node->group_pivot)); + + /* + * Read tuples from the outer node and load them into the prefix sort + * state until we encounter a tuple whose prefix keys don't match the + * current group_pivot tuple, since we can't guarantee sort stability + * until we have all tuples matching those prefix keys. + */ + for (;;) { + slot = ExecProcNode(outerNode); + + /* + * If we've exhausted tuples from the outer node we're done + * loading the prefix sort state. + */ + if (TupIsNull(slot)) { + /* + * We need to know later if the outer node has completed to be + * able to distinguish between being done with a batch and + * being done with the whole node. 
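Stepping back, the node cycles through four execution states; the sketch below keeps the state names from the code but paraphrases the transition conditions, so it is an overview rather than the executor logic itself.

    /* Paraphrased control flow of the incremental sort node. */
    enum IncSortStatus { LOADFULLSORT, READFULLSORT, LOADPREFIXSORT, READPREFIXSORT };

    static IncSortStatus NextStatus(IncSortStatus cur, bool groupEnded, bool largeGroupDetected,
                                    bool batchDrained, bool moreFullsortGroups)
    {
        switch (cur) {
            case LOADFULLSORT:
                if (largeGroupDetected)
                    return moreFullsortGroups ? READPREFIXSORT : LOADPREFIXSORT;
                return groupEnded ? READFULLSORT : LOADFULLSORT;
            case READFULLSORT:
                return batchDrained ? LOADFULLSORT : READFULLSORT;
            case LOADPREFIXSORT:
                return groupEnded ? READPREFIXSORT : LOADPREFIXSORT;
            case READPREFIXSORT:
                return batchDrained ? LOADFULLSORT : READPREFIXSORT;
        }
        return cur;
    }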
+ */ + node->outerNodeDone = true; + break; + } + + /* + * If the tuple's prefix keys match our pivot tuple, we're not + * done yet and can load it into the prefix sort state. If not, we + * don't want to sort it as part of the current batch. Instead we + * use the group_pivot slot to carry it over to the next batch + * (even though we won't actually treat it as a group pivot). + */ + if (isCurrentGroup(node, node->group_pivot, slot)) { + tuplesort_puttupleslot(node->prefixsort_state, slot); + nTuples++; + } else { + ExecCopySlot(node->group_pivot, slot); + break; + } + } + + /* + * Perform the sort and begin returning the tuples to the parent plan + * node. + */ + SO1_printf("Sorting presorted prefix tuplesort with " INT64_FORMAT " tuples\n", nTuples); + tuplesort_performsort(node->prefixsort_state); + + INSTRUMENT_SORT_GROUP(node, prefixsort); + + SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (found end of group)\n"); + node->execution_status = INCSORT_READPREFIXSORT; + + if (node->bounded) { + /* + * If the current node has a bound, and we've already sorted n + * tuples, then the functional bound remaining is (original bound + * - n), so store the current number of processed tuples for use + * in configuring sorting bound. + */ + SO2_printf("Changing bound_Done from " INT64_FORMAT " to " INT64_FORMAT "\n", + node->bound_Done, + Min(node->bound, node->bound_Done + nTuples)); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + } + + /* Restore to user specified direction. */ + estate->es_direction = dir; + + /* + * Get the first or next tuple from tuplesort. Returns NULL if no more + * tuples. + */ + read_sortstate = node->execution_status == INCSORT_READFULLSORT ? + fullsort_state : node->prefixsort_state; + slot = node->ss.ps.ps_ResultTupleSlot; + (void) tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir), + slot, NULL); + return slot; +} + +/* ---------------------------------------------------------------- + * ExecInitIncrementalSort + * + * Creates the run-time state information for the sort node + * produced by the planner and initializes its outer subtree. + * ---------------------------------------------------------------- + */ +PlanState *ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags) +{ + SO_printf("ExecInitIncrementalSort: initializing sort node\n"); + + /* + * Incremental sort can't be used with EXEC_FLAG_BACKWARD or + * EXEC_FLAG_MARK, because the current sort state contains only one sort + * batch rather than the full result set. + */ + Assert((eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)) == 0); + + /* Initialize state structure. 
*/ + IncrementalSortState *incrsortstate = makeNode(IncrementalSortState); + incrsortstate->ss.ps.plan = (Plan *) node; + incrsortstate->ss.ps.state = estate; + incrsortstate->ss.ps.ExecProcNode = ExecIncrementalSort; + + incrsortstate->execution_status = INCSORT_LOADFULLSORT; + incrsortstate->bounded = false; + incrsortstate->outerNodeDone = false; + incrsortstate->bound_Done = 0; + incrsortstate->fullsort_state = NULL; + incrsortstate->prefixsort_state = NULL; + incrsortstate->group_pivot = NULL; + incrsortstate->transfer_tuple = NULL; + incrsortstate->n_fullsort_remaining = 0; + incrsortstate->presorted_keys = NULL; + + if (incrsortstate->ss.ps.instrument != NULL) { + IncrementalSortGroupInfo *fullsortGroupInfo = + &incrsortstate->incsort_info.fullsortGroupInfo; + IncrementalSortGroupInfo *prefixsortGroupInfo = + &incrsortstate->incsort_info.prefixsortGroupInfo; + + fullsortGroupInfo->groupCount = 0; + fullsortGroupInfo->maxDiskSpaceUsed = 0; + fullsortGroupInfo->totalDiskSpaceUsed = 0; + fullsortGroupInfo->maxMemorySpaceUsed = 0; + fullsortGroupInfo->totalMemorySpaceUsed = 0; + fullsortGroupInfo->sortMethods = 0; + prefixsortGroupInfo->groupCount = 0; + prefixsortGroupInfo->maxDiskSpaceUsed = 0; + prefixsortGroupInfo->totalDiskSpaceUsed = 0; + prefixsortGroupInfo->maxMemorySpaceUsed = 0; + prefixsortGroupInfo->totalMemorySpaceUsed = 0; + prefixsortGroupInfo->sortMethods = 0; + } + + /* + * Miscellaneous initialization + * + * Sort nodes don't initialize their ExprContexts because they never call + * ExecQual or ExecProject. + */ + + /* + * Initialize child nodes. + * + * Incremental sort does not support backwards scans and mark/restore, so + * we don't bother removing the flags from eflags here. We allow passing a + * REWIND flag, because although incremental sort can't use it, the child + * nodes may be able to do something more useful. + */ + outerPlanState(incrsortstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * Initialize scan slot and type. + */ + ExecInitScanTupleSlot(estate, &incrsortstate->ss); + + /* + * Initialize return slot and type. No need to initialize projection info + * because we don't do any projections. + */ + ExecInitResultTupleSlot(estate, &incrsortstate->ss.ps); + incrsortstate->ss.ps.ps_ProjInfo = NULL; + + /* + * initialize tuple type. no need to initialize projection info because + * this node doesn't do projections. + */ + ExecAssignScanTypeFromOuterPlan(&incrsortstate->ss); + + ExecAssignResultTypeFromTL(&incrsortstate->ss.ps, + incrsortstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor->td_tam_ops); + + /* + * Initialize standalone slots to store a tuple for pivot prefix keys and + * for carrying over a tuple from one batch to the next. 
+ */ + incrsortstate->group_pivot = \ + MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)), false, + incrsortstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor->td_tam_ops); + incrsortstate->transfer_tuple = \ + MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)), false, + incrsortstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor->td_tam_ops); + + SO_printf("ExecInitIncrementalSort: sort node initialized\n"); + + return (PlanState*)incrsortstate; +} + +/* ---------------------------------------------------------------- + * ExecEndIncrementalSort(node) + * ---------------------------------------------------------------- + */ +void ExecEndIncrementalSort(IncrementalSortState *node) +{ + SO_printf("ExecEndIncrementalSort: shutting down sort node\n"); + + /* clean out the scan tuple */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* must drop pointer to sort result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + /* must drop standalone tuple slots from outer node */ + ExecDropSingleTupleTableSlot(node->group_pivot); + ExecDropSingleTupleTableSlot(node->transfer_tuple); + + /* + * Release tuplesort resources. + */ + if (node->fullsort_state != NULL) { + tuplesort_end(node->fullsort_state); + node->fullsort_state = NULL; + } + + if (node->prefixsort_state != NULL) { + tuplesort_end(node->prefixsort_state); + node->prefixsort_state = NULL; + } + + /* + * Shut down the subplan. + */ + ExecEndNode(outerPlanState(node)); + + SO_printf("ExecEndIncrementalSort: sort node shutdown\n"); +} + +void ExecReScanIncrementalSort(IncrementalSortState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + /* + * Incremental sort doesn't support efficient rescan even when parameters + * haven't changed (e.g., rewind) because unlike regular sort we don't + * store all tuples at once for the full sort. + * + * So even if EXEC_FLAG_REWIND is set we just reset all of our state and + * re-execute the sort along with the child node. Incremental sort itself + * can't do anything smarter, but maybe the child nodes can. + * + * In theory if we've only filled the full sort with one batch (and + * haven't reset it for a new batch yet) then we could efficiently rewind, + * but that seems a narrow enough case that it's not worth handling + * specially at this time. + */ + + /* must drop pointer to sort result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + if (node->group_pivot != NULL) { + ExecClearTuple(node->group_pivot); + } + + if (node->transfer_tuple != NULL) { + ExecClearTuple(node->transfer_tuple); + } + + node->outerNodeDone = false; + node->n_fullsort_remaining = 0; + node->bound_Done = 0; + + node->execution_status = INCSORT_LOADFULLSORT; + + /* + * If we've set up either of the sort states yet, we need to reset them. + * We could end them and null out the pointers, but there's no reason to + * repay the setup cost, and because ExecIncrementalSort guards presorted + * column functions by checking to see if the full sort state has been + * initialized yet, setting the sort states to null here might actually + * cause a leak. + */ + if (node->fullsort_state != NULL) { + tuplesort_reset(node->fullsort_state); + } + + if (node->prefixsort_state != NULL) { + tuplesort_reset(node->prefixsort_state); + } + + /* + * If chgParam of subnode is not null, then the plan will be re-scanned by + * the first ExecProcNode. 
+ */ + if (outerPlan->chgParam == NULL) { + ExecReScan(outerPlan); + } +} + +void ExecReSetIncrementalSort(IncrementalSortState *node) +{ + Assert(IS_PGXC_DATANODE && node != NULL && (IsA(node, IncrementalSortState))); + + /* reset full sort state */ + if (node->fullsort_state != NULL) { + if (node->ss.ps.ps_ResultTupleSlot != NULL) { + (void)ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + } + tuplesort_end((Tuplesortstate*)node->fullsort_state); + node->fullsort_state = NULL; + } + + /* reset prefix sort state */ + if (node->prefixsort_state != NULL) { + if (node->ss.ps.ps_ResultTupleSlot != NULL) { + (void)ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + } + tuplesort_end((Tuplesortstate*)node->prefixsort_state); + node->prefixsort_state = NULL; + } + + node->ss.ps.recursive_reset = true; + ExecReSetRecursivePlanTree(outerPlanState(node)); + + return; +} diff --git a/src/gausskernel/runtime/executor/nodeLimit.cpp b/src/gausskernel/runtime/executor/nodeLimit.cpp index 83ad36192329e18a45d7f61d5688fda15a29c6e2..d036e33fb52221d96c6493ba09c60cd2d7c3de11 100644 --- a/src/gausskernel/runtime/executor/nodeLimit.cpp +++ b/src/gausskernel/runtime/executor/nodeLimit.cpp @@ -426,6 +426,18 @@ static void pass_down_bound(LimitState* node, PlanState* child_node) sortState->bounded = true; sortState->bound = tuples_needed; } + } else if (IsA(child_node, IncrementalSortState)) { + IncrementalSortState* sortState = (IncrementalSortState*)child_node; + int64 tuples_needed = node->count + node->offset; + + /* negative test checks for overflow in sum */ + if (node->noCount || tuples_needed < 0) { + /* make sure flag gets reset if needed upon rescan */ + sortState->bounded = false; + } else { + sortState->bounded = true; + sortState->bound = tuples_needed; + } } else if (IsA(child_node, MergeAppendState)) { MergeAppendState* maState = (MergeAppendState*)child_node; int i; diff --git a/src/gausskernel/runtime/executor/nodeRecursiveunion.cpp b/src/gausskernel/runtime/executor/nodeRecursiveunion.cpp index fefbe52765abf17741c3c1c63878a0ce2ab23a52..2ebef885fd24a009e4bc254381f8b5f38e1f0717 100644 --- a/src/gausskernel/runtime/executor/nodeRecursiveunion.cpp +++ b/src/gausskernel/runtime/executor/nodeRecursiveunion.cpp @@ -26,6 +26,7 @@ #include "executor/node/nodeRecursiveunion.h" #include "executor/node/nodeSetOp.h" #include "executor/node/nodeSort.h" +#include "executor/node/nodeIncrementalSort.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" #include "libpq/pqsignal.h" @@ -2202,6 +2203,10 @@ void ExecReSetRecursivePlanTree(PlanState* node) ExecReSetSort((SortState*)node); break; + case T_IncrementalSortState: + ExecReSetIncrementalSort((IncrementalSortState*)node); + break; + case T_MaterialState: ExecReSetMaterial((MaterialState*)node); break; diff --git a/src/gausskernel/runtime/executor/nodeSort.cpp b/src/gausskernel/runtime/executor/nodeSort.cpp index 6ce479f00bc05a0f985fb72581cdd7f687b5396a..1107d1b75ff3891271d99c4067dab34e86285d61 100644 --- a/src/gausskernel/runtime/executor/nodeSort.cpp +++ b/src/gausskernel/runtime/executor/nodeSort.cpp @@ -187,17 +187,14 @@ static TupleTableSlot* ExecSort(PlanState* state) /* Cache sort info into SortState for display of explain analyze */ if (node->ss.ps.instrument != NULL) { - tuplesort_get_stats(tuple_sortstate, &(node->sortMethodId), &(node->spaceTypeId), &(node->spaceUsed)); + tuplesort_get_stats(tuple_sortstate, &(node->sinstrument)); } if (HAS_INSTR(&node->ss, true)) { plan_state->instrument->width = (int)tuplesort_get_avgwidth(tuple_sortstate); 
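(Illustrative sketch only, not part of this hunk: with the reworked signature, a caller receives all three statistics through a single TuplesortInstrumentation and can decode the enum values with the helpers declared in tuplesort.h below; the elog message text here is hypothetical.)

static void
log_sort_stats(Tuplesortstate *sortstate)
{
    TuplesortInstrumentation stats;

    /* one call now fills sortMethod, spaceType and spaceUsed (kB) */
    tuplesort_get_stats(sortstate, &stats);

    elog(DEBUG1, "sort method: %s, space used: " INT64_FORMAT " kB (%s)",
         tuplesort_method_name(stats.sortMethod),
         stats.spaceUsed,
         tuplesort_space_type_name(stats.spaceType));
}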
plan_state->instrument->sysBusy = tuplesort_get_busy_status(tuple_sortstate); plan_state->instrument->spreadNum = tuplesort_get_spread_num(tuple_sortstate); - tuplesort_get_stats(tuple_sortstate, - &(plan_state->instrument->sorthashinfo.sortMethodId), - &(plan_state->instrument->sorthashinfo.spaceTypeId), - &(plan_state->instrument->sorthashinfo.spaceUsed)); + tuplesort_get_stats(tuple_sortstate, &(plan_state->instrument->sorthashinfo.sinstrument)); } SO1_printf("ExecSort: %s\n", "sorting done"); } diff --git a/src/gausskernel/runtime/vecexecutor/vecnode/vechashjoin.cpp b/src/gausskernel/runtime/vecexecutor/vecnode/vechashjoin.cpp index 4aec0d42b46fcf70cdfd4fe63debdfa3ff70473a..f2779d6f1f79944950dd0fdfe0fc91ea0657eb7a 100644 --- a/src/gausskernel/runtime/vecexecutor/vecnode/vechashjoin.cpp +++ b/src/gausskernel/runtime/vecexecutor/vecnode/vechashjoin.cpp @@ -892,7 +892,7 @@ void HashJoinTbl::Build() plan_state->instrument->sorthashinfo.hashbuild_time = m_build_time; if (MEMORY_HASH == m_strategy) { plan_state->instrument->sorthashinfo.hash_writefile = false; - plan_state->instrument->sorthashinfo.spaceUsed = ((AllocSetContext*)m_hashContext)->totalSpace; + plan_state->instrument->sorthashinfo.sinstrument.spaceUsed = ((AllocSetContext*)m_hashContext)->totalSpace; plan_state->instrument->sorthashinfo.hash_FileNum = 0; plan_state->instrument->sorthashinfo.hash_spillNum = 0; if (m_tupleCount > 0) diff --git a/src/gausskernel/runtime/vecexecutor/vecnode/vecsort.cpp b/src/gausskernel/runtime/vecexecutor/vecnode/vecsort.cpp index 9b65f78ec42795dd62666b0340e30bc1afc51987..00f2fba199b4b9932f3e042efe7d8d7c585e8eed 100644 --- a/src/gausskernel/runtime/vecexecutor/vecnode/vecsort.cpp +++ b/src/gausskernel/runtime/vecexecutor/vecnode/vecsort.cpp @@ -226,17 +226,14 @@ VectorBatch* ExecVecSort(VecSortState* node) /* Cache sort info into SortState for display of explain analyze */ if (node->ss.ps.instrument != NULL) { - batchsort_get_stats(batch_sort_stat, &(node->sortMethodId), &(node->spaceTypeId), &(node->spaceUsed)); + batchsort_get_stats(batch_sort_stat, &(node->sinstrument)); } if (HAS_INSTR(&node->ss, true)) { plan_stat->instrument->width = (int)batch_sort_stat->m_colWidth; plan_stat->instrument->sysBusy = batch_sort_stat->m_sysBusy; plan_stat->instrument->spreadNum = batch_sort_stat->m_spreadNum; - batchsort_get_stats(batch_sort_stat, - &(plan_stat->instrument->sorthashinfo.sortMethodId), - &(plan_stat->instrument->sorthashinfo.spaceTypeId), - &(plan_stat->instrument->sorthashinfo.spaceUsed)); + batchsort_get_stats(batch_sort_stat, &(plan_stat->instrument->sorthashinfo.sinstrument)); plan_stat->instrument->sorthashinfo.spill_size = total_size; } SO1_printf("ExecVecSort: %s\n", "sorting done"); diff --git a/src/gausskernel/runtime/vecexecutor/vectorsonic/vsonichashjoin.cpp b/src/gausskernel/runtime/vecexecutor/vectorsonic/vsonichashjoin.cpp index 7b2c3641e3e2e95dbc5b82a2cda6a681615746ea..525bb48fbb2f88c28427d23aa1b861bbbb1c37fe 100644 --- a/src/gausskernel/runtime/vecexecutor/vectorsonic/vsonichashjoin.cpp +++ b/src/gausskernel/runtime/vecexecutor/vectorsonic/vsonichashjoin.cpp @@ -557,7 +557,7 @@ void SonicHashJoin::Build() INSTR->sysBusy = m_memControl.sysBusy; INSTR->spreadNum = m_memControl.spreadNum; INSTR->sorthashinfo.hashbuild_time = m_build_time; - INSTR->sorthashinfo.spaceUsed = m_memControl.allocatedMem - m_memControl.availMem; + INSTR->sorthashinfo.sinstrument.spaceUsed = m_memControl.allocatedMem - m_memControl.availMem; } } diff --git a/src/include/executor/exec/execdebug.h 
b/src/include/executor/exec/execdebug.h index 82896388f08ac39b76923791a8c5c812668c2186..9e2a5da9e3b85c44e153671d5e6a3f0c22caf706 100644 --- a/src/include/executor/exec/execdebug.h +++ b/src/include/executor/exec/execdebug.h @@ -108,10 +108,12 @@ #define SO_nodeDisplay(l) nodeDisplay(l) #define SO_printf(s) printf(s) #define SO1_printf(s, p) printf(s, p) +#define SO2_printf(s, p1, p2) printf(s, p1, p2) #else #define SO_nodeDisplay(l) #define SO_printf(s) #define SO1_printf(s, p) +#define SO2_printf(s, p1, p2) #endif /* EXEC_SORTDEBUG */ /* ---------------- diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 098541a79fb459af39df254d59d032c0fc86daea..dad33e23d690de0fc0a57bff8db72ff8a148e450 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -83,27 +83,41 @@ typedef enum DFSType { TYPE_LOG_FT /* foreign table type, log data */ } DFSType; -#define SORT_IN_DISK 0x00000001 /* space type Disk */ -#define SORT_IN_MEMORY 0x00000002 /* space type Memory */ - -enum sortMethodId { - /* sort Method */ - HEAPSORT, - QUICKSORT, - EXTERNALSORT, - EXTERNALMERGE, - STILLINPROGRESS -}; - -struct sortMessage { - sortMethodId TypeId; - char* sortName; -}; +/* + * Data structures for reporting sort statistics. Note that + * TuplesortInstrumentation can't contain any pointers because we + * sometimes put it in shared memory. + * + * To indicate that sort never executed, we assign zero to + * SORT_TYPE_NONE(until parallel incremental sort is introduced). + * The other values of this enum can be + * OR'ed together to represent a situation where different workers used + * different methods, so we need a separate bit for each one. Keep the + * NUM_TUPLESORTMETHODS constant in sync with the number of bits! 
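(Illustrative sketch only, not part of this header: because each method occupies its own bit, the methods observed by different sort groups, or later by different workers, can be OR'ed into a single bits32 mask and still be reported individually.)

bits32 methods = SORT_TYPE_NONE;

methods |= SORT_TYPE_QUICKSORT;        /* one batch fit in memory */
methods |= SORT_TYPE_EXTERNAL_MERGE;   /* another batch spilled to disk */

/* both facts survive in the mask and can be tested independently */
Assert((methods & SORT_TYPE_QUICKSORT) != 0);
Assert((methods & SORT_TYPE_EXTERNAL_MERGE) != 0);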
+ */ +typedef enum { + SORT_TYPE_NONE = 0, + SORT_TYPE_TOP_N_HEAPSORT = 1 << 0, + SORT_TYPE_QUICKSORT = 1 << 1, + SORT_TYPE_EXTERNAL_SORT = 1 << 2, + SORT_TYPE_EXTERNAL_MERGE = 1 << 3 +} TuplesortMethod; + +#define NUM_TUPLESORTMETHODS 4 + +typedef enum { + SORT_SPACE_TYPE_DISK, + SORT_SPACE_TYPE_MEMORY +} TuplesortSpaceType; + +typedef struct TuplesortInstrumentation { + TuplesortMethod sortMethod; /* sort algorithm used */ + TuplesortSpaceType spaceType; /* type of space spaceUsed represents */ + int64 spaceUsed; /* space consumption, in kB */ +} TuplesortInstrumentation; typedef struct SortHashInfo { - int sortMethodId; - int spaceTypeId; - long spaceUsed; + TuplesortInstrumentation sinstrument; int nbatch; int nbatch_original; int nbuckets; diff --git a/src/include/executor/node/nodeIncrementalSort.h b/src/include/executor/node/nodeIncrementalSort.h new file mode 100644 index 0000000000000000000000000000000000000000..891cb3f52504b54da34bf7ee235a6da5a8443589 --- /dev/null +++ b/src/include/executor/node/nodeIncrementalSort.h @@ -0,0 +1,23 @@ +/*------------------------------------------------------------------------- + * + * nodeIncrementalSort.h + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/executor/nodeIncrementalSort.h + * + *------------------------------------------------------------------------- + */ +#ifndef NODEINCREMENTALSORT_H +#define NODEINCREMENTALSORT_H + +#include "nodes/execnodes.h" + +extern PlanState *ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags); +extern TupleTableSlot *ExecIncrementalSort(IncrementalSortState *pstate); +extern void ExecEndIncrementalSort(IncrementalSortState *node); +extern void ExecReScanIncrementalSort(IncrementalSortState *node); +extern void ExecReSetIncrementalSort(IncrementalSortState *node); + +#endif /* NODEINCREMENTALSORT_H */ diff --git a/src/include/knl/knl_guc/knl_session_attr_sql.h b/src/include/knl/knl_guc/knl_session_attr_sql.h index 182fa7cf3109aeaebe0eaf82be8051638b9cf22f..af06abfb44334d7f063c5a8b28cc51667f176cb6 100644 --- a/src/include/knl/knl_guc/knl_session_attr_sql.h +++ b/src/include/knl/knl_guc/knl_session_attr_sql.h @@ -270,6 +270,7 @@ typedef struct knl_session_attr_sql { bool enable_vector_targetlist; bool enable_default_local_index; #endif + bool enable_incremental_sort; } knl_session_attr_sql; #endif /* SRC_INCLUDE_KNL_KNL_SESSION_ATTR_SQL */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 4a51128dabfe150f07748afa94af26fdf3903c5a..e374b035ff6d5dcc54641b9258b9b3ca69d13876 100755 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -2552,6 +2552,19 @@ typedef struct MaterialState { Tuplestorestate* tuplestorestate; } MaterialState; +/* ---------------- + * When performing sorting by multiple keys, it's possible that the input + * dataset is already sorted on a prefix of those keys. We call these + * "presorted keys". + * PresortedKeyData represents information about one such key. 
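(Illustrative sketch only, not part of this header: assuming, as in the upstream PostgreSQL executor, that flinfo caches the equality function for the key's ordering operator, testing one presorted key between the group pivot and an incoming tuple could look like this; the helper name is hypothetical.)

static bool
presorted_key_matches(PresortedKeyData *key, TupleTableSlot *pivot, TupleTableSlot *tuple)
{
    bool isnullA;
    bool isnullB;
    Datum datumA = slot_getattr(pivot, key->attno, &isnullA);
    Datum datumB = slot_getattr(tuple, key->attno, &isnullB);

    /* treat NULL as matching NULL so a run of NULLs stays in one group */
    if (isnullA || isnullB)
        return isnullA == isnullB;

    return DatumGetBool(FunctionCall2(&key->flinfo, datumA, datumB));
}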
+ * ---------------- + */ +typedef struct PresortedKeyData { + FmgrInfo flinfo; /* comparison function info */ + FunctionCallInfo fcinfo; /* comparison function call info */ + OffsetNumber attno; /* attribute number in tuple */ +} PresortedKeyData; + /* ---------------- * SortState information * ---------------- @@ -2566,12 +2579,59 @@ typedef struct SortState { int64 bound_Done; /* value of bound we did the sort with */ void* tuplesortstate; /* private state of tuplesort.c */ int32 local_work_mem; /* work_mem local for this sort */ - int sortMethodId; /* sort method for explain */ - int spaceTypeId; /* space type for explain */ - long spaceUsed; /* space used for explain */ int64* space_size; /* spill size for temp table */ + TuplesortInstrumentation sinstrument; } SortState; +/* ---------------- + * Instrumentation information for IncrementalSort + * ---------------- + */ +typedef struct IncrementalSortGroupInfo { + int64 groupCount; + int64 maxDiskSpaceUsed; + int64 totalDiskSpaceUsed; + int64 maxMemorySpaceUsed; + int64 totalMemorySpaceUsed; + bits32 sortMethods; /* bitmask of TuplesortMethod */ +} IncrementalSortGroupInfo; + +typedef struct IncrementalSortInfo { + IncrementalSortGroupInfo fullsortGroupInfo; + IncrementalSortGroupInfo prefixsortGroupInfo; +} IncrementalSortInfo; + +/* ---------------- + * IncrementalSortState information + * ---------------- + */ +typedef enum { + INCSORT_LOADFULLSORT, + INCSORT_LOADPREFIXSORT, + INCSORT_READFULLSORT, + INCSORT_READPREFIXSORT, +} IncrementalSortExecutionStatus; + +typedef struct IncrementalSortState { + ScanState ss; /* its first field is NodeTag */ + bool bounded; /* is the result set bounded? */ + int64 bound; /* if bounded, how many tuples are needed */ + bool outerNodeDone; /* finished fetching tuples from outer node */ + int64 bound_Done; /* value of bound we did the sort with */ + IncrementalSortExecutionStatus execution_status; + int64 n_fullsort_remaining; + Tuplesortstate *fullsort_state; /* private state of tuplesort.cpp */ + Tuplesortstate *prefixsort_state; /* private state of tuplesort.cpp */ + + /* the keys by which the input path is already sorted */ + PresortedKeyData *presorted_keys; + IncrementalSortInfo incsort_info; + + /* slot for pivot tuple defining values of presorted keys within group */ + TupleTableSlot *group_pivot; + TupleTableSlot *transfer_tuple; +} IncrementalSortState; + struct SortGroupStatePriv; /* ---------------- * SortGroupState information diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 6f284726ea3d12118f9fe1f1e7f8209559082655..6688ec3675f1a2c0125a6a54e899c862b9206ed9 100755 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -86,6 +86,7 @@ typedef enum NodeTag { T_HashJoin, T_Material, T_Sort, + T_IncrementalSort, T_SortGroup, T_Group, T_Agg, @@ -206,6 +207,7 @@ typedef enum NodeTag { T_HashJoinState, T_MaterialState, T_SortState, + T_IncrementalSortState, T_SortGroupState, T_GroupState, T_AggState, diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 3c1b3958520dccd7c7785d074859da40880ef7a1..cb02a8b382a8108abc3e5620e3823ba01437f456 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -1235,6 +1235,15 @@ typedef struct Sort { OpMemInfo mem_info; /* Memory info for sort */ } Sort; +/* ---------------- + * incremental sort node + * ---------------- + */ +typedef struct IncrementalSort { + Sort sort; + int nPresortedCols; /* number of presorted columns */ +} IncrementalSort; + /* ---------------- * 
SortGroup node * ---------------- diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index a0aca6b10393931e71432c663993978669e84b21..b61e2907f1522d533af665c899c9f1d686aa35ae 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -108,6 +108,9 @@ extern void cost_remotequery(RemoteQueryPath* rqpath, PlannerInfo* root, RelOptI #endif extern void cost_ctescan(Path* path, PlannerInfo* root, RelOptInfo* baserel); extern void cost_recursive_union(Plan* runion, Plan* nrterm, Plan* rterm); +extern void cost_incremental_sort(Path *path, PlannerInfo *root, List *pathkeys, int presorted_keys, + Cost input_startup_cost, Cost input_total_cost, double input_tuples, int width, Cost comparison_cost, + int sort_mem, double limit_tuples); extern void cost_sort(Path* path, List* pathkeys, Cost input_cost, double tuples, int width, Cost comparison_cost, int sort_mem, double limit_tuples, bool col_store, int dop = 1, OpMemInfo* mem_info = NULL, bool index_sort = false); diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index e1fe18cdf7d29c3270b769a1a7a843563d71565f..74493768e4dce8e0bcec08fc339c191de326e8c8 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -160,6 +160,7 @@ extern void construct_pathkeys(PlannerInfo *root, List *tlist, List *activeWindo List *groupClause, bool canonical); extern PathKeysComparison compare_pathkeys(List* keys1, List* keys2); extern bool pathkeys_contained_in(List* keys1, List* keys2); +extern bool pathkeys_count_contained_in(List *keys1, List *keys2, int *n_common); extern Path* get_cheapest_path_for_pathkeys( List* paths, List* pathkeys, Relids required_outer, CostSelector cost_criterion); extern Path* get_cheapest_fractional_path_for_pathkeys( diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index e8223eae3b205350b4035a8406028cca536629c2..6b80e3bcc64be54359e4b259b4952bc769d25e67 100755 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -50,10 +50,9 @@ extern bool get_number_of_groups(PlannerInfo* root, RelOptInfo* final_rel, doubl extern void update_tuple_fraction(PlannerInfo* root, RelOptInfo* final_rel, double* numdistinct); -extern void generate_cheapest_and_sorted_path(PlannerInfo* root, RelOptInfo* final_rel, Path** cheapest_path, - Path** sorted_path, double* num_groups, bool has_groupby); - - +extern void generate_cheapest_and_sorted_path(PlannerInfo* root, RelOptInfo* final_rel, + Path** cheapest_path, Path** sorted_path, Path** partial_sorted_path, + double* num_groups, bool has_groupby); /* * prototypes for plan/planagg.c @@ -83,6 +82,13 @@ extern SortGroup* make_sort_group_from_groupcols(PlannerInfo* root, List* groupc extern Sort* make_sort_from_targetlist(PlannerInfo* root, Plan* lefttree, double limit_tuples); extern Sort* make_sort(PlannerInfo* root, Plan* lefttree, int numCols, AttrNumber* sortColIdx, Oid* sortOperators, Oid* collations, bool* nullsFirst, double limit_tuples); +extern IncrementalSort *make_incrementalsort_from_pathkeys(PlannerInfo* root, Plan *lefttree, List *pathkeys, + int nPresortedCols, double limit_tuples); +extern IncrementalSort* make_incrementalsort_from_groupcols( + PlannerInfo* root, List* groupcls, AttrNumber* grpColIdx, Plan* lefttree, List *pathkeys, int nPresortedCols); +extern IncrementalSort *make_incrementalsort(PlannerInfo* root, Plan *lefttree, List *pathkeys, int numCols, + int nPresortedCols, AttrNumber *sortColIdx, Oid *sortOperators, Oid *collations, bool 
*nullsFirst, + double limit_tuples); extern SortGroup* make_sortgroup(PlannerInfo* root, Plan* lefttree, int numCols, AttrNumber* sortColIdx, Oid* sortOperators, Oid* collations, bool* nullsFirst, double dNumGroup); extern Agg* make_agg(PlannerInfo* root, List* tlist, List* qual, AggStrategy aggstrategy, diff --git a/src/include/utils/batchsort.h b/src/include/utils/batchsort.h index 86600115a7af2921b837934f46548ce6f1034e43..a2144683cb30def855849feb535160b75a10799c 100644 --- a/src/include/utils/batchsort.h +++ b/src/include/utils/batchsort.h @@ -556,8 +556,6 @@ extern void batchsort_getbatch(Batchsortstate* state, bool forward, VectorBatch* extern void batchsort_end(Batchsortstate* state); -extern void batchsort_get_stats(Batchsortstate* state, int* sortMethodId, int* spaceTypeId, long* spaceUsed); - /* * These routines may only be called if randomAccess was specified 'true'. * Likewise, backwards scan in gettuple/getdatum is only allowed if @@ -569,6 +567,6 @@ extern void batchsort_rescan(Batchsortstate* state); extern void batchsort_markpos(Batchsortstate* state); extern void batchsort_restorepos(Batchsortstate* state); -extern void batchsort_get_stats(Batchsortstate* state, int* sortMethodId, int* spaceTypeId, long* spaceUsed); +extern void batchsort_get_stats(Batchsortstate *state, TuplesortInstrumentation *stats); #endif /* BATCHSORT_H */ diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h index 37f04e9d579c09ea0786c0fee18971f40cd79425..46119e9600e1d8938e08e75c4d1b6224677d5c38 100644 --- a/src/include/utils/memutils.h +++ b/src/include/utils/memutils.h @@ -95,6 +95,8 @@ extern THR_LOCAL PGDLLIMPORT MemoryContext ErrorContext; extern void MemoryContextInit(void); #define MemoryContextReset(context) \ (((MemoryContext)context)->mcxt_methods->mcxt_reset(context)) +#define MemoryContextResetOnly(context) \ + (((MemoryContext)context)->mcxt_methods->mcxt_reset_only(context)) #define MemoryContextDelete(context) \ (((MemoryContext)context)->mcxt_methods->mcxt_delete(context)) #define MemoryContextDeleteChildren(context, list) \ @@ -128,6 +130,7 @@ extern MemoryContext GetMemoryChunkContext(void* pointer); #endif extern void std_MemoryContextReset(MemoryContext context); +extern void std_MemoryContextResetOnly(MemoryContext context); extern void std_MemoryContextDelete(MemoryContext context); extern void std_MemoryContextDeleteChildren(MemoryContext context, List* context_list = NULL); extern void std_MemoryContextDestroyAtThreadExit(MemoryContext context); @@ -136,6 +139,7 @@ extern void std_MemoryContextSetParent(MemoryContext context, MemoryContext new_ extern void std_MemoryContextCheck(MemoryContext context, bool own_by_session); extern void opt_MemoryContextReset(MemoryContext context); +extern void opt_MemoryContextResetOnly(MemoryContext context); extern void opt_MemoryContextDelete(MemoryContext context); extern void opt_MemoryContextDeleteChildren(MemoryContext context, List* context_list = NULL); extern void opt_MemoryContextDestroyAtThreadExit(MemoryContext context); diff --git a/src/include/utils/palloc.h b/src/include/utils/palloc.h index 283dabf6c962a140d1b0b786daca3f70d395690b..588e5b1a331715e2992240a9378c835d93c24eba 100644 --- a/src/include/utils/palloc.h +++ b/src/include/utils/palloc.h @@ -92,6 +92,7 @@ typedef struct McxtAllocationMethods { */ typedef struct McxtOperationMethods { void (*mcxt_reset)(MemoryContext context); + void (*mcxt_reset_only)(MemoryContext context); void (*mcxt_delete)(MemoryContext context); void 
(*mcxt_delete_children)(MemoryContext context, List* context_list); void (*mcxt_destroy)(MemoryContext context); diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index 326ecfddbe8359d59d3e8cd7f4b196c084c53a55..79a3c52a4d447ba45bb3f1dac583ad424f7a4fb8 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -22,6 +22,7 @@ #include "access/itup.h" #include "executor/tuptable.h" +#include "executor/instrument.h" #include "fmgr.h" #include "utils/relcache.h" #include "utils/tuplestore.h" @@ -139,6 +140,7 @@ extern void tuplesort_remoteread_end(Tuplesortstate* state); #endif extern void tuplesort_set_bound(Tuplesortstate* state, int64 bound); +extern bool tuplesort_used_bound(Tuplesortstate *state); extern void tuplesort_set_siblings(Tuplesortstate* state, const int numKeys, const List *internalEntryList); extern void tuplesort_puttupleslot(Tuplesortstate* state, TupleTableSlot* slot); @@ -158,7 +160,11 @@ extern bool tuplesort_getdatum(Tuplesortstate* state, bool forward, Datum* val, extern void tuplesort_end(Tuplesortstate* state); -extern void tuplesort_get_stats(Tuplesortstate* state, int* sortMethodId, int* spaceTypeId, long* spaceUsed); +extern void tuplesort_reset(Tuplesortstate *state); + +extern void tuplesort_get_stats(Tuplesortstate* state, TuplesortInstrumentation *stats); +extern const char *tuplesort_method_name(TuplesortMethod m); +extern const char *tuplesort_space_type_name(TuplesortSpaceType t); extern int tuplesort_merge_order(double allowedMem); extern Size tuplesort_estimate_shared(int nworkers); diff --git a/src/test/regress/expected/a_outerjoin_conversion.out b/src/test/regress/expected/a_outerjoin_conversion.out index 6e7e21e42bbdfe213af948391e956c4adcdd2163..39837a79155ba6c88535c0a51a45a17c2fae2dc4 100644 --- a/src/test/regress/expected/a_outerjoin_conversion.out +++ b/src/test/regress/expected/a_outerjoin_conversion.out @@ -44,14 +44,18 @@ explain(costs off) select * from t11, t12 where t11.c1 = t12.c2(+) order by 1,2,3,4,5,6; QUERY PLAN ------------------------------------------------------------ - Sort + Incremental Sort Sort Key: t11.c1, t11.c2, t11.c3, t12.c1, t12.c2, t12.c3 - -> Hash Left Join - Hash Cond: (t11.c1 = t12.c2) - -> Seq Scan on t11 - -> Hash + Presorted Key: t11.c1 + -> Merge Left Join + Merge Cond: (t11.c1 = t12.c2) + -> Sort + Sort Key: t11.c1 + -> Seq Scan on t11 + -> Sort + Sort Key: t12.c2 -> Seq Scan on t12 -(7 rows) +(11 rows) select * from t11, t12 where t11.c1(+) = t12.c2 order by 1,2,3,4,5,6; c1 | c2 | c3 | c1 | c2 | c3 @@ -122,12 +126,13 @@ explain(costs off) select * from t11, t12 where t11.c1 = t12.c1(+) and t11.c1 in (select c3 from t13 where c1 > 8) order by 1,2,3,4,5,6; QUERY PLAN ------------------------------------------------------------ - Sort + Incremental Sort Sort Key: t11.c1, t11.c2, t11.c3, t12.c1, t12.c2, t12.c3 - -> Hash Right Join - Hash Cond: (t12.c1 = t11.c1) - -> Seq Scan on t12 - -> Hash + Presorted Key: t11.c1 + -> Merge Left Join + Merge Cond: (t11.c1 = t12.c1) + -> Sort + Sort Key: t11.c1 -> Hash Join Hash Cond: (t11.c1 = t13.c3) -> Seq Scan on t11 @@ -136,7 +141,10 @@ select * from t11, t12 where t11.c1 = t12.c1(+) and t11.c1 in (select c3 from t1 Group By Key: t13.c3 -> Seq Scan on t13 Filter: (c1 > 8) -(14 rows) + -> Sort + Sort Key: t12.c1 + -> Seq Scan on t12 +(18 rows) select * from t11, t12 where t11.c1 = t12.c1(+) and t12.c1 in (select c3 from t13 where c1 > 8) order by 1,2,3,4,5,6; c1 | c2 | c3 | c1 | c2 | c3 @@ -149,12 +157,13 @@ explain(costs off) select * 
from t11, t12 where t11.c1 = t12.c1(+) and t12.c1 in (select c3 from t13 where c1 > 8) order by 1,2,3,4,5,6; QUERY PLAN -------------------------------------------------------- - Sort + Incremental Sort Sort Key: t11.c1, t11.c2, t11.c3, t12.c2, t12.c3 - -> Hash Join - Hash Cond: (t11.c1 = t12.c1) - -> Seq Scan on t11 - -> Hash + Presorted Key: t11.c1 + -> Merge Join + Merge Cond: (t12.c1 = t11.c1) + -> Sort + Sort Key: t12.c1 -> Hash Join Hash Cond: (t12.c1 = t13.c3) -> Seq Scan on t12 @@ -163,7 +172,10 @@ select * from t11, t12 where t11.c1 = t12.c1(+) and t12.c1 in (select c3 from t1 Group By Key: t13.c3 -> Seq Scan on t13 Filter: (c1 > 8) -(14 rows) + -> Sort + Sort Key: t11.c1 + -> Seq Scan on t11 +(18 rows) /* multi-join-condition */ select * from t11, t12 where t11.c1(+) = t12.c1 and t11.c2(+) = t12.c2 order by 1,2,3,4,5,6; @@ -380,20 +392,25 @@ select * from t14, (select t12.c1 as dt_col1, t13.c2 as dt_col2 from t12,t13 whe explain(costs off) select * from t14, (select t12.c1 as dt_col1, t13.c2 as dt_col2 from t12,t13 where t12.c1 = t13.c1(+)) dt where t14.a4 = dt.dt_col1 order by 1,2,3,4,5; - QUERY PLAN --------------------------------------------- - Sort + QUERY PLAN +--------------------------------------------- + Incremental Sort Sort Key: t14.a4, t14.b4, t14.c4, t13.c2 - -> Hash Left Join - Hash Cond: (t12.c1 = t13.c1) - -> Hash Join - Hash Cond: (t14.a4 = t12.c1) - -> Seq Scan on t14 - -> Hash + Presorted Key: t14.a4 + -> Merge Left Join + Merge Cond: (t12.c1 = t13.c1) + -> Merge Join + Merge Cond: (t14.a4 = t12.c1) + -> Sort + Sort Key: t14.a4 + -> Seq Scan on t14 + -> Sort + Sort Key: t12.c1 -> Seq Scan on t12 - -> Hash + -> Sort + Sort Key: t13.c1 -> Seq Scan on t13 -(11 rows) +(16 rows) /* (+) used in subLink */ select t11.c1,t11.c2, t12.c1, t12.c2 @@ -427,12 +444,13 @@ where t11.c1 = t12.c1(+) and t11.c2 in ( ) order by 1,2,3,4; QUERY PLAN -------------------------------------------------------------------- - Sort + Incremental Sort Sort Key: t11.c1, t11.c2, t12.c1, t12.c2 - -> Hash Right Join - Hash Cond: (t12.c1 = t11.c1) - -> Seq Scan on t12 - -> Hash + Presorted Key: t11.c1 + -> Merge Left Join + Merge Cond: (t11.c1 = t12.c1) + -> Sort + Sort Key: t11.c1 -> Hash Join Hash Cond: (t11.c2 = t14.a4) -> Seq Scan on t11 @@ -444,7 +462,10 @@ where t11.c1 = t12.c1(+) and t11.c2 in ( -> Seq Scan on t14 -> Hash -> Seq Scan on t13 -(17 rows) + -> Sort + Sort Key: t12.c1 + -> Seq Scan on t12 +(21 rows) select t11.c1,t11.c2, t12.c1, t12.c2 from t11, t12 @@ -482,12 +503,13 @@ where t11.c1 = t12.c1(+) and t11.c2 in ( ) order by 1,2,3,4; QUERY PLAN -------------------------------------------------------------------- - Sort + Incremental Sort Sort Key: t11.c1, t11.c2, t12.c1, t12.c2 - -> Hash Right Join - Hash Cond: (t12.c1 = t11.c1) - -> Seq Scan on t12 - -> Hash + Presorted Key: t11.c1 + -> Merge Left Join + Merge Cond: (t11.c1 = t12.c1) + -> Sort + Sort Key: t11.c1 -> Hash Join Hash Cond: (t11.c2 = t14.a4) -> Seq Scan on t11 @@ -499,7 +521,10 @@ where t11.c1 = t12.c1(+) and t11.c2 in ( -> Seq Scan on t14 -> Hash -> Seq Scan on t13 -(17 rows) + -> Sort + Sort Key: t12.c1 + -> Seq Scan on t12 +(21 rows) select t11.c1,t11.c2, t12.c1, t12.c2 from t11, t12 @@ -532,12 +557,13 @@ where t11.c1 = t12.c1(+) and t11.c2 in ( ) order by 1,2,3,4; QUERY PLAN -------------------------------------------------------------------- - Sort + Incremental Sort Sort Key: t11.c1, t11.c2, t12.c1, t12.c2 - -> Hash Right Join - Hash Cond: (t12.c1 = t11.c1) - -> Seq Scan on t12 - -> Hash + Presorted Key: t11.c1 
+ -> Merge Left Join + Merge Cond: (t11.c1 = t12.c1) + -> Sort + Sort Key: t11.c1 -> Hash Join Hash Cond: (t11.c2 = t14.a4) -> Seq Scan on t11 @@ -549,7 +575,10 @@ where t11.c1 = t12.c1(+) and t11.c2 in ( -> Seq Scan on t13 -> Hash -> Seq Scan on t14 -(17 rows) + -> Sort + Sort Key: t12.c1 + -> Seq Scan on t12 +(21 rows) /* (+) used in subquery-RTE join with relation-RTE */ select * @@ -599,20 +628,24 @@ from t14, ( from t12,t13 where t12.c1 = t13.c1(+) ) dt where t14.a4 = dt.dt_col1(+) order by 1,2,3,4; - QUERY PLAN --------------------------------------------- - Sort + QUERY PLAN +-------------------------------------------------- + Incremental Sort Sort Key: t14.a4, t14.b4, t14.c4, t12.c1 - -> Hash Left Join - Hash Cond: (t12.c1 = t13.c1) - -> Hash Left Join - Hash Cond: (t14.a4 = t12.c1) + Presorted Key: t14.a4 + -> Merge Left Join + Merge Cond: (t14.a4 = t12.c1) + -> Sort + Sort Key: t14.a4 -> Seq Scan on t14 - -> Hash + -> Sort + Sort Key: t12.c1 + -> Hash Left Join + Hash Cond: (t12.c1 = t13.c1) -> Seq Scan on t12 - -> Hash - -> Seq Scan on t13 -(11 rows) + -> Hash + -> Seq Scan on t13 +(15 rows) select * from t14, ( @@ -1362,57 +1395,70 @@ select * from (select t11.c1 a from t11 left join t12 on t11.c1 = t12.c2) inner explain (verbose on, costs off, analyze on, timing off, cpu off) select * from (select t11.c1 a from t11 left join t12 on t11.c1 = t12.c2) inner join t13 on (a = c3) order by 1,2,3,4; - QUERY PLAN ---------------------------------------------------------------------------------- - Sort (actual rows=10 loops=1) + QUERY PLAN +--------------------------------------------------------------------------------------- + Incremental Sort (actual rows=10 loops=1) Output: t11.c1, t13.c1, t13.c2, t13.c3 Sort Key: t11.c1, t13.c1, t13.c2 - Sort Method: quicksort Memory: 26kB - -> Hash Join (actual rows=10 loops=1) + Presorted Key: t11.c1 + Full-sort Groups: 1 Sort Method: quicksort Average Memory: 26kB Peak Memory: 26kB + -> Merge Join (actual rows=10 loops=1) Output: t11.c1, t13.c1, t13.c2, t13.c3 - Hash Cond: (t11.c1 = t13.c3) - -> Hash Left Join (actual rows=15 loops=1) + Merge Cond: (t11.c1 = t13.c3) + -> Merge Left Join (actual rows=15 loops=1) Output: t11.c1 - Hash Cond: (t11.c1 = t12.c2) - -> Seq Scan on plus_outerjoin.t11 (actual rows=15 loops=1) - Output: t11.c1, t11.c2, t11.c3 - -> Hash (actual rows=15 loops=1) + Merge Cond: (t11.c1 = t12.c2) + -> Sort (actual rows=15 loops=1) + Output: t11.c1 + Sort Key: t11.c1 + Sort Method: quicksort Memory: 26kB + -> Seq Scan on plus_outerjoin.t11 (actual rows=15 loops=1) + Output: t11.c1 + -> Sort (actual rows=11 loops=1) Output: t12.c2 ---?.* + Sort Key: t12.c2 + Sort Method: quicksort Memory: 26kB -> Seq Scan on plus_outerjoin.t12 (actual rows=15 loops=1) Output: t12.c2 - -> Hash (actual rows=15 loops=1) + -> Sort (actual rows=11 loops=1) Output: t13.c1, t13.c2, t13.c3 ---?.* + Sort Key: t13.c3 + Sort Method: quicksort Memory: 26kB -> Seq Scan on plus_outerjoin.t13 (actual rows=15 loops=1) Output: t13.c1, t13.c2, t13.c3 --? 
Total runtime: .* ms -(23 rows) +(30 rows) explain(verbose on, costs off) select * from (select t11.c1 a from t11,t12 where t11.c1 = t12.c2(+)) inner join t13 on (a = c3) order by 1,2,3,4; QUERY PLAN -------------------------------------------------------- - Sort + Incremental Sort Output: t11.c1, t13.c1, t13.c2, t13.c3 Sort Key: t11.c1, t13.c1, t13.c2 - -> Hash Join + Presorted Key: t11.c1 + -> Merge Join Output: t11.c1, t13.c1, t13.c2, t13.c3 - Hash Cond: (t11.c1 = t13.c3) - -> Hash Left Join + Merge Cond: (t11.c1 = t13.c3) + -> Merge Left Join Output: t11.c1 - Hash Cond: (t11.c1 = t12.c2) - -> Seq Scan on plus_outerjoin.t11 - Output: t11.c1, t11.c2, t11.c3 - -> Hash + Merge Cond: (t11.c1 = t12.c2) + -> Sort + Output: t11.c1 + Sort Key: t11.c1 + -> Seq Scan on plus_outerjoin.t11 + Output: t11.c1 + -> Sort Output: t12.c2 + Sort Key: t12.c2 -> Seq Scan on plus_outerjoin.t12 Output: t12.c2 - -> Hash + -> Sort Output: t13.c1, t13.c2, t13.c3 + Sort Key: t13.c3 -> Seq Scan on plus_outerjoin.t13 Output: t13.c1, t13.c2, t13.c3 -(19 rows) +(25 rows) ----used in create view --------- create view plus_v as select t11.c1, t12.c2 from t11, t12 where t11.c1 = t12.c2(+) and t12.c2(+) in (1,2,3); diff --git a/src/test/regress/expected/alter_table_000.out b/src/test/regress/expected/alter_table_000.out index 0848412bf23a10a0ec040420d53240fd1aefc3ac..8cb66922d1541be99bc9de902a79afb65c7e28d8 100644 --- a/src/test/regress/expected/alter_table_000.out +++ b/src/test/regress/expected/alter_table_000.out @@ -568,19 +568,24 @@ explain (verbose true, costs false) insert into test_drop_column_1 select test_d Insert on public.test_drop_column_1 -> Subquery Scan on "*SELECT*" Output: "*SELECT*".a, "*SELECT*".a, NULL::integer - -> Sort + -> Incremental Sort Output: test_drop_column_2.a, test_drop_column_3.a Sort Key: test_drop_column_2.a, test_drop_column_3.a - -> Hash Join + Presorted Key: test_drop_column_2.a + -> Merge Join Output: test_drop_column_2.a, test_drop_column_3.a - Hash Cond: (test_drop_column_2.a = test_drop_column_3.b) - -> Seq Scan on public.test_drop_column_2 - Output: test_drop_column_2.a, test_drop_column_2.b - -> Hash + Merge Cond: (test_drop_column_2.a = test_drop_column_3.b) + -> Sort + Output: test_drop_column_2.a + Sort Key: test_drop_column_2.a + -> Seq Scan on public.test_drop_column_2 + Output: test_drop_column_2.a + -> Sort Output: test_drop_column_3.a, test_drop_column_3.b + Sort Key: test_drop_column_3.b -> Seq Scan on public.test_drop_column_3 Output: test_drop_column_3.a, test_drop_column_3.b -(15 rows) +(20 rows) insert into test_drop_column_1 select test_drop_column_2.a, test_drop_column_3.a from test_drop_column_2, test_drop_column_3 where test_drop_column_2.a = test_drop_column_3.b order by 1, 2; explain (verbose true, costs false) update test_drop_column_1 set a=test_drop_column_2.a from test_drop_column_2; diff --git a/src/test/regress/expected/bypass_preparedexecute_support.out b/src/test/regress/expected/bypass_preparedexecute_support.out index eb44cd9627612a29c640b842c9cd4f94528d5c80..b04b17bdfec7a6b65f26c59980a7e0e7914ff78c 100755 --- a/src/test/regress/expected/bypass_preparedexecute_support.out +++ b/src/test/regress/expected/bypass_preparedexecute_support.out @@ -810,13 +810,14 @@ execute p212; --not bypass prepare p213 as select * from test_bypass_sq2 order by col1,col2; explain execute p213; - QUERY PLAN ----------------------------------------------------------------------------------------------- + QUERY PLAN 
+-------------------------------------------------------------------------------------------------- [No Bypass]reason: Bypass not executed because query's scan operator is not index. - Sort (cost=1000000003267.94..1000000003273.31 rows=2149 width=8) + Incremental Sort (cost=0.59..148.16 rows=2149 width=8) Sort Key: col1, col2 - -> Seq Scan on test_bypass_sq2 (cost=10000000000.00..1000000003149.00 rows=2149 width=8) -(4 rows) + Presorted Key: col1 + -> Index Scan using itest_bypass_sq2 on test_bypass_sq2 (cost=0.00..80.49 rows=2149 width=8) +(5 rows) --bypass prepare p27 as select * from test_bypass_sq2 where col1 = $1 limit 1; diff --git a/src/test/regress/expected/bypass_simplequery_support.out b/src/test/regress/expected/bypass_simplequery_support.out index 14d07a32f1c19a9bbad0bef24852f6e1eb8708f3..f15eca81521759c55f4ccb4107519b1a19a89720 100644 --- a/src/test/regress/expected/bypass_simplequery_support.out +++ b/src/test/regress/expected/bypass_simplequery_support.out @@ -8,6 +8,7 @@ set opfusion_debug_mode = 'log'; set log_min_messages=debug; set logging_module = 'on(OPFUSION)'; set sql_beta_feature = 'index_cost_with_leaf_pages_only'; +set enable_incremental_sort = off; -- create table drop table if exists test_bypass_sq1; NOTICE: table "test_bypass_sq1" does not exist, skipping @@ -1356,12 +1357,12 @@ select col1 from test_bypass_sq2 order by col1; --not bypass explain select * from test_bypass_sq2 order by col1,col2; - QUERY PLAN ----------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------------- [No Bypass]reason: Bypass not executed because query's scan operator is not index. - Sort (cost=1000000003267.94..1000000003273.31 rows=2149 width=8) + Sort (cost=199.43..204.80 rows=2149 width=8) Sort Key: col1, col2 - -> Seq Scan on test_bypass_sq2 (cost=10000000000.00..1000000003149.00 rows=2149 width=8) + -> Index Scan using itest_bypass_sq2 on test_bypass_sq2 (cost=0.00..80.49 rows=2149 width=8) (4 rows) -- diff --git a/src/test/regress/expected/col_count_distinct_3.out b/src/test/regress/expected/col_count_distinct_3.out index 3f8f86590caf843e361ff3f3e35f3113839bb900..4ae220a4a54986393d21b47f031b96d8316f14b6 100644 --- a/src/test/regress/expected/col_count_distinct_3.out +++ b/src/test/regress/expected/col_count_distinct_3.out @@ -10,6 +10,7 @@ create table t_distinct(a int, b int, c int, d int) with (orientation=column); INSERT INTO t_distinct select generate_series(1, 1000)%501, generate_series(1, 1000)%75, generate_series(1, 1000)%25, generate_series(1, 1000)%7 from src; analyze t_distinct; -- Case 3 groupagg optimization +set enable_incremental_sort = off; set enable_hashagg=off; explain (costs off) select avg(distinct(a)) from t_distinct; QUERY PLAN @@ -143,6 +144,7 @@ select count(distinct(c)), d from t_distinct group by d having avg(distinct(c)) 25 | 6 (7 rows) +reset enable_incremental_sort; reset enable_hashagg; -- Case 4 two_level_hashagg explain (costs off) select count(distinct(b)), count(c), d from t_distinct group by d order by d; diff --git a/src/test/regress/expected/col_count_distinct_4.out b/src/test/regress/expected/col_count_distinct_4.out index 4ea718eb99d076664b9b086eb82bbfce43fd776d..18e4ee3603dea03b23cb2b24390cc383fae3b374 100755 --- a/src/test/regress/expected/col_count_distinct_4.out +++ b/src/test/regress/expected/col_count_distinct_4.out @@ -3,6 +3,7 @@ */ create schema col_distribute_count_distinct_4; set 
current_schema = col_distribute_count_distinct_4; +set enable_incremental_sort = false; -- Create Table and Insert Data create table src(c1 int); insert into src values(0); @@ -518,6 +519,7 @@ select count(distinct(c)) from (select a, ''::text as c from (select t1.a from t -- Clean Table drop table t_distinct; +reset enable_incremental_sort; reset current_schema; drop schema col_distribute_count_distinct_4 cascade; NOTICE: drop cascades to table col_distribute_count_distinct_4.src diff --git a/src/test/regress/expected/col_partition_iterator_elimination.out b/src/test/regress/expected/col_partition_iterator_elimination.out index 211e86b391657dbce26b2f1d4f595f17bb5076d9..ebd572a7e46a497fb347a48c15b215fe4f263325 100644 --- a/src/test/regress/expected/col_partition_iterator_elimination.out +++ b/src/test/regress/expected/col_partition_iterator_elimination.out @@ -81,6 +81,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; QUERY PLAN @@ -259,6 +260,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = on; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; QUERY PLAN @@ -437,6 +439,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = on; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; QUERY PLAN @@ -639,6 +642,7 @@ set enable_indexscan = on; set enable_bitmapscan = off; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; QUERY PLAN @@ -817,6 +821,7 @@ set enable_indexscan = on; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = on; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; QUERY PLAN @@ -995,6 +1000,7 @@ set enable_indexscan = on; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = on; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; QUERY PLAN @@ -1197,6 +1203,7 @@ set enable_indexscan = off; set enable_bitmapscan = on; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; QUERY PLAN @@ -1391,6 +1398,7 @@ set enable_indexscan = off; set enable_bitmapscan = on; set enable_nestloop = off; set enable_hashjoin = on; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; QUERY PLAN @@ -1585,6 +1593,7 @@ set enable_indexscan = off; set enable_bitmapscan = on; set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = on; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; QUERY PLAN @@ -1797,6 +1806,7 @@ select 
count(t1.b) from test_range_pt1 t1 join test_range_pt t2 on t1.b = t2.b w 2 (1 row) +RESET enable_incremental_sort; DROP SCHEMA col_partition_iterator_elimination CASCADE; NOTICE: drop cascades to 2 other objects DETAIL: drop cascades to table test_range_pt diff --git a/src/test/regress/expected/count_distinct_part2.out b/src/test/regress/expected/count_distinct_part2.out index 875f58c035a15729d7fa1484aa16a9979cf7a2e9..61ba40ce59e7a8c2f683a8fee82eef55bf71d3f7 100755 --- a/src/test/regress/expected/count_distinct_part2.out +++ b/src/test/regress/expected/count_distinct_part2.out @@ -287,14 +287,15 @@ select c from t_distinct group by c having avg(distinct(c))>50 order by 1 limit explain (costs off) select b, c from t_distinct group by b, c order by b, count(distinct(c))-c; QUERY PLAN ------------------------------------------ - Sort + Incremental Sort Sort Key: b, ((count(DISTINCT c) - c)) + Presorted Key: b -> GroupAggregate Group By Key: b, c -> Sort Sort Key: b, c -> Seq Scan on t_distinct -(7 rows) +(8 rows) select b, c from t_distinct group by b, c order by b, count(distinct(c))-c limit 10; b | c @@ -675,14 +676,15 @@ select count(distinct(a))+avg(b) col2, count(c) col3, d from t_distinct group by explain (costs off) select count(distinct(a)) col1, avg(b) col2, count(c) col3, d from t_distinct group by d order by d, avg(distinct(c)); QUERY PLAN ------------------------------------------ - Sort + Incremental Sort Sort Key: d, (avg(DISTINCT c)) + Presorted Key: d -> GroupAggregate Group By Key: d -> Sort Sort Key: d -> Seq Scan on t_distinct -(7 rows) +(8 rows) select count(distinct(a)) col1, avg(b) col2, count(c) col3, d from t_distinct group by d order by d, avg(distinct(c)); col1 | col2 | col3 | d @@ -699,14 +701,15 @@ select count(distinct(a)) col1, avg(b) col2, count(c) col3, d from t_distinct gr explain (costs off) select count(distinct(a)) col1, d, avg(b) col2, sum(distinct(a)) col3, avg(distinct(c)) col4 from t_distinct group by d order by d, avg(distinct(c)); QUERY PLAN ------------------------------------------ - Sort + Incremental Sort Sort Key: d, (avg(DISTINCT c)) + Presorted Key: d -> GroupAggregate Group By Key: d -> Sort Sort Key: d -> Seq Scan on t_distinct -(7 rows) +(8 rows) select count(distinct(a)) col1, d, avg(b) col2, sum(distinct(a)) col3, avg(distinct(c)) col4 from t_distinct group by d order by d, avg(distinct(c)); col1 | d | col2 | col3 | col4 @@ -745,15 +748,16 @@ select distinct case when min(distinct c)>60 then min(distinct c) else null end explain (costs off) select count(distinct(a)) col1, d, avg(b) col2, sum(distinct(a)) col3, avg(distinct(c)) col4 from t_distinct group by d having col1=1428 or d+col4>125 order by d, avg(distinct(c)); QUERY PLAN --------------------------------------------------------------------------------------------------- - Sort + Incremental Sort Sort Key: d, (avg(DISTINCT c)) + Presorted Key: d -> GroupAggregate Group By Key: d Filter: ((count(DISTINCT a) = 1428) OR (((d)::numeric + avg(DISTINCT c)) > 125::numeric)) -> Sort Sort Key: d -> Seq Scan on t_distinct -(8 rows) +(9 rows) select count(distinct(a)) col1, d, avg(b) col2, sum(distinct(a)) col3, avg(distinct(c)) col4 from t_distinct group by d having col1=1428 or d+col4>125 order by d, avg(distinct(c)); col1 | d | col2 | col3 | col4 diff --git a/src/test/regress/expected/count_distinct_part4.out b/src/test/regress/expected/count_distinct_part4.out index c4000dcd8f7ba4f30def4c057f46337680b8d3f6..7e201feb32cf500412e876c782194c188e0ecc10 100755 --- 
a/src/test/regress/expected/count_distinct_part4.out +++ b/src/test/regress/expected/count_distinct_part4.out @@ -54,8 +54,9 @@ ORDER BY 1 , 3; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ - Sort + Incremental Sort Sort Key: (((dt.web_site_sk)::numeric * date_dim_less.d_current_quarter)), (min(dt.web_rec_end_date)), (avg(DISTINCT (((dt.web_site_sk)::numeric * date_dim_less.d_current_quarter)))) + Presorted Key: (((dt.web_site_sk)::numeric * date_dim_less.d_current_quarter)) -> GroupAggregate Group By Key: (((dt.web_site_sk)::numeric * date_dim_less.d_current_quarter)), dt.web_site_sk Filter: (min(DISTINCT dt.web_rec_end_date) IS NOT NULL) @@ -81,7 +82,7 @@ ORDER BY 1 -> Hash -> Seq Scan on llvm_web_site Filter: (web_site_sk > 1) -(27 rows) +(28 rows) --full join explain (costs off) @@ -114,8 +115,9 @@ ORDER BY 1 , 3; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ - Sort + Incremental Sort Sort Key: (((dt.web_site_sk)::numeric * date_dim_less.d_current_quarter)), (min(dt.web_rec_end_date)), (avg(DISTINCT (((dt.web_site_sk)::numeric * date_dim_less.d_current_quarter)))) + Presorted Key: (((dt.web_site_sk)::numeric * date_dim_less.d_current_quarter)) -> GroupAggregate Group By Key: (((dt.web_site_sk)::numeric * date_dim_less.d_current_quarter)), dt.web_site_sk Filter: (min(DISTINCT dt.web_rec_end_date) IS NOT NULL) @@ -141,7 +143,7 @@ ORDER BY 1 -> Hash -> Seq Scan on llvm_web_site Filter: (web_site_sk > 1) -(27 rows) +(28 rows) --inner join explain (costs off) @@ -156,8 +158,9 @@ having dt.web_site_sk > 1 and min(distinct web_rec_end_date) is not null order by 1, 2; QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------------- - Sort + Incremental Sort Sort Key: (((llvm_web_site.web_site_sk)::numeric * date_dim_less.d_current_quarter)), (avg(DISTINCT llvm_web_site.web_site_sk)) + Presorted Key: (((llvm_web_site.web_site_sk)::numeric * date_dim_less.d_current_quarter)) -> GroupAggregate Group By Key: (((llvm_web_site.web_site_sk)::numeric * date_dim_less.d_current_quarter)), llvm_web_site.web_site_sk Filter: (min(DISTINCT llvm_web_site.web_rec_end_date) IS NOT NULL) @@ -170,7 +173,7 @@ order by 1, 2; -> Hash -> Seq Scan on date_dim_less Filter: ((substr((d_date_sk)::text, (-1)))::bigint > 1) -(14 rows) +(15 rows) --left join explain (costs off) @@ -185,8 +188,9 @@ having dt.web_site_sk > 1 and min(distinct web_rec_end_date) is not null order by 1, 2; QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------------- - Sort + Incremental Sort Sort Key: (((llvm_web_site.web_site_sk)::numeric * date_dim_less.d_current_quarter)), (avg(DISTINCT llvm_web_site.web_site_sk)) + Presorted Key: (((llvm_web_site.web_site_sk)::numeric * date_dim_less.d_current_quarter)) -> GroupAggregate Group By Key: (((llvm_web_site.web_site_sk)::numeric * date_dim_less.d_current_quarter)), llvm_web_site.web_site_sk Filter: (min(DISTINCT llvm_web_site.web_rec_end_date) IS NOT NULL) @@ -198,7 +202,7 @@ order by 1, 2; -> Hash -> Seq Scan on llvm_web_site Filter: (web_site_sk > 1) -(13 rows) +(14 rows) --right join explain (costs off) @@ -213,8 +217,9 @@ having 
dt.web_site_sk > 1 and min(distinct web_rec_end_date) is not null order by 1, 2; QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------------- - Sort + Incremental Sort Sort Key: (((llvm_web_site.web_site_sk)::numeric * date_dim_less.d_current_quarter)), (avg(DISTINCT llvm_web_site.web_site_sk)) + Presorted Key: (((llvm_web_site.web_site_sk)::numeric * date_dim_less.d_current_quarter)) -> GroupAggregate Group By Key: (((llvm_web_site.web_site_sk)::numeric * date_dim_less.d_current_quarter)), llvm_web_site.web_site_sk Filter: (min(DISTINCT llvm_web_site.web_rec_end_date) IS NOT NULL) @@ -227,7 +232,7 @@ order by 1, 2; -> Hash -> Seq Scan on date_dim_less Filter: ((substr((d_date_sk)::text, (-1)))::bigint > 1) -(14 rows) +(15 rows) explain (costs off) select dt.web_site_sk * d_current_quarter, @@ -241,8 +246,9 @@ having coalesce(dt.web_site_sk+1,2)>1 and min(distinct web_rec_end_date) is not order by 1, 2; QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------- - Sort + Incremental Sort Sort Key: (((llvm_web_site.web_site_sk)::numeric * date_dim_less.d_current_quarter)), (avg(DISTINCT llvm_web_site.web_site_sk)) + Presorted Key: (((llvm_web_site.web_site_sk)::numeric * date_dim_less.d_current_quarter)) -> GroupAggregate Group By Key: (((llvm_web_site.web_site_sk)::numeric * date_dim_less.d_current_quarter)), ((llvm_web_site.web_site_sk + 1)) Filter: (min(DISTINCT llvm_web_site.web_rec_end_date) IS NOT NULL) @@ -254,7 +260,7 @@ having coalesce(dt.web_site_sk+1,2)>1 and min(distinct web_rec_end_date) is not -> Seq Scan on llvm_web_site -> Hash -> Seq Scan on date_dim_less -(13 rows) +(14 rows) explain (costs off) select dt.web_site_sk * d_current_quarter, @@ -268,8 +274,9 @@ having coalesce(dt.web_site_sk+web_site_sk,2)>1 and min(distinct web_rec_end_dat order by 1, 2; QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------------------------------- - Sort + Incremental Sort Sort Key: (((llvm_web_site.web_site_sk)::numeric * date_dim_less.d_current_quarter)), (avg(DISTINCT llvm_web_site.web_site_sk)) + Presorted Key: (((llvm_web_site.web_site_sk)::numeric * date_dim_less.d_current_quarter)) -> GroupAggregate Group By Key: (((llvm_web_site.web_site_sk)::numeric * date_dim_less.d_current_quarter)), ((llvm_web_site.web_site_sk + llvm_web_site.web_site_sk)) Filter: (min(DISTINCT llvm_web_site.web_rec_end_date) IS NOT NULL) @@ -281,7 +288,7 @@ having coalesce(dt.web_site_sk+web_site_sk,2)>1 and min(distinct web_rec_end_dat -> Seq Scan on llvm_web_site -> Hash -> Seq Scan on date_dim_less -(13 rows) +(14 rows) set explain_perf_mode=pretty; CREATE TABLE m_inte_counter_detail ( diff --git a/src/test/regress/expected/gpi_index_only.out b/src/test/regress/expected/gpi_index_only.out index 1c4416d1bcb8dfeee70d9dcb820014594b3625e0..2ae09b8f2951fca6950896907fe497256bf7d016 100644 --- a/src/test/regress/expected/gpi_index_only.out +++ b/src/test/regress/expected/gpi_index_only.out @@ -266,6 +266,7 @@ create index gpi_J2_TBL_nonp_k_index on gpi_J2_TBL(k) global; create index gpi_J2_TBL_nonp_t_index on gpi_J2_TBL(t) global; set enable_bitmapscan=off; set enable_seqscan=off; +set enable_material=off; vacuum analyze gpi_J1_TBL; vacuum analyze gpi_J2_TBL; explain (costs false) SELECT distinct(t1.b), t2.e @@ -404,13 +405,12 @@ explain (costs false) SELECT 
distinct(j),a.k,b.k Group By Key: gpi_j1_tbl.j, a.k, b.k -> Nested Loop -> Nested Loop - -> Index Only Scan using gpi_j1_tbl_nonp_j_index on gpi_j1_tbl - Index Cond: (j = 5) -> Index Only Scan using gpi_j2_tbl_nonp_k_index on gpi_j2_tbl a - Index Cond: (k < gpi_j1_tbl.j) + -> Index Only Scan using gpi_j1_tbl_nonp_j_index on gpi_j1_tbl + Index Cond: ((j > a.k) AND (j = 5)) -> Index Only Scan using gpi_j2_tbl_nonp_k_index on gpi_j2_tbl b Index Cond: (k < a.k) -(12 rows) +(11 rows) SELECT distinct(j),a.k,b.k FROM gpi_J1_TBL CROSS JOIN gpi_J2_TBL a CROSS JOIN gpi_J2_TBL b @@ -562,13 +562,14 @@ explain (costs false) SELECT distinct(j),k QUERY PLAN ------------------------------------------------------------------------------- Unique - -> Sort + -> Incremental Sort Sort Key: gpi_j1_tbl.j, gpi_j2_tbl.k - -> Merge Right Join - Merge Cond: (gpi_j2_tbl.k = gpi_j1_tbl.j) - -> Index Only Scan using gpi_j2_tbl_nonp_k_index on gpi_j2_tbl + Presorted Key: gpi_j1_tbl.j + -> Merge Left Join + Merge Cond: (gpi_j1_tbl.j = gpi_j2_tbl.k) -> Index Only Scan using gpi_j1_tbl_nonp_j_index on gpi_j1_tbl -(7 rows) + -> Index Only Scan using gpi_j2_tbl_nonp_k_index on gpi_j2_tbl +(8 rows) SELECT distinct(j),k FROM gpi_J1_TBL LEFT OUTER JOIN gpi_J2_TBL ON (gpi_J1_TBL.j = gpi_J2_TBL.k) @@ -652,6 +653,7 @@ SELECT distinct(j),k | 34 (9 rows) +reset enable_material; drop table if exists gpi_J1_TBL; drop table if exists gpi_J2_TBL; set client_min_messages=notice; diff --git a/src/test/regress/expected/hw_subpartition_gpi.out b/src/test/regress/expected/hw_subpartition_gpi.out index d6b5b9b2f5c2becbd88830b60c4444258d3fca1f..a32435b8540fe3d5ca61ec22f44797bda30814fb 100644 --- a/src/test/regress/expected/hw_subpartition_gpi.out +++ b/src/test/regress/expected/hw_subpartition_gpi.out @@ -481,15 +481,15 @@ select * from range_list where month_code = '201902' order by 1, 2, 3, 4; (0 rows) explain(costs off, verbose on) select * from range_list where dept_code = '1' order by 1, 2, 3, 4; - QUERY PLAN ------------------------------------------------------------------------------ + QUERY PLAN +---------------------------------------------------------------------------------------------- Sort Output: month_code, dept_code, user_no, sales_amt Sort Key: range_list.month_code, range_list.user_no, range_list.sales_amt -> Partition Iterator Output: month_code, dept_code, user_no, sales_amt Iterations: 2, Sub Iterations: 2 - -> Partitioned Seq Scan on subpartition_gpi.range_list + -> Partitioned Index Scan using idx_month_code_local on subpartition_gpi.range_list Output: month_code, dept_code, user_no, sales_amt Filter: ((range_list.dept_code)::text = '1'::text) Selected Partitions: 1..2 @@ -503,15 +503,15 @@ select * from range_list where dept_code = '1' order by 1, 2, 3, 4; (1 row) explain(costs off, verbose on) select * from range_list where user_no = '1' order by 1, 2, 3, 4; - QUERY PLAN -------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------------- Sort Output: month_code, dept_code, user_no, sales_amt Sort Key: range_list.month_code, range_list.dept_code, range_list.sales_amt -> Partition Iterator Output: month_code, dept_code, user_no, sales_amt Iterations: 2, Sub Iterations: 4 - -> Partitioned Seq Scan on subpartition_gpi.range_list + -> Partitioned Index Scan using idx_month_code_local on subpartition_gpi.range_list Output: month_code, dept_code, user_no, sales_amt Filter: ((range_list.user_no)::text = 
'1'::text) Selected Partitions: 1..2 @@ -752,15 +752,15 @@ select * from range_list where month_code = '201902' order by 1, 2, 3, 4; (3 rows) explain(costs off, verbose on) select * from range_list where dept_code = '1' order by 1, 2, 3, 4; - QUERY PLAN ------------------------------------------------------------------------------ + QUERY PLAN +---------------------------------------------------------------------------------------------- Sort Output: month_code, dept_code, user_no, sales_amt Sort Key: range_list.month_code, range_list.user_no, range_list.sales_amt -> Partition Iterator Output: month_code, dept_code, user_no, sales_amt Iterations: 2, Sub Iterations: 2 - -> Partitioned Seq Scan on subpartition_gpi.range_list + -> Partitioned Index Scan using idx_month_code_local on subpartition_gpi.range_list Output: month_code, dept_code, user_no, sales_amt Filter: ((range_list.dept_code)::text = '1'::text) Selected Partitions: 1..2 @@ -776,15 +776,15 @@ select * from range_list where dept_code = '1' order by 1, 2, 3, 4; (3 rows) explain(costs off, verbose on) select * from range_list where user_no = '1' order by 1, 2, 3, 4; - QUERY PLAN -------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------------- Sort Output: month_code, dept_code, user_no, sales_amt Sort Key: range_list.month_code, range_list.dept_code, range_list.sales_amt -> Partition Iterator Output: month_code, dept_code, user_no, sales_amt Iterations: 2, Sub Iterations: 6 - -> Partitioned Seq Scan on subpartition_gpi.range_list + -> Partitioned Index Scan using idx_month_code_local on subpartition_gpi.range_list Output: month_code, dept_code, user_no, sales_amt Filter: ((range_list.user_no)::text = '1'::text) Selected Partitions: 1..2 @@ -1027,15 +1027,15 @@ select * from range_range where month_code = '201902' order by 1, 2, 3, 4; (3 rows) explain(costs off, verbose on) select * from range_range where dept_code = '1' order by 1, 2, 3, 4; - QUERY PLAN --------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------------------- Sort Output: month_code, dept_code, user_no, sales_amt Sort Key: range_range.month_code, range_range.user_no, range_range.sales_amt -> Partition Iterator Output: month_code, dept_code, user_no, sales_amt Iterations: 2, Sub Iterations: 2 - -> Partitioned Seq Scan on subpartition_gpi.range_range + -> Partitioned Index Scan using idx_month_code_local on subpartition_gpi.range_range Output: month_code, dept_code, user_no, sales_amt Filter: ((range_range.dept_code)::text = '1'::text) Selected Partitions: 1..2 @@ -1050,15 +1050,15 @@ select * from range_range where dept_code = '1' order by 1, 2, 3, 4; (2 rows) explain(costs off, verbose on) select * from range_range where user_no = '1' order by 1, 2, 3, 4; - QUERY PLAN ----------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------------------- Sort Output: month_code, dept_code, user_no, sales_amt Sort Key: range_range.month_code, range_range.dept_code, range_range.sales_amt -> Partition Iterator Output: month_code, dept_code, user_no, sales_amt Iterations: 2, Sub Iterations: 6 - -> Partitioned Seq Scan on subpartition_gpi.range_range + -> Partitioned Index Scan using idx_month_code_local on 
subpartition_gpi.range_range Output: month_code, dept_code, user_no, sales_amt Filter: ((range_range.user_no)::text = '1'::text) Selected Partitions: 1..2 diff --git a/src/test/regress/expected/llvm_vecsort2.out b/src/test/regress/expected/llvm_vecsort2.out index 3c17f4426b7ccf3ba2feb5beef6f93e6fb523cf3..423da7f3599725e1b67664f449f525e85cb7ddda 100644 --- a/src/test/regress/expected/llvm_vecsort2.out +++ b/src/test/regress/expected/llvm_vecsort2.out @@ -204,6 +204,7 @@ select sum(col_num1), col_vchar, col_text from llvm_vecsort_table_04 group by ro ---- set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; select A.col_text, B.col_text, sum(A.col_num1) from llvm_vecsort_table_04 A inner join llvm_vecsort_table_04 B on A.col_text = B.col_text group by rollup(1, 2), 1, 2 order by 1, 2, 3; col_text | col_text | sum -----------------------------------+-----------------------------------+------------------------------ @@ -263,6 +264,7 @@ select A.col_text, B.col_text, sum(A.col_num1) from llvm_vecsort_table_04 A inne sheissobeautiful | sheissobeautiful | 0 (54 rows) +reset enable_incremental_sort; reset enable_nestloop; reset enable_hashjoin; ---- diff --git a/src/test/regress/expected/recursive_ref_recursive.out b/src/test/regress/expected/recursive_ref_recursive.out index d81f6a84945b4e0fe721f33d603f376cffacacff..dfd9906ed2925e635a5b0467c81f8b9589467ecf 100644 --- a/src/test/regress/expected/recursive_ref_recursive.out +++ b/src/test/regress/expected/recursive_ref_recursive.out @@ -2125,8 +2125,9 @@ WITH RECURSIVE cte AS select * from cte t1 join cte t2 on t1.id = t2.id order by 1, 2; QUERY PLAN ------------------------------------------------------------------------- - Sort + Incremental Sort Sort Key: t1.id, t1.parentid + Presorted Key: t1.id CTE cte -> Recursive Union -> Seq Scan on rec_tb1 @@ -2145,12 +2146,15 @@ select * from cte t1 join cte t2 on t1.id = t2.id order by 1, 2; Iterations: 4 -> Partitioned Seq Scan on rec_tb4 a Selected Partitions: 1..4 - -> Hash Join - Hash Cond: (t1.id = t2.id) - -> CTE Scan on cte t1 - -> Hash + -> Merge Join + Merge Cond: (t1.id = t2.id) + -> Sort + Sort Key: t1.id + -> CTE Scan on cte t1 + -> Sort + Sort Key: t2.id -> CTE Scan on cte t2 -(25 rows) +(29 rows) WITH RECURSIVE cte AS ( @@ -2627,11 +2631,12 @@ with recursive tmp as ( on tmp.parentid = rec_tb2.id ), tmp2 AS (select id, parentid, name, substr(name, 5) name1 from tmp) select * from tmp,tmp2 where tmp.parentid = tmp2.parentid and tmp2.id not in (select parentid from tmp2) order by tmp.parentid,1,2,3,4,5,6,7,8; - QUERY PLAN ------------------------------------------------------------------------------------------------------------ - Sort + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Incremental Sort Output: tmp.id, tmp.parentid, tmp.name, tmp.substr, tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 Sort Key: tmp.parentid, tmp.id, tmp.name, tmp.substr, tmp2.id, tmp2.name, tmp2.name1 + Presorted Key: tmp.parentid CTE tmp -> Recursive Union -> Partition Iterator @@ -2657,21 +2662,25 @@ with recursive tmp as ( CTE tmp2 -> CTE Scan on tmp Output: tmp.id, tmp.parentid, tmp.name, substr((tmp.name)::text, 5) - -> Hash Join + -> Merge Join Output: tmp.id, tmp.parentid, tmp.name, tmp.substr, tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 - Hash Cond: (tmp2.parentid = tmp.parentid) - -> Nested Loop Anti Join + Merge Cond: (tmp2.parentid = tmp.parentid) + -> Sort Output: tmp2.id, tmp2.parentid, 
tmp2.name, tmp2.name1 - Join Filter: ((tmp2.id = tmp2.parentid) OR (tmp2.id IS NULL) OR (tmp2.parentid IS NULL)) - -> CTE Scan on tmp2 - Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 - -> CTE Scan on tmp2 + Sort Key: tmp2.parentid + -> Nested Loop Anti Join Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 - -> Hash + Join Filter: ((tmp2.id = tmp2.parentid) OR (tmp2.id IS NULL) OR (tmp2.parentid IS NULL)) + -> CTE Scan on tmp2 + Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 + -> CTE Scan on tmp2 + Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 + -> Sort Output: tmp.id, tmp.parentid, tmp.name, tmp.substr + Sort Key: tmp.parentid -> CTE Scan on tmp Output: tmp.id, tmp.parentid, tmp.name, tmp.substr -(42 rows) +(47 rows) with recursive tmp as ( select id, parentid, name, substr(name, 5) @@ -2701,11 +2710,12 @@ with recursive tmp as ( on tmp.parentid = rec_tb2.id ), tmp2 AS (select id, parentid, name, substr(name, 5) name1 from tmp) select * from tmp,tmp2 where tmp.parentid = tmp2.parentid and tmp2.id not in (select parentid from tmp2) order by tmp.parentid,1,2,3,4,5,6,7,8; - QUERY PLAN ------------------------------------------------------------------------------------------------------------ - Sort + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Incremental Sort Output: tmp.id, tmp.parentid, tmp.name, tmp.substr, tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 Sort Key: tmp.parentid, tmp.id, tmp.name, tmp.substr, tmp2.id, tmp2.name, tmp2.name1 + Presorted Key: tmp.parentid CTE tmp -> Recursive Union -> Partition Iterator @@ -2731,21 +2741,25 @@ with recursive tmp as ( CTE tmp2 -> CTE Scan on tmp Output: tmp.id, tmp.parentid, tmp.name, substr((tmp.name)::text, 5) - -> Hash Join + -> Merge Join Output: tmp.id, tmp.parentid, tmp.name, tmp.substr, tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 - Hash Cond: (tmp2.parentid = tmp.parentid) - -> Nested Loop Anti Join + Merge Cond: (tmp2.parentid = tmp.parentid) + -> Sort Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 - Join Filter: ((tmp2.id = tmp2.parentid) OR (tmp2.id IS NULL) OR (tmp2.parentid IS NULL)) - -> CTE Scan on tmp2 + Sort Key: tmp2.parentid + -> Nested Loop Anti Join Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 - -> CTE Scan on tmp2 - Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 - -> Hash + Join Filter: ((tmp2.id = tmp2.parentid) OR (tmp2.id IS NULL) OR (tmp2.parentid IS NULL)) + -> CTE Scan on tmp2 + Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 + -> CTE Scan on tmp2 + Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 + -> Sort Output: tmp.id, tmp.parentid, tmp.name, tmp.substr + Sort Key: tmp.parentid -> CTE Scan on tmp Output: tmp.id, tmp.parentid, tmp.name, tmp.substr -(42 rows) +(47 rows) with recursive tmp as ( select id, parentid, name, substr(name, 5) @@ -2815,11 +2829,12 @@ with recursive tmp as ( on tmp.parentid = rec_tb2.id ), tmp2 AS (select id, parentid, name, substr(name, 5) name1 from tmp) select * from tmp,tmp2 where tmp.parentid = tmp2.parentid and tmp2.id not in (select parentid from tmp2) order by tmp.parentid,1,2,3,4,5,6,7,8; - QUERY PLAN ------------------------------------------------------------------------------------------------------------ - Sort + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Incremental Sort Output: tmp.id, tmp.parentid, tmp.name, tmp.substr, tmp2.id, tmp2.parentid, tmp2.name, 
tmp2.name1 Sort Key: tmp.parentid, tmp.id, tmp.name, tmp.substr, tmp2.id, tmp2.name, tmp2.name1 + Presorted Key: tmp.parentid CTE tmp -> Recursive Union -> Seq Scan on recursive_ref_recursive.rec_tb1 @@ -2837,21 +2852,25 @@ with recursive tmp as ( CTE tmp2 -> CTE Scan on tmp Output: tmp.id, tmp.parentid, tmp.name, substr((tmp.name)::text, 5) - -> Hash Join + -> Merge Join Output: tmp.id, tmp.parentid, tmp.name, tmp.substr, tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 - Hash Cond: (tmp2.parentid = tmp.parentid) - -> Nested Loop Anti Join + Merge Cond: (tmp2.parentid = tmp.parentid) + -> Sort Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 - Join Filter: ((tmp2.id = tmp2.parentid) OR (tmp2.id IS NULL) OR (tmp2.parentid IS NULL)) - -> CTE Scan on tmp2 - Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 - -> CTE Scan on tmp2 + Sort Key: tmp2.parentid + -> Nested Loop Anti Join Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 - -> Hash + Join Filter: ((tmp2.id = tmp2.parentid) OR (tmp2.id IS NULL) OR (tmp2.parentid IS NULL)) + -> CTE Scan on tmp2 + Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 + -> CTE Scan on tmp2 + Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 + -> Sort Output: tmp.id, tmp.parentid, tmp.name, tmp.substr + Sort Key: tmp.parentid -> CTE Scan on tmp Output: tmp.id, tmp.parentid, tmp.name, tmp.substr -(34 rows) +(39 rows) with recursive tmp as ( select id, parentid, name, substr(name, 5) @@ -2881,11 +2900,12 @@ with recursive tmp as ( on tmp.parentid = rec_tb2.id ), tmp2 AS (select id, parentid, name, substr(name, 5) name1 from tmp) select * from tmp,tmp2 where tmp.parentid = tmp2.parentid and tmp2.id not in (select parentid from tmp2) order by tmp.parentid,1,2,3,4,5,6,7,8; - QUERY PLAN ------------------------------------------------------------------------------------------------------------ - Sort + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Incremental Sort Output: tmp.id, tmp.parentid, tmp.name, tmp.substr, tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 Sort Key: tmp.parentid, tmp.id, tmp.name, tmp.substr, tmp2.id, tmp2.name, tmp2.name1 + Presorted Key: tmp.parentid CTE tmp -> Recursive Union -> Seq Scan on recursive_ref_recursive.rec_tb1 @@ -2903,21 +2923,25 @@ with recursive tmp as ( CTE tmp2 -> CTE Scan on tmp Output: tmp.id, tmp.parentid, tmp.name, substr((tmp.name)::text, 5) - -> Hash Join + -> Merge Join Output: tmp.id, tmp.parentid, tmp.name, tmp.substr, tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 - Hash Cond: (tmp2.parentid = tmp.parentid) - -> Nested Loop Anti Join + Merge Cond: (tmp2.parentid = tmp.parentid) + -> Sort Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 - Join Filter: ((tmp2.id = tmp2.parentid) OR (tmp2.id IS NULL) OR (tmp2.parentid IS NULL)) - -> CTE Scan on tmp2 + Sort Key: tmp2.parentid + -> Nested Loop Anti Join Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 - -> CTE Scan on tmp2 - Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 - -> Hash + Join Filter: ((tmp2.id = tmp2.parentid) OR (tmp2.id IS NULL) OR (tmp2.parentid IS NULL)) + -> CTE Scan on tmp2 + Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 + -> CTE Scan on tmp2 + Output: tmp2.id, tmp2.parentid, tmp2.name, tmp2.name1 + -> Sort Output: tmp.id, tmp.parentid, tmp.name, tmp.substr + Sort Key: tmp.parentid -> CTE Scan on tmp Output: tmp.id, tmp.parentid, tmp.name, tmp.substr -(34 rows) +(39 rows) with recursive tmp as ( select id, parentid, name, substr(name, 
5) diff --git a/src/test/regress/expected/row_partition_iterator_elimination.out b/src/test/regress/expected/row_partition_iterator_elimination.out index ffe6bf18e233bc11e078bff0e542ffde90d3fea4..aeba96becf29777d88fe9f6e86793a3ccaa91d4d 100644 --- a/src/test/regress/expected/row_partition_iterator_elimination.out +++ b/src/test/regress/expected/row_partition_iterator_elimination.out @@ -116,6 +116,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; QUERY PLAN @@ -333,6 +334,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = on; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; QUERY PLAN @@ -558,6 +560,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = on; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; QUERY PLAN @@ -799,6 +802,7 @@ set enable_indexscan = on; set enable_bitmapscan = off; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; QUERY PLAN @@ -1016,6 +1020,7 @@ set enable_indexscan = on; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = on; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; QUERY PLAN @@ -1241,6 +1246,7 @@ set enable_indexscan = on; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = on; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; QUERY PLAN @@ -1482,6 +1488,7 @@ set enable_indexscan = off; set enable_bitmapscan = on; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; QUERY PLAN @@ -1735,6 +1742,7 @@ set enable_indexscan = off; set enable_bitmapscan = on; set enable_nestloop = off; set enable_hashjoin = on; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; QUERY PLAN @@ -1996,6 +2004,7 @@ set enable_indexscan = off; set enable_bitmapscan = on; set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = on; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; QUERY PLAN @@ -2273,6 +2282,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; set try_vector_engine_strategy = force; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; @@ -2539,6 +2549,7 @@ set enable_indexscan = on; set enable_bitmapscan = off; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = 
off; set try_vector_engine_strategy = force; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; @@ -2805,6 +2816,7 @@ set enable_indexscan = off; set enable_bitmapscan = on; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; set try_vector_engine_strategy = force; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; @@ -3123,6 +3135,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 and ctid = (select ctid from test_hash_ht where a = 30 limit 1); QUERY PLAN @@ -3190,6 +3203,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = on; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 and ctid = (select ctid from test_hash_ht where a = 30 limit 1); QUERY PLAN @@ -3257,6 +3271,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = on; explain(costs off, verbose on) select * from test_hash_ht where a = 30 and ctid = (select ctid from test_hash_ht where a = 30 limit 1); QUERY PLAN @@ -3318,6 +3333,7 @@ select * from (select * from test_hash_ht where a = 5 and ctid = (select ctid fr 5 | 6 | | | 5 | 6 | | (1 row) +RESET enable_incremental_sort; DROP SCHEMA row_partition_iterator_elimination CASCADE; NOTICE: drop cascades to 2 other objects DETAIL: drop cascades to table test_hash_ht diff --git a/src/test/regress/expected/segment_subpartition_gpi.out b/src/test/regress/expected/segment_subpartition_gpi.out index 585afef3c34502240c115f7cd77a083bd1d5984a..0d46c18d8dd29034e5d5d4693569c73767382b7b 100644 --- a/src/test/regress/expected/segment_subpartition_gpi.out +++ b/src/test/regress/expected/segment_subpartition_gpi.out @@ -481,15 +481,15 @@ select * from range_list where month_code = '201902' order by 1, 2, 3, 4; (0 rows) explain(costs off, verbose on) select * from range_list where dept_code = '1' order by 1, 2, 3, 4; - QUERY PLAN ------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------------------------------ Sort Output: month_code, dept_code, user_no, sales_amt Sort Key: range_list.month_code, range_list.user_no, range_list.sales_amt -> Partition Iterator Output: month_code, dept_code, user_no, sales_amt Iterations: 2, Sub Iterations: 2 - -> Partitioned Seq Scan on segment_subpartition_gpi.range_list + -> Partitioned Index Scan using idx_month_code_local on segment_subpartition_gpi.range_list Output: month_code, dept_code, user_no, sales_amt Filter: ((range_list.dept_code)::text = '1'::text) Selected Partitions: 1..2 @@ -503,15 +503,15 @@ select * from range_list where dept_code = '1' order by 1, 2, 3, 4; (1 row) explain(costs off, verbose on) select * from range_list where user_no = '1' order by 1, 2, 3, 4; - QUERY PLAN -------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------ Sort Output: month_code, dept_code, user_no, sales_amt Sort Key: 
range_list.month_code, range_list.dept_code, range_list.sales_amt -> Partition Iterator Output: month_code, dept_code, user_no, sales_amt Iterations: 2, Sub Iterations: 4 - -> Partitioned Seq Scan on segment_subpartition_gpi.range_list + -> Partitioned Index Scan using idx_month_code_local on segment_subpartition_gpi.range_list Output: month_code, dept_code, user_no, sales_amt Filter: ((range_list.user_no)::text = '1'::text) Selected Partitions: 1..2 @@ -752,15 +752,15 @@ select * from range_list where month_code = '201902' order by 1, 2, 3, 4; (3 rows) explain(costs off, verbose on) select * from range_list where dept_code = '1' order by 1, 2, 3, 4; - QUERY PLAN ------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------------------------------ Sort Output: month_code, dept_code, user_no, sales_amt Sort Key: range_list.month_code, range_list.user_no, range_list.sales_amt -> Partition Iterator Output: month_code, dept_code, user_no, sales_amt Iterations: 2, Sub Iterations: 2 - -> Partitioned Seq Scan on segment_subpartition_gpi.range_list + -> Partitioned Index Scan using idx_month_code_local on segment_subpartition_gpi.range_list Output: month_code, dept_code, user_no, sales_amt Filter: ((range_list.dept_code)::text = '1'::text) Selected Partitions: 1..2 @@ -776,15 +776,15 @@ select * from range_list where dept_code = '1' order by 1, 2, 3, 4; (3 rows) explain(costs off, verbose on) select * from range_list where user_no = '1' order by 1, 2, 3, 4; - QUERY PLAN -------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------ Sort Output: month_code, dept_code, user_no, sales_amt Sort Key: range_list.month_code, range_list.dept_code, range_list.sales_amt -> Partition Iterator Output: month_code, dept_code, user_no, sales_amt Iterations: 2, Sub Iterations: 6 - -> Partitioned Seq Scan on segment_subpartition_gpi.range_list + -> Partitioned Index Scan using idx_month_code_local on segment_subpartition_gpi.range_list Output: month_code, dept_code, user_no, sales_amt Filter: ((range_list.user_no)::text = '1'::text) Selected Partitions: 1..2 @@ -1027,15 +1027,15 @@ select * from range_range where month_code = '201902' order by 1, 2, 3, 4; (3 rows) explain(costs off, verbose on) select * from range_range where dept_code = '1' order by 1, 2, 3, 4; - QUERY PLAN --------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------- Sort Output: month_code, dept_code, user_no, sales_amt Sort Key: range_range.month_code, range_range.user_no, range_range.sales_amt -> Partition Iterator Output: month_code, dept_code, user_no, sales_amt Iterations: 2, Sub Iterations: 2 - -> Partitioned Seq Scan on segment_subpartition_gpi.range_range + -> Partitioned Index Scan using idx_month_code_local on segment_subpartition_gpi.range_range Output: month_code, dept_code, user_no, sales_amt Filter: ((range_range.dept_code)::text = '1'::text) Selected Partitions: 1..2 @@ -1050,15 +1050,15 @@ select * from range_range where dept_code = '1' order by 1, 2, 3, 4; (2 rows) explain(costs off, verbose on) select * from range_range where user_no = '1' order by 1, 2, 3, 4; - QUERY PLAN 
----------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------- Sort Output: month_code, dept_code, user_no, sales_amt Sort Key: range_range.month_code, range_range.dept_code, range_range.sales_amt -> Partition Iterator Output: month_code, dept_code, user_no, sales_amt Iterations: 2, Sub Iterations: 6 - -> Partitioned Seq Scan on segment_subpartition_gpi.range_range + -> Partitioned Index Scan using idx_month_code_local on segment_subpartition_gpi.range_range Output: month_code, dept_code, user_no, sales_amt Filter: ((range_range.user_no)::text = '1'::text) Selected Partitions: 1..2 diff --git a/src/test/regress/expected/seqscan_fusion.out b/src/test/regress/expected/seqscan_fusion.out index 16592a0e44f37464ce4cd970ed059d1a7aaf1b4e..eef3a6e54198156808b207eef85804653cc22428 100644 --- a/src/test/regress/expected/seqscan_fusion.out +++ b/src/test/regress/expected/seqscan_fusion.out @@ -2500,25 +2500,30 @@ explain verbose select b from t where b in (select b from t1 group by b) group b explain verbose select a,b,c,d from t2 where exists (select t.b, sum(cc) from (select b, sum(c) as cc from t1 group by b) s1, t where s1.b=t.b group by t.b order by 1,2) order by 1,2,3,4; - QUERY PLAN ---------------------------------------------------------------------------------------------- - Sort (cost=511.39..515.83 rows=1776 width=16) + QUERY PLAN +---------------------------------------------------------------------------------------------- + Sort (cost=374.55..378.99 rows=1776 width=16) Output: t2.a, t2.b, t2.c, t2.d Sort Key: t2.a, t2.b, t2.c, t2.d InitPlan 1 (returns $0) - -> Sort (cost=387.78..388.28 rows=200 width=40) + -> Incremental Sort (cost=249.07..621.96 rows=200 width=40) Output: t.b, (sum((t1.c)::bigint)) Sort Key: t.b, (sum((t1.c)::bigint)) - -> HashAggregate (cost=378.13..380.13 rows=200 width=40) + Presorted Key: t.b + -> GroupAggregate (cost=247.23..612.96 rows=200 width=40) Output: t.b, sum((t1.c)::bigint) Group By Key: t.b - -> Hash Join (cost=49.96..259.85 rows=15771 distinct=[200, 200] width=8) + -> Merge Join (cost=247.23..492.67 rows=15771 width=8) Output: t1.c, t.b - Hash Cond: (t1.b = t.b) - -> Seq Scan on lazyagg.t1 (cost=0.00..27.76 rows=1776 width=8) - Output: t1.a, t1.b, t1.c, t1.d - -> Hash (cost=27.76..27.76 rows=1776 width=4) + Merge Cond: (t1.b = t.b) + -> Sort (cost=123.61..128.05 rows=1776 width=8) + Output: t1.c, t1.b + Sort Key: t1.b + -> Seq Scan on lazyagg.t1 (cost=0.00..27.76 rows=1776 width=8) + Output: t1.c, t1.b + -> Sort (cost=123.61..128.05 rows=1776 width=4) Output: t.b + Sort Key: t.b -> Seq Scan on lazyagg.t (cost=0.00..27.76 rows=1776 width=4) Output: t.b -> Result (cost=0.00..27.76 rows=1776 width=16) @@ -2526,7 +2531,7 @@ select a,b,c,d from t2 where exists One-Time Filter: $0 -> Seq Scan on lazyagg.t2 (cost=0.00..27.76 rows=1776 width=16) Output: t2.a, t2.b, t2.c, t2.d -(24 rows) +(29 rows) ------------------------------------------------------------------------------------ -- multi-layer lazy agg diff --git a/src/test/regress/expected/sortgroup_agg.out b/src/test/regress/expected/sortgroup_agg.out index 443eb1a1f30b436bd3a987925da58546a627b96a..e4ede98ffab3c404ad26445d1a81757426bff436 100644 --- a/src/test/regress/expected/sortgroup_agg.out +++ b/src/test/regress/expected/sortgroup_agg.out @@ -20,14 +20,15 @@ explain (costs off) select sum(id), v1,v2 from tbl_10k group by v1,v2 order by v QUERY PLAN 
---------------------------------------------- Limit - -> Sort + -> Incremental Sort Sort Key: v1, (sum(id)) + Presorted Key: v1 -> GroupAggregate Group By Key: v1, v2 -> Group Sort Sorted Group Key: v1, v2 -> Seq Scan on tbl_10k -(8 rows) +(9 rows) create table agg_1 as select sum(id), v1,v2 from tbl_10k group by v1,v2 order by v1,sum(id) limit 10 offset 11; diff --git a/src/test/regress/expected/sqlbypass_partition.out b/src/test/regress/expected/sqlbypass_partition.out index 30e31182258e323c0eadc2378263c05d9024c2fb..21f60c136ffc9166efbe5ea64450880350435b83 100755 --- a/src/test/regress/expected/sqlbypass_partition.out +++ b/src/test/regress/expected/sqlbypass_partition.out @@ -2450,15 +2450,16 @@ explain select * from test_bypass_sql_partition where col1 > 0 order by col1,col QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------------------------- [No Bypass]reason: Bypass not executed because query's scan operator is not index. - Limit (cost=143.46..143.49 rows=10 width=40) - -> Sort (cost=143.46..144.44 rows=389 width=40) + Limit (cost=2.09..5.79 rows=10 width=40) + -> Incremental Sort (cost=2.09..146.20 rows=389 width=40) Sort Key: col1, col2 DESC + Presorted Key: col1 -> Partition Iterator (cost=0.00..135.06 rows=389 width=40) Iterations: 8 -> Partitioned Index Scan using itest_bypass_sql_partition on test_bypass_sql_partition (cost=0.00..135.06 rows=389 width=40) Index Cond: (col1 > 0) Selected Partitions: 1..8 -(9 rows) +(10 rows) --bypass explain select col1,col2 from test_bypass_sql_partition where col1>0 and col1<10 order by col1,col2 limit 10 ; diff --git a/src/test/regress/expected/xc_groupby.out b/src/test/regress/expected/xc_groupby.out index 80a3804d1184056691a1a1bd8b6c1704fbb9c66e..0e7f040c54781ba4d11d240f9849b92498133f19 100644 --- a/src/test/regress/expected/xc_groupby.out +++ b/src/test/regress/expected/xc_groupby.out @@ -178,19 +178,24 @@ explain (verbose true, costs false) select xc_groupby_tab1.val + xc_groupby_tab2 Group Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 Group By Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Sort + -> Incremental Sort Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Hash Join + Presorted Key: xc_groupby_tab1.val + -> Merge Join Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 - Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - -> Hash + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(15 rows) +(20 rows) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; ?column? 
@@ -870,19 +875,24 @@ explain (verbose true, costs false) select xc_groupby_tab1.val + xc_groupby_tab2 Group Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 Group By Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Sort + -> Incremental Sort Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Hash Join + Presorted Key: xc_groupby_tab1.val + -> Merge Join Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 - Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - -> Hash + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(15 rows) +(20 rows) select xc_groupby_tab1.val + xc_groupby_tab2.val2 sum from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by sum; sum @@ -1360,19 +1370,24 @@ explain (verbose true, costs false) select xc_groupby_tab1.val + xc_groupby_tab2 Group Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 Group By Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Sort + -> Incremental Sort Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Hash Join + Presorted Key: xc_groupby_tab1.val + -> Merge Join Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 - Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - -> Hash + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(15 rows) +(20 rows) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; ?column? 
@@ -1852,19 +1867,24 @@ explain (verbose true, costs false) select xc_groupby_tab1.val + xc_groupby_tab2 Group Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 Group By Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Sort + -> Incremental Sort Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Hash Join + Presorted Key: xc_groupby_tab1.val + -> Merge Join Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 - Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - -> Hash + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(15 rows) +(20 rows) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by xc_groupby_tab1.val + xc_groupby_tab2.val2; ?column? diff --git a/src/test/regress/input/incremental_sort.source b/src/test/regress/input/incremental_sort.source new file mode 100644 index 0000000000000000000000000000000000000000..09c06a19ad17c8525ca123b9a734a1af781fefe6 --- /dev/null +++ b/src/test/regress/input/incremental_sort.source @@ -0,0 +1,191 @@ +create schema incremental_sort; +set current_schema = 'incremental_sort'; + +create table tenk1 ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +); +copy tenk1 from '@abs_srcdir@/data/tenk.data'; +analyze tenk1; +-- When there is a LIMIT clause, incremental sort is beneficial because +-- it only has to sort some of the groups, and not the entire table. +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten +limit 1; + +-- When work_mem is not enough to sort the entire table, incremental sort +-- may be faster if individual groups still fit into work_mem. +set work_mem to '4MB'; +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten; +reset work_mem; +drop table tenk1; + +create table t(a integer, b integer); + +create or replace function explain_analyze_without_memory(query text) +returns table (out_line text) language plpgsql +as +$$ +declare + line text; + new_line text; +begin + for line in + execute 'explain (analyze, costs off, timing off) ' || query + loop + new_line := regexp_replace(line, '\d+kB', 'NNkB', 'g'); + out_line := regexp_replace(new_line, 'Total runtime: \d+\.\d+ ms', 'Total runtime: NN ms'); + return next; + end loop; +end; +$$; + +-- A single large group tested around each mode transition point. 
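+-- (Note: the LIMIT values below (31, 32, 33, 65 and 66) straddle the tuple
+-- counts at which the executor may switch from its bounded full-sort mode to
+-- its presorted-prefix mode; a later comment in this file places one such
+-- transition at 64 tuples, and the 31..33 limits are assumed to sit around a
+-- similar lower boundary.)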
+insert into t(a, b) select floor(i/100 + 1), i + 1 from generate_series(0, 999) n(i); +analyze t; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; +select * from (select * from t order by a) s order by a, b limit 31; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; +select * from (select * from t order by a) s order by a, b limit 32; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; +select * from (select * from t order by a) s order by a, b limit 33; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; +select * from (select * from t order by a) s order by a, b limit 65; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; +select * from (select * from t order by a) s order by a, b limit 66; +delete from t; + +-- An initial large group followed by a small group. +insert into t(a, b) select floor(i/50 + 1), i + 1 from generate_series(0, 999) n(i); +analyze t; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 55; +select * from (select * from t order by a) s order by a, b limit 55; +-- Test EXPLAIN ANALYZE with only a fullsort group. +select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55'); +delete from t; + +-- An initial small group followed by a large group. +insert into t(a, b) select (case when i < 5 then i else 9 end), i from generate_series(1, 1000) n(i); +analyze t; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 70; +select * from (select * from t order by a) s order by a, b limit 70; +-- Checks case where we hit a group boundary at the last tuple of a batch. +-- Because the full sort state is bounded, we scan 64 tuples (the mode +-- transition point) but only retain 5. Thus when we transition modes, all +-- tuples in the full sort state have different prefix keys. +explain (costs off) select * from (select * from t order by a) s order by a, b limit 5; +select * from (select * from t order by a) s order by a, b limit 5; + +-- Test EXPLAIN ANALYZE with both fullsort and presorted groups. +select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70'); +delete from t; + +-- Small groups of 10 tuples each tested around each mode transition point. +insert into t(a, b) select floor(i / 10), i from generate_series(1, 1000) n(i); +analyze t; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; +select * from (select * from t order by a) s order by a, b limit 31; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; +select * from (select * from t order by a) s order by a, b limit 32; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; +select * from (select * from t order by a) s order by a, b limit 33; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; +select * from (select * from t order by a) s order by a, b limit 65; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; +select * from (select * from t order by a) s order by a, b limit 66; +delete from t; + +-- Small groups of only 1 tuple each tested around each mode transition point. 
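+-- (Note: with only one tuple per prefix value no large presorted group ever
+-- forms, so these queries are expected to stay on the full-sort path while
+-- still probing the same transition points as above.)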
+insert into t(a, b) select i, i from generate_series(1, 1000) n(i); +analyze t; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; +select * from (select * from t order by a) s order by a, b limit 31; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; +select * from (select * from t order by a) s order by a, b limit 32; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; +select * from (select * from t order by a) s order by a, b limit 33; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; +select * from (select * from t order by a) s order by a, b limit 65; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; +select * from (select * from t order by a) s order by a, b limit 66; +delete from t; + +drop table t; + +-- Incremental sort in various places +create table t(a integer, b integer, c integer); +insert into t(a, b, c) select floor(i / 10), i, i from generate_series(1, 10000) n(i); +analyze t; + +set enable_incremental_sort = off; +explain (costs off) select a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; +select a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; +explain (costs off) select count(a) over (order by a, b) from (select * from t order by a) s order by a, b, c limit 10; +select count(a) over (order by a, b) from (select * from t order by a) s order by a, b, c limit 10; +explain (costs off) select distinct a, b from (select * from t order by a) s order by a, b limit 10; +select distinct a, b from (select * from t order by a) s order by a, b limit 10; +explain (costs off) select distinct a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; +select distinct a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; +explain (costs off) select a, b, c, avg(a) avg from (select * from t order by a) s group by a, b, c order by a, b, c, avg limit 10; +select a, b, c, avg(a) avg from (select * from t order by a) s group by a, b, c order by a, b, c, avg limit 10; + +set enable_incremental_sort = on; +explain (costs off) select a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; +select a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; +explain (costs off) select count(a) over (order by a, b) from (select * from t order by a) s order by a, b, c limit 10; +select count(a) over (order by a, b) from (select * from t order by a) s order by a, b, c limit 10; +explain (costs off) select distinct a, b from (select * from t order by a) s order by a, b limit 10; +select distinct a, b from (select * from t order by a) s order by a, b limit 10; +explain (costs off) select distinct a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; +select distinct a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; +explain (costs off) select a, b, c, avg(a) avg from (select * from t order by a) s group by a, b, c order by a, b, c, avg limit 10; +select a, b, c, avg(a) avg from (select * from t order by a) s group by a, b, c order by a, b, c, avg limit 10; + +drop table t; + +-- Incremental sort vs. 
parallel queries +set query_dop = 2; +create table t (a int, b int, c int); +insert into t select mod(i,10),mod(i,10),i from generate_series(1,200000) s(i); +analyze t; + +set enable_incremental_sort = off; +explain (costs off) select a, b ,c from (select a, b, c from t order by 1 limit 100) order by 1, 2 limit 10; + +set enable_incremental_sort = on; +explain (costs off) select a, b ,c from (select a, b, c from t order by 1 limit 100) order by 1, 2 limit 10; + +-- Incremental sort in explain perf +set explain_perf_mode = pretty; +set enable_hashagg = off; + +explain (verbose on, analyze on, costs off, timing off) select a, b ,c from (select a, b, c from t order by 1 limit 20000) group by 1, 2, 3 limit 1000; + +set work_mem = '64kB'; +explain (verbose on, analyze on, costs off, timing off) select a, b ,c from (select a, b, c from t order by 1 limit 20000) group by 1, 2, 3 limit 1000; + +-- clean up +drop table t; +drop function explain_analyze_without_memory; +reset enable_incremental_sort; +reset query_dop; +reset explain_perf_mode; +reset enable_hashagg; +reset work_mem; +reset current_schema; +drop schema incremental_sort; \ No newline at end of file diff --git a/src/test/regress/output/incremental_sort.source b/src/test/regress/output/incremental_sort.source new file mode 100644 index 0000000000000000000000000000000000000000..28f41437f3688d806c5d3478e0446054f3a48d32 --- /dev/null +++ b/src/test/regress/output/incremental_sort.source @@ -0,0 +1,1756 @@ +create schema incremental_sort; +set current_schema = 'incremental_sort'; +create table tenk1 ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +); +copy tenk1 from '@abs_srcdir@/data/tenk.data'; +analyze tenk1; +-- When there is a LIMIT clause, incremental sort is beneficial because +-- it only has to sort some of the groups, and not the entire table. +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten +limit 1; + QUERY PLAN +----------------------------------------- + Limit + -> Incremental Sort + Sort Key: tenk1.four, tenk1.ten + Presorted Key: tenk1.four + -> Sort + Sort Key: tenk1.four + -> Seq Scan on tenk1 +(7 rows) + +-- When work_mem is not enough to sort the entire table, incremental sort +-- may be faster if individual groups still fit into work_mem. +set work_mem to '4MB'; +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten; + QUERY PLAN +----------------------------------- + Incremental Sort + Sort Key: tenk1.four, tenk1.ten + Presorted Key: tenk1.four + -> Sort + Sort Key: tenk1.four + -> Seq Scan on tenk1 +(6 rows) + +reset work_mem; +drop table tenk1; +create table t(a integer, b integer); +create or replace function explain_analyze_without_memory(query text) +returns table (out_line text) language plpgsql +as +$$ +declare + line text; + new_line text; +begin + for line in + execute 'explain (analyze, costs off, timing off) ' || query + loop + new_line := regexp_replace(line, '\d+kB', 'NNkB', 'g'); + out_line := regexp_replace(new_line, 'Total runtime: \d+\.\d+ ms', 'Total runtime: NN ms'); + return next; + end loop; +end; +$$; +-- A single large group tested around each mode transition point. 
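+-- (Note: the LIMIT values below (31, 32, 33, 65 and 66) straddle the tuple
+-- counts at which the executor may switch from its bounded full-sort mode to
+-- its presorted-prefix mode; a later comment in this file places one such
+-- transition at 64 tuples, and the 31..33 limits are assumed to sit around a
+-- similar lower boundary.)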
+insert into t(a, b) select floor(i/100 + 1), i + 1 from generate_series(0, 999) n(i); +analyze t; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 31; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 +(31 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 32; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 +(32 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 33; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 +(33 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 65; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 + 1 | 34 + 1 | 35 + 1 | 36 + 1 | 37 + 1 | 38 + 1 | 39 + 1 | 40 + 1 | 41 + 1 | 42 + 1 | 43 + 1 | 44 + 1 | 45 + 1 | 46 + 1 | 47 + 1 | 48 + 1 | 49 + 1 | 50 + 1 | 51 + 1 | 52 + 1 | 53 + 1 | 54 + 1 | 55 + 1 | 56 + 1 | 57 + 1 | 58 + 1 | 59 + 1 | 60 + 1 | 61 + 1 | 62 + 1 | 63 + 1 | 64 + 1 | 65 +(65 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 66; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 
+ 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 + 1 | 34 + 1 | 35 + 1 | 36 + 1 | 37 + 1 | 38 + 1 | 39 + 1 | 40 + 1 | 41 + 1 | 42 + 1 | 43 + 1 | 44 + 1 | 45 + 1 | 46 + 1 | 47 + 1 | 48 + 1 | 49 + 1 | 50 + 1 | 51 + 1 | 52 + 1 | 53 + 1 | 54 + 1 | 55 + 1 | 56 + 1 | 57 + 1 | 58 + 1 | 59 + 1 | 60 + 1 | 61 + 1 | 62 + 1 | 63 + 1 | 64 + 1 | 65 + 1 | 66 +(66 rows) + +delete from t; +-- An initial large group followed by a small group. +insert into t(a, b) select floor(i/50 + 1), i + 1 from generate_series(0, 999) n(i); +analyze t; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 55; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 55; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 + 1 | 34 + 1 | 35 + 1 | 36 + 1 | 37 + 1 | 38 + 1 | 39 + 1 | 40 + 1 | 41 + 1 | 42 + 1 | 43 + 1 | 44 + 1 | 45 + 1 | 46 + 1 | 47 + 1 | 48 + 1 | 49 + 1 | 50 + 2 | 51 + 2 | 52 + 2 | 53 + 2 | 54 + 2 | 55 +(55 rows) + +-- Test EXPLAIN ANALYZE with only a fullsort group. +select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55'); + explain_analyze_without_memory +-------------------------------------------------------------------------------------------------------------- + Limit (actual rows=55 loops=1) + -> Incremental Sort (actual rows=55 loops=1) + Sort Key: t.a, t.b + Presorted Key: t.a + Full-sort Groups: 2 Sort Methods: top-N heapsort, quicksort Average Memory: NNkB Peak Memory: NNkB + -> Sort (actual rows=101 loops=1) + Sort Key: t.a + Sort Method: quicksort Memory: NNkB + -> Seq Scan on t (actual rows=1000 loops=1) + Total runtime: NN ms +(10 rows) + +delete from t; +-- An initial small group followed by a large group. +insert into t(a, b) select (case when i < 5 then i else 9 end), i from generate_series(1, 1000) n(i); +analyze t; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 70; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 70; + a | b +---+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 9 | 5 + 9 | 6 + 9 | 7 + 9 | 8 + 9 | 9 + 9 | 10 + 9 | 11 + 9 | 12 + 9 | 13 + 9 | 14 + 9 | 15 + 9 | 16 + 9 | 17 + 9 | 18 + 9 | 19 + 9 | 20 + 9 | 21 + 9 | 22 + 9 | 23 + 9 | 24 + 9 | 25 + 9 | 26 + 9 | 27 + 9 | 28 + 9 | 29 + 9 | 30 + 9 | 31 + 9 | 32 + 9 | 33 + 9 | 34 + 9 | 35 + 9 | 36 + 9 | 37 + 9 | 38 + 9 | 39 + 9 | 40 + 9 | 41 + 9 | 42 + 9 | 43 + 9 | 44 + 9 | 45 + 9 | 46 + 9 | 47 + 9 | 48 + 9 | 49 + 9 | 50 + 9 | 51 + 9 | 52 + 9 | 53 + 9 | 54 + 9 | 55 + 9 | 56 + 9 | 57 + 9 | 58 + 9 | 59 + 9 | 60 + 9 | 61 + 9 | 62 + 9 | 63 + 9 | 64 + 9 | 65 + 9 | 66 + 9 | 67 + 9 | 68 + 9 | 69 + 9 | 70 +(70 rows) + +-- Checks case where we hit a group boundary at the last tuple of a batch. +-- Because the full sort state is bounded, we scan 64 tuples (the mode +-- transition point) but only retain 5. 
Thus when we transition modes, all +-- tuples in the full sort state have different prefix keys. +explain (costs off) select * from (select * from t order by a) s order by a, b limit 5; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 5; + a | b +---+--- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 9 | 5 +(5 rows) + +-- Test EXPLAIN ANALYZE with both fullsort and presorted groups. +select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70'); + explain_analyze_without_memory +--------------------------------------------------------------------------------------------------------------- + Limit (actual rows=70 loops=1) + -> Incremental Sort (actual rows=70 loops=1) + Sort Key: t.a, t.b + Presorted Key: t.a + Full-sort Groups: 1 Sort Method: quicksort Average Memory: NNkB Peak Memory: NNkB + Pre-sorted Groups: 5 Sort Methods: top-N heapsort, quicksort Average Memory: NNkB Peak Memory: NNkB + -> Sort (actual rows=1000 loops=1) + Sort Key: t.a + Sort Method: quicksort Memory: NNkB + -> Seq Scan on t (actual rows=1000 loops=1) + Total runtime: NN ms +(11 rows) + +delete from t; +-- Small groups of 10 tuples each tested around each mode transition point. +insert into t(a, b) select floor(i / 10), i from generate_series(1, 1000) n(i); +analyze t; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 31; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 +(31 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 32; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 +(32 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 33; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 + 3 | 33 +(33 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; + QUERY PLAN 
+--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 65; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 + 3 | 33 + 3 | 34 + 3 | 35 + 3 | 36 + 3 | 37 + 3 | 38 + 3 | 39 + 4 | 40 + 4 | 41 + 4 | 42 + 4 | 43 + 4 | 44 + 4 | 45 + 4 | 46 + 4 | 47 + 4 | 48 + 4 | 49 + 5 | 50 + 5 | 51 + 5 | 52 + 5 | 53 + 5 | 54 + 5 | 55 + 5 | 56 + 5 | 57 + 5 | 58 + 5 | 59 + 6 | 60 + 6 | 61 + 6 | 62 + 6 | 63 + 6 | 64 + 6 | 65 +(65 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 66; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 + 3 | 33 + 3 | 34 + 3 | 35 + 3 | 36 + 3 | 37 + 3 | 38 + 3 | 39 + 4 | 40 + 4 | 41 + 4 | 42 + 4 | 43 + 4 | 44 + 4 | 45 + 4 | 46 + 4 | 47 + 4 | 48 + 4 | 49 + 5 | 50 + 5 | 51 + 5 | 52 + 5 | 53 + 5 | 54 + 5 | 55 + 5 | 56 + 5 | 57 + 5 | 58 + 5 | 59 + 6 | 60 + 6 | 61 + 6 | 62 + 6 | 63 + 6 | 64 + 6 | 65 + 6 | 66 +(66 rows) + +delete from t; +-- Small groups of only 1 tuple each tested around each mode transition point. 
+insert into t(a, b) select i, i from generate_series(1, 1000) n(i); +analyze t; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 31; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 +(31 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 32; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 +(32 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 33; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 + 33 | 33 +(33 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 65; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 + 33 | 33 + 34 | 34 + 35 | 35 + 36 | 36 + 37 | 37 + 38 | 38 + 39 | 39 + 40 | 40 + 41 | 41 + 42 | 42 + 43 | 43 + 44 | 44 + 45 | 45 + 46 | 46 + 47 | 47 + 48 | 48 + 49 | 49 + 50 | 50 + 51 | 51 + 52 | 52 + 53 | 53 + 54 | 54 + 55 | 55 + 56 | 56 + 57 | 57 + 58 | 58 + 59 | 59 + 60 | 60 + 61 | 61 + 62 | 62 + 63 | 63 + 64 | 64 + 65 | 65 +(65 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 66; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 
13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 + 33 | 33 + 34 | 34 + 35 | 35 + 36 | 36 + 37 | 37 + 38 | 38 + 39 | 39 + 40 | 40 + 41 | 41 + 42 | 42 + 43 | 43 + 44 | 44 + 45 | 45 + 46 | 46 + 47 | 47 + 48 | 48 + 49 | 49 + 50 | 50 + 51 | 51 + 52 | 52 + 53 | 53 + 54 | 54 + 55 | 55 + 56 | 56 + 57 | 57 + 58 | 58 + 59 | 59 + 60 | 60 + 61 | 61 + 62 | 62 + 63 | 63 + 64 | 64 + 65 | 65 + 66 | 66 +(66 rows) + +delete from t; +drop table t; +-- Incremental sort in various places +create table t(a integer, b integer, c integer); +insert into t(a, b, c) select floor(i / 10), i, i from generate_series(1, 10000) n(i); +analyze t; +set enable_incremental_sort = off; +explain (costs off) select a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; + QUERY PLAN +---------------------------------------- + Limit + -> Sort + Sort Key: t.a, t.b, (sum(t.c)) + -> HashAggregate + Group By Key: t.a, t.b + -> Sort + Sort Key: t.a + -> Seq Scan on t +(8 rows) + +select a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; + a | b | sc +---+----+---- + 0 | 1 | 1 + 0 | 2 | 2 + 0 | 3 | 3 + 0 | 4 | 4 + 0 | 5 | 5 + 0 | 6 | 6 + 0 | 7 | 7 + 0 | 8 | 8 + 0 | 9 | 9 + 1 | 10 | 10 +(10 rows) + +explain (costs off) select count(a) over (order by a, b) from (select * from t order by a) s order by a, b, c limit 10; + QUERY PLAN +--------------------------------------------- + Limit + -> Sort + Sort Key: t.a, t.b, t.c + -> WindowAgg + -> Sort + Sort Key: t.a, t.b + -> Sort + Sort Key: t.a + -> Seq Scan on t +(9 rows) + +select count(a) over (order by a, b) from (select * from t order by a) s order by a, b, c limit 10; + count +------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +explain (costs off) select distinct a, b from (select * from t order by a) s order by a, b limit 10; + QUERY PLAN +--------------------------------------------- + Limit + -> Sort + Sort Key: s.a, s.b + -> HashAggregate + Group By Key: s.a, s.b + -> Subquery Scan on s + -> Sort + Sort Key: t.a + -> Seq Scan on t +(9 rows) + +select distinct a, b from (select * from t order by a) s order by a, b limit 10; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 +(10 rows) + +explain (costs off) select distinct a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; + QUERY PLAN +------------------------------------------------ + Limit + -> Sort + Sort Key: t.a, t.b, (sum(t.c)) + -> HashAggregate + Group By Key: t.a, t.b, sum(t.c) + -> HashAggregate + Group By Key: t.a, t.b + -> Sort + Sort Key: t.a + -> Seq Scan on t +(10 rows) + +select distinct a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; + a | b | sc +---+----+---- + 0 | 1 | 1 + 0 | 2 | 2 + 0 | 3 | 3 + 0 | 4 | 4 + 0 | 5 | 5 + 0 | 6 | 6 + 0 | 7 | 7 + 0 | 8 | 8 + 0 | 9 | 9 + 1 | 10 | 10 +(10 rows) + +explain (costs off) select a, b, c, avg(a) avg from (select * from t order by a) s group by a, b, c order by a, b, c, avg limit 10; + QUERY PLAN +--------------------------------------------- + Limit + -> Sort + Sort Key: t.a, t.b, t.c, (avg(t.a)) + -> HashAggregate + Group By Key: t.a, t.b, t.c + -> Sort + Sort Key: t.a + -> Seq Scan on t +(8 rows) + +select a, b, c, avg(a) avg from (select * from t order by a) s group by a, b, c order by a, b, c, avg 
limit 10; + a | b | c | avg +---+----+----+------------------------ + 0 | 1 | 1 | 0.00000000000000000000 + 0 | 2 | 2 | 0.00000000000000000000 + 0 | 3 | 3 | 0.00000000000000000000 + 0 | 4 | 4 | 0.00000000000000000000 + 0 | 5 | 5 | 0.00000000000000000000 + 0 | 6 | 6 | 0.00000000000000000000 + 0 | 7 | 7 | 0.00000000000000000000 + 0 | 8 | 8 | 0.00000000000000000000 + 0 | 9 | 9 | 0.00000000000000000000 + 1 | 10 | 10 | 1.00000000000000000000 +(10 rows) + +set enable_incremental_sort = on; +explain (costs off) select a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; + QUERY PLAN +--------------------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b, (sum(t.c)) + Presorted Key: t.a, t.b + -> GroupAggregate + Group By Key: t.a, t.b + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(12 rows) + +select a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; + a | b | sc +---+----+---- + 0 | 1 | 1 + 0 | 2 | 2 + 0 | 3 | 3 + 0 | 4 | 4 + 0 | 5 | 5 + 0 | 6 | 6 + 0 | 7 | 7 + 0 | 8 | 8 + 0 | 9 | 9 + 1 | 10 | 10 +(10 rows) + +explain (costs off) select count(a) over (order by a, b) from (select * from t order by a) s order by a, b, c limit 10; + QUERY PLAN +--------------------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b, t.c + Presorted Key: t.a, t.b + -> WindowAgg + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(11 rows) + +select count(a) over (order by a, b) from (select * from t order by a) s order by a, b, c limit 10; + count +------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +explain (costs off) select distinct a, b from (select * from t order by a) s order by a, b limit 10; + QUERY PLAN +--------------------------------------------- + Limit + -> Unique + -> Incremental Sort + Sort Key: s.a, s.b + Presorted Key: s.a + -> Subquery Scan on s + -> Sort + Sort Key: t.a + -> Seq Scan on t +(9 rows) + +select distinct a, b from (select * from t order by a) s order by a, b limit 10; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 +(10 rows) + +explain (costs off) select distinct a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; + QUERY PLAN +--------------------------------------------------- + Limit + -> Unique + -> Incremental Sort + Sort Key: t.a, t.b, (sum(t.c)) + Presorted Key: t.a, t.b + -> GroupAggregate + Group By Key: t.a, t.b + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(13 rows) + +select distinct a, b, sum(c) sc from (select * from t order by a) s group by a, b order by a, b, sc limit 10; + a | b | sc +---+----+---- + 0 | 1 | 1 + 0 | 2 | 2 + 0 | 3 | 3 + 0 | 4 | 4 + 0 | 5 | 5 + 0 | 6 | 6 + 0 | 7 | 7 + 0 | 8 | 8 + 0 | 9 | 9 + 1 | 10 | 10 +(10 rows) + +explain (costs off) select a, b, c, avg(a) avg from (select * from t order by a) s group by a, b, c order by a, b, c, avg limit 10; + QUERY PLAN +--------------------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b, t.c, (avg(t.a)) + Presorted Key: t.a, t.b, t.c + -> GroupAggregate + Group By Key: t.a, t.b, t.c + -> Incremental Sort + Sort Key: t.a, t.b, t.c + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(12 rows) + +select a, b, c, avg(a) avg from (select * from t order by a) 
s group by a, b, c order by a, b, c, avg limit 10; + a | b | c | avg +---+----+----+------------------------ + 0 | 1 | 1 | 0.00000000000000000000 + 0 | 2 | 2 | 0.00000000000000000000 + 0 | 3 | 3 | 0.00000000000000000000 + 0 | 4 | 4 | 0.00000000000000000000 + 0 | 5 | 5 | 0.00000000000000000000 + 0 | 6 | 6 | 0.00000000000000000000 + 0 | 7 | 7 | 0.00000000000000000000 + 0 | 8 | 8 | 0.00000000000000000000 + 0 | 9 | 9 | 0.00000000000000000000 + 1 | 10 | 10 | 1.00000000000000000000 +(10 rows) + +drop table t; +-- Incremental sort vs. parallel queries +set query_dop = 2; +create table t (a int, b int, c int); +insert into t select mod(i,10),mod(i,10),i from generate_series(1,200000) s(i); +analyze t; +set enable_incremental_sort = off; +explain (costs off) select a, b ,c from (select a, b, c from t order by 1 limit 100) order by 1, 2 limit 10; + QUERY PLAN +---------------------------------------------------------------- + Limit + -> Sort + Sort Key: t.a, t.b + -> Limit + -> Sort + Sort Key: t.a + -> Streaming(type: LOCAL GATHER dop: 1/2) + -> Limit + -> Sort + Sort Key: t.a + -> Seq Scan on t +(11 rows) + +set enable_incremental_sort = on; +explain (costs off) select a, b ,c from (select a, b, c from t order by 1 limit 100) order by 1, 2 limit 10; + QUERY PLAN +---------------------------------------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Limit + -> Sort + Sort Key: t.a + -> Streaming(type: LOCAL GATHER dop: 1/2) + -> Limit + -> Sort + Sort Key: t.a + -> Seq Scan on t +(12 rows) + +-- Incremental sort in explain perf +set explain_perf_mode = pretty; +set enable_hashagg = off; +explain (verbose on, analyze on, costs off, timing off) select a, b ,c from (select a, b, c from t order by 1 limit 20000) group by 1, 2, 3 limit 1000; +--?.* +--?.* +--? 1 | -> Limit |.* +--? 2 | -> Group |.* +--? 3 | -> Incremental Sort |.* +--? 4 | -> Limit |.* +--? 5 | -> Sort |.* +--? 6 | -> Streaming(type: LOCAL GATHER dop: 1/2) |.* +--? 7 | -> Limit |.* +--? 8 | -> Sort |.* +--? 9 | -> Seq Scan on incremental_sort.t |.* +(9 rows) + +--?.* +--?.* + 3 --Incremental Sort +--? Full-sort Groups: 1 Sort Method: quicksort Memory: .* +--? Pre-sorted Groups: 1 Sort Method: quicksort Memory: .* + 5 --Sort +--? Sort Method: quicksort Memory: .* + 8 --Sort +--? Sort Method: top-N heapsort Memory: .* +(7 rows) + + Targetlist Information (identified by plan id) +--?.* + 1 --Limit + Output: t.a, t.b, t.c + 2 --Group + Output: t.a, t.b, t.c + Group By Key: t.a, t.b, t.c + 3 --Incremental Sort + Output: t.a, t.b, t.c + Sort Key: t.a, t.b, t.c + Presorted Key: t.a + 4 --Limit + Output: t.a, t.b, t.c + 5 --Sort + Output: t.a, t.b, t.c + Sort Key: t.a + 6 --Streaming(type: LOCAL GATHER dop: 1/2) + Output: t.a, t.b, t.c + 7 --Limit + Output: t.a, t.b, t.c + 8 --Sort + Output: t.a, t.b, t.c + Sort Key: t.a + 9 --Seq Scan on incremental_sort.t + Output: t.a, t.b, t.c +(23 rows) + + User Define Profiling +-------------------------------------------------------------- + Segment Id: 1 Track name: Datanode build connection + Plan Node id: 1 Track name: Datanode start up stream thread +(2 rows) + +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +(10 rows) + +set work_mem = '64kB'; +explain (verbose on, analyze on, costs off, timing off) select a, b ,c from (select a, b, c from t order by 1 limit 20000) group by 1, 2, 3 limit 1000; +--?.* +--?.* +--? 1 | -> Limit | 1000 |.* +--? 2 | -> Group | 1000 |.* +--? 3 | -> Incremental Sort | 1000 |.* +--? 
4 | -> Limit | 6001 |.* +--? 5 | -> Sort | 6001 |.* +--? 6 | -> Streaming(type: LOCAL GATHER dop: 1/2) | 40000 |.* +--? 7 | -> Limit | 40000 |.* +--? 8 | -> Sort | 40000 |.* +--? 9 | -> Seq Scan on incremental_sort.t | 60000 |.* +(9 rows) + +--?.* +--?.* + 3 --Incremental Sort +--? Full-sort Groups: 1 Sort Method: quicksort Memory: .* +--? Pre-sorted Groups: 1 Sort Method: external merge Disk: .* +--? Sort Method: external merge Memory: .* + 5 --Sort +--? Sort Method: external merge Disk: .* +--? Sort Method: external merge Memory: .* + 8 --Sort +--? Sort Method: external merge Disk: .* +(9 rows) + + Targetlist Information (identified by plan id) +------------------------------------------------ + 1 --Limit + Output: t.a, t.b, t.c + 2 --Group + Output: t.a, t.b, t.c + Group By Key: t.a, t.b, t.c + 3 --Incremental Sort + Output: t.a, t.b, t.c + Sort Key: t.a, t.b, t.c + Presorted Key: t.a + 4 --Limit + Output: t.a, t.b, t.c + 5 --Sort + Output: t.a, t.b, t.c + Sort Key: t.a + 6 --Streaming(type: LOCAL GATHER dop: 1/2) + Output: t.a, t.b, t.c + 7 --Limit + Output: t.a, t.b, t.c + 8 --Sort + Output: t.a, t.b, t.c + Sort Key: t.a + 9 --Seq Scan on incremental_sort.t + Output: t.a, t.b, t.c +(23 rows) + + User Define Profiling +-------------------------------------------------------------- + Segment Id: 1 Track name: Datanode build connection + Plan Node id: 1 Track name: Datanode start up stream thread +(2 rows) + +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +(10 rows) + +-- clean up +drop table t; +drop function explain_analyze_without_memory; +reset enable_incremental_sort; +reset query_dop; +reset explain_perf_mode; +reset enable_hashagg; +reset work_mem; +reset current_schema; +drop schema incremental_sort; \ No newline at end of file diff --git a/src/test/regress/output/partiton_pathkey_row_plan.source b/src/test/regress/output/partiton_pathkey_row_plan.source index 91787d08f886028f6594fa5bb68e9a401496621e..e73072dab0c0e945026ac911b8516f2938731711 100644 --- a/src/test/regress/output/partiton_pathkey_row_plan.source +++ b/src/test/regress/output/partiton_pathkey_row_plan.source @@ -186,40 +186,43 @@ explain(costs off) select * from part where c = 10 order by a desc, b desc, c de (6 rows) explain(costs off) select * from part order by a asc, b desc limit 10; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------- Limit - -> Sort + -> Incremental Sort Sort Key: a, b DESC + Presorted Key: a -> Partition Iterator Iterations: 4 - -> Partitioned Seq Scan on part + -> Partitioned Index Scan using idx_a_b on part Selected Partitions: 1..4 -(7 rows) +(8 rows) explain(costs off) select * from part order by a desc, b asc limit 10; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------- Limit - -> Sort + -> Incremental Sort Sort Key: a DESC, b - -> Partition Iterator + Presorted Key: a + -> Partition Iterator Scan Backward Iterations: 4 - -> Partitioned Seq Scan on part + -> Partitioned Index Scan Backward using idx_a_b on part Selected Partitions: 1..4 -(7 rows) +(8 rows) explain(costs off) select * from part order by a desc, b desc nulls last limit 10; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------- Limit - -> Sort + -> Incremental Sort Sort Key: a DESC, b DESC NULLS LAST - -> 
Partition Iterator + Presorted Key: a + -> Partition Iterator Scan Backward Iterations: 4 - -> Partitioned Seq Scan on part + -> Partitioned Index Scan Backward using idx_a_b on part Selected Partitions: 1..4 -(7 rows) +(8 rows) explain(costs off) select * from part order by a desc nulls last, b desc limit 10; QUERY PLAN @@ -270,16 +273,17 @@ explain(costs off) select * from part order by b limit 10; (7 rows) explain(costs off) select * from part order by a, b, c limit 10; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------- Limit - -> Sort + -> Incremental Sort Sort Key: a, b, c + Presorted Key: a, b -> Partition Iterator Iterations: 4 - -> Partitioned Seq Scan on part + -> Partitioned Index Scan using idx_a_b on part Selected Partitions: 1..4 -(7 rows) +(8 rows) drop index idx_a_b; create index idx_a_b_c on part(a,b,c) local; diff --git a/src/test/regress/output/recovery_2pc_tools.source b/src/test/regress/output/recovery_2pc_tools.source index 2d16070fddc2d7fbbfd91c51799293d6d9ad1545..b3b9e08d338e11fa367953820f3aaa502a5b3aec 100644 --- a/src/test/regress/output/recovery_2pc_tools.source +++ b/src/test/regress/output/recovery_2pc_tools.source @@ -288,6 +288,7 @@ select name,vartype,unit,min_val,max_val from pg_settings where name <> 'qunit_c enable_ignore_case_in_dquotes | bool | | | enable_incremental_catchup | bool | | | enable_incremental_checkpoint | bool | | | + enable_incremental_sort | bool | | | enable_index_nestloop | bool | | | enable_indexonlyscan | bool | | | enable_indexscan | bool | | | diff --git a/src/test/regress/parallel_schedule0 b/src/test/regress/parallel_schedule0 index 02d900b673aaffd6165daf04a6bbb40347e92729..45e7f62c87ce45cdd145ebd76e47c6585b90c174 100644 --- a/src/test/regress/parallel_schedule0 +++ b/src/test/regress/parallel_schedule0 @@ -1113,6 +1113,9 @@ test: row_count_function # test for inherit table test: inherits01 +# incremental sort +test: incremental_sort + # show_warnings test: show_warnings prevent_table_in_sys_schema create_tbl_init_td_check diff --git a/src/test/regress/parallel_schedule0C b/src/test/regress/parallel_schedule0C index 7f3b6629190cde25ccf9da04b7e7b37f0469ea2c..cc5b816a9818e805155921d085a7c129f62f0d89 100644 --- a/src/test/regress/parallel_schedule0C +++ b/src/test/regress/parallel_schedule0C @@ -72,6 +72,9 @@ test: leaky_function_operator # ---------- #test: gs_guc +# incremental sort +test: incremental_sort + test: cstore_unique_index test: cast_privileges_test diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 5afbe2fe3a8b4a2fce28ec6efb25bcc4d1573413..560e863de1ff7e27d8027bfda4f5b53c0c1ec9af 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -77,6 +77,7 @@ test: select_distinct_on test: select_implicit test: select_having test: subselect +test: incremental_sort test: union test: case test: join diff --git a/src/test/regress/sql/bypass_simplequery_support.sql b/src/test/regress/sql/bypass_simplequery_support.sql index 5117b4242a702b234c7af004ed00968f43764b8d..5bb7077408050883d7defa6ca9dc06f3c97d0a21 100755 --- a/src/test/regress/sql/bypass_simplequery_support.sql +++ b/src/test/regress/sql/bypass_simplequery_support.sql @@ -8,7 +8,7 @@ set opfusion_debug_mode = 'log'; set log_min_messages=debug; set logging_module = 'on(OPFUSION)'; set sql_beta_feature = 'index_cost_with_leaf_pages_only'; - +set enable_incremental_sort = off; -- create table drop table if exists 
test_bypass_sq1; create table test_bypass_sq1(col1 int, col2 int, col3 text); diff --git a/src/test/regress/sql/col_count_distinct_3.sql b/src/test/regress/sql/col_count_distinct_3.sql index 48a56de35d12e4455d75650618c4a92e363e7b61..b74d49ab63b0fab88424ace22c462b1c897aaf83 100644 --- a/src/test/regress/sql/col_count_distinct_3.sql +++ b/src/test/regress/sql/col_count_distinct_3.sql @@ -14,6 +14,7 @@ INSERT INTO t_distinct select generate_series(1, 1000)%501, generate_series(1, 1 analyze t_distinct; -- Case 3 groupagg optimization +set enable_incremental_sort = off; set enable_hashagg=off; explain (costs off) select avg(distinct(a)) from t_distinct; select avg(distinct(a)) from t_distinct; @@ -27,6 +28,7 @@ explain (costs off) select b, c from t_distinct group by b, c order by b, count( select b, c from t_distinct group by b, c order by b, count(distinct(c))-c limit 10; explain (costs off) select count(distinct(c)), d from t_distinct group by d having avg(distinct(c)) <> 0 order by 2; select count(distinct(c)), d from t_distinct group by d having avg(distinct(c)) <> 0 order by 2; +reset enable_incremental_sort; reset enable_hashagg; -- Case 4 two_level_hashagg diff --git a/src/test/regress/sql/col_count_distinct_4.sql b/src/test/regress/sql/col_count_distinct_4.sql index 66bec65b50e3758ac8dc2f15753652a7b393e118..3b257411ff3e3df05711fd7366f2e04e5376c2a9 100644 --- a/src/test/regress/sql/col_count_distinct_4.sql +++ b/src/test/regress/sql/col_count_distinct_4.sql @@ -4,6 +4,7 @@ create schema col_distribute_count_distinct_4; set current_schema = col_distribute_count_distinct_4; +set enable_incremental_sort = false; -- Create Table and Insert Data create table src(c1 int); @@ -75,5 +76,6 @@ select count(distinct(c)) from (select a, ''::text as c from (select t1.a from t -- Clean Table drop table t_distinct; +reset enable_incremental_sort; reset current_schema; drop schema col_distribute_count_distinct_4 cascade; diff --git a/src/test/regress/sql/col_partition_iterator_elimination.sql b/src/test/regress/sql/col_partition_iterator_elimination.sql index ab58cf4414267a181935c105a6a97e51ba19bdd5..5c54e7ef63e271d42d3df1475b1d8a0af38f9c02 100644 --- a/src/test/regress/sql/col_partition_iterator_elimination.sql +++ b/src/test/regress/sql/col_partition_iterator_elimination.sql @@ -47,6 +47,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; @@ -70,6 +71,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = on; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; @@ -93,6 +95,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = on; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; @@ -116,6 +119,7 @@ set enable_indexscan = on; set enable_bitmapscan = off; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; @@ -139,6 +143,7 @@ set enable_indexscan = on; set enable_bitmapscan = off; set enable_nestloop = off; set 
enable_hashjoin = on; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; @@ -162,6 +167,7 @@ set enable_indexscan = on; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = on; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; @@ -185,6 +191,7 @@ set enable_indexscan = off; set enable_bitmapscan = on; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; @@ -208,6 +215,7 @@ set enable_indexscan = off; set enable_bitmapscan = on; set enable_nestloop = off; set enable_hashjoin = on; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; @@ -231,6 +239,7 @@ set enable_indexscan = off; set enable_bitmapscan = on; set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = on; explain(costs off, verbose on) select * from test_range_pt where a = 30 order by 1,2,3,4; @@ -248,6 +257,7 @@ select count(t1.a) from test_range_pt1 t1 join test_range_pt t2 on t1.b = t2.b w select * from test_range_pt1 t1 join test_range_pt t2 on t1.b = t2.b where t1.a = 5 and t2.a = 5 order by 1,2,3,4; select count(t1.b) from test_range_pt1 t1 join test_range_pt t2 on t1.b = t2.b where t1.a = 5 and t2.a = 5; +RESET enable_incremental_sort; DROP SCHEMA col_partition_iterator_elimination CASCADE; diff --git a/src/test/regress/sql/gpi_index_only.sql b/src/test/regress/sql/gpi_index_only.sql index 83e5be13294353cef4a66f485399e73e991547e6..fc0e556d9e43e65b7fdf5be48c88c7a9e4b5fe45 100644 --- a/src/test/regress/sql/gpi_index_only.sql +++ b/src/test/regress/sql/gpi_index_only.sql @@ -142,6 +142,7 @@ create index gpi_J2_TBL_nonp_t_index on gpi_J2_TBL(t) global; set enable_bitmapscan=off; set enable_seqscan=off; +set enable_material=off; vacuum analyze gpi_J1_TBL; vacuum analyze gpi_J2_TBL; @@ -234,6 +235,7 @@ SELECT distinct(j),k WHERE k>25 and k<35 ORDER BY j,k; +reset enable_material; drop table if exists gpi_J1_TBL; drop table if exists gpi_J2_TBL; set client_min_messages=notice; diff --git a/src/test/regress/sql/llvm_vecsort2.sql b/src/test/regress/sql/llvm_vecsort2.sql index b8d510fe202ea2c24d7c205c3cd71e1c4463957a..4e45b91ff34e06b2828d954b992014cac274d2bf 100644 --- a/src/test/regress/sql/llvm_vecsort2.sql +++ b/src/test/regress/sql/llvm_vecsort2.sql @@ -96,7 +96,9 @@ select sum(col_num1), col_vchar, col_text from llvm_vecsort_table_04 group by ro ---- set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; select A.col_text, B.col_text, sum(A.col_num1) from llvm_vecsort_table_04 A inner join llvm_vecsort_table_04 B on A.col_text = B.col_text group by rollup(1, 2), 1, 2 order by 1, 2, 3; +reset enable_incremental_sort; reset enable_nestloop; reset enable_hashjoin; diff --git a/src/test/regress/sql/row_partition_iterator_elimination.sql b/src/test/regress/sql/row_partition_iterator_elimination.sql index c2db049fc72078bda9388c299a2f67d47cfb8283..302205a4bb0c29c8ae71eac648e7669df08e5e7b 100644 --- a/src/test/regress/sql/row_partition_iterator_elimination.sql +++ b/src/test/regress/sql/row_partition_iterator_elimination.sql @@ -66,6 +66,7 @@ set enable_indexscan = 
off; set enable_bitmapscan = off; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; @@ -95,6 +96,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = on; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; @@ -124,6 +126,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = on; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; @@ -153,6 +156,7 @@ set enable_indexscan = on; set enable_bitmapscan = off; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; @@ -182,6 +186,7 @@ set enable_indexscan = on; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = on; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; @@ -211,6 +216,7 @@ set enable_indexscan = on; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = on; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; @@ -240,6 +246,7 @@ set enable_indexscan = off; set enable_bitmapscan = on; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; @@ -269,6 +276,7 @@ set enable_indexscan = off; set enable_bitmapscan = on; set enable_nestloop = off; set enable_hashjoin = on; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; @@ -298,6 +306,7 @@ set enable_indexscan = off; set enable_bitmapscan = on; set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = on; explain(costs off, verbose on) select * from test_hash_ht where a = 30 order by 1,2,3,4; @@ -327,6 +336,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; set try_vector_engine_strategy = force; @@ -357,6 +367,7 @@ set enable_indexscan = on; set enable_bitmapscan = off; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; set try_vector_engine_strategy = force; @@ -387,6 +398,7 @@ set enable_indexscan = off; set enable_bitmapscan = on; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; set try_vector_engine_strategy = force; @@ -424,6 +436,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = on; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 and ctid = (select ctid from test_hash_ht where a = 30 limit 1); @@ -437,6 +450,7 @@ set 
enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = on; +set enable_incremental_sort = off; set enable_mergejoin = off; explain(costs off, verbose on) select * from test_hash_ht where a = 30 and ctid = (select ctid from test_hash_ht where a = 30 limit 1); @@ -451,6 +465,7 @@ set enable_indexscan = off; set enable_bitmapscan = off; set enable_nestloop = off; set enable_hashjoin = off; +set enable_incremental_sort = off; set enable_mergejoin = on; explain(costs off, verbose on) select * from test_hash_ht where a = 30 and ctid = (select ctid from test_hash_ht where a = 30 limit 1); @@ -458,6 +473,7 @@ explain(costs off, verbose on) select * from (select * from test_hash_ht where a select * from test_hash_ht where a = 30 and ctid = (select ctid from test_hash_ht where a = 30 limit 1); select * from (select * from test_hash_ht where a = 5 and ctid = (select ctid from test_hash_ht where a = 5 limit 1)) t1 join (select * from test_range_pt where a = 5 and ctid = (select ctid from test_range_pt where a = 5 limit 1)) t2 on t1.a = t2.a where t1.a = 5; +RESET enable_incremental_sort; DROP SCHEMA row_partition_iterator_elimination CASCADE;
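
The regression changes above all follow the same pattern: seed a table whose rows arrive presorted on a leading key, toggle the new enable_incremental_sort GUC, and compare EXPLAIN output for an Incremental Sort node with its "Presorted Key" line. Below is a minimal standalone sketch of that pattern, not taken from the patch itself; the names demo and demo_a_idx are placeholders, and the exact plan shape depends on statistics and cost settings, so the planner may or may not pick the Incremental Sort path.

-- Minimal sketch (assumed objects: demo, demo_a_idx); mirrors the test pattern above.
create table demo(a int, b int);
create index demo_a_idx on demo(a);   -- btree on the leading key can supply presorted input
insert into demo select i / 100, i from generate_series(1, 10000) n(i);
analyze demo;

set enable_incremental_sort = off;
-- expected: an ordinary full Sort (Sort Key: a, b) above the scan
explain (costs off) select * from demo order by a, b limit 10;

set enable_incremental_sort = on;
-- may show: Incremental Sort, Sort Key: a, b, Presorted Key: a over an index scan on demo_a_idx
explain (costs off) select * from demo order by a, b limit 10;

reset enable_incremental_sort;
drop table demo;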