PostgreSQL的ExecHashJoin依赖其他函数的实现逻辑是什么

2024-04-02 19:55

短信预约 -IT技能 免费直播动态提醒

本篇内容介绍了“PostgreSQL的ExecHashJoin依赖其他函数的实现逻辑是什么”的有关知识，在实际案例的操作过程中，不少人都会遇到这样的困境，接下来就让小编带领大家学习一下如何处理这些情况吧！希望大家仔细阅读，能够学有所成！

一、数据结构

JoinState
Hash/NestLoop/Merge Join的基类


typedef struct JoinState
{
    PlanState   ps;//基类PlanState
    JoinType    jointype;//连接类型
    //在找到一个匹配inner tuple的时候,如需要跳转到下一个outer tuple,则该值为T
    bool        single_match;   
    //连接条件表达式(除了ps.qual)
    ExprState  *joinqual;       
} JoinState;

HashJoinState
Hash Join运行期状态结构体


typedef struct HashJoinTupleData *HashJoinTuple;
typedef struct HashJoinTableData *HashJoinTable;

typedef struct HashJoinState
{
    JoinState   js;             
    ExprState  *hashclauses;//hash连接条件
    List       *hj_OuterHashKeys;   
    List       *hj_InnerHashKeys;   
    List       *hj_HashOperators;   
    HashJoinTable hj_HashTable;//Hash表
    uint32      hj_CurHashValue;//当前的Hash值
    int         hj_CurBucketNo;//当前的bucket编号
    int         hj_CurSkewBucketNo;//行倾斜bucket编号
    HashJoinTuple hj_CurTuple;//当前元组
    TupleTableSlot *hj_OuterTupleSlot;//outer relation slot
    TupleTableSlot *hj_HashTupleSlot;//Hash tuple slot
    TupleTableSlot *hj_NullOuterTupleSlot;//用于外连接的outer虚拟slot
    TupleTableSlot *hj_NullInnerTupleSlot;//用于外连接的inner虚拟slot
    TupleTableSlot *hj_FirstOuterTupleSlot;//
    int         hj_JoinState;//JoinState状态
    bool        hj_MatchedOuter;//是否匹配
    bool        hj_OuterNotEmpty;//outer relation是否为空
} HashJoinState;

HashJoinTable
Hash表数据结构

typedef struct HashJoinTableData
{
    int         nbuckets;       
    int         log2_nbuckets;  

    int         nbuckets_original;  
    int         nbuckets_optimal;   
    int         log2_nbuckets_optimal;  

    
    //bucket [i]是内存中第i个桶中的元组链表的head item
    union
    {
        
        //未共享数组是按批处理存储的，所有元组均如此
        struct HashJoinTupleData **unshared;
        
        //共享数组是每个查询的DSA区域，所有元组均如此
        dsa_pointer_atomic *shared;
    }           buckets;

    bool        keepNulls;      

    bool        skewEnabled;    
    HashSkewBucket **skewBucket;    
    int         skewBucketLen;  
    int         nSkewBuckets;   
    int        *skewBucketNums; 

    int         nbatch;         
    int         curbatch;       

    int         nbatch_original;    
    int         nbatch_outstart;    

    bool        growEnabled;    

    double      totalTuples;    
    double      partialTuples;  
    double      skewTuples;     

    
    BufFile   **innerBatchFile; 
    BufFile   **outerBatchFile; 

    
    FmgrInfo   *outer_hashfunctions;    
    FmgrInfo   *inner_hashfunctions;    
    bool       *hashStrict;     

    Size        spaceUsed;      
    Size        spaceAllowed;   
    Size        spacePeak;      
    Size        spaceUsedSkew;  
    Size        spaceAllowedSkew;   

    MemoryContext hashCxt;      
    MemoryContext batchCxt;     

    
    //用于密集分配元组(到链接块中)
    HashMemoryChunk chunks;     

    
    //并行hash使用的共享和私有状态
    HashMemoryChunk current_chunk;  
    dsa_area   *area;           
    ParallelHashJoinState *parallel_state;//并行执行状态
    ParallelHashJoinBatchAccessor *batches;//并行访问器
    dsa_pointer current_chunk_shared;//当前chunk的开始指针
} HashJoinTableData;

typedef struct HashJoinTableData *HashJoinTable;

HashJoinTupleData
Hash连接元组数据







typedef struct HashJoinTupleData
{
    
    //link同一个桶中的下一个元组
    union
    {
        struct HashJoinTupleData *unshared;
        dsa_pointer shared;
    }           next;
    uint32      hashvalue;      
    
}           HashJoinTupleData;

#define HJTUPLE_OVERHEAD  MAXALIGN(sizeof(HashJoinTupleData))
#define HJTUPLE_MINTUPLE(hjtup)  \
    ((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))

二、源码解读

ExecScanHashBucket
搜索匹配当前outer relation tuple的hash桶,寻找匹配的inner relation元组。




bool
ExecScanHashBucket(HashJoinState *hjstate,
                   ExprContext *econtext)
{
    ExprState  *hjclauses = hjstate->hashclauses;//hash连接条件表达式
    HashJoinTable hashtable = hjstate->hj_HashTable;//Hash表
    HashJoinTuple hashTuple = hjstate->hj_CurTuple;//当前的Tuple
    uint32      hashvalue = hjstate->hj_CurHashValue;//hash值

    
    if (hashTuple != NULL)
        hashTuple = hashTuple->next.unshared;//hashTuple,通过指针获取下一个
    else if (hjstate->hj_CurSkewBucketNo != INVALID_SKEW_BUCKET_NO)
        //如为NULL,而且使用倾斜优化,则从倾斜桶中获取
        hashTuple = hashtable->skewBucket[hjstate->hj_CurSkewBucketNo]->tuples;
    else
        ////如为NULL,不使用倾斜优化,从常规的bucket中获取
        hashTuple = hashtable->buckets.unshared[hjstate->hj_CurBucketNo];

    while (hashTuple != NULL)//循环
    {
        if (hashTuple->hashvalue == hashvalue)//hash值一致
        {
            TupleTableSlot *inntuple;//inner tuple

            
            //把Hash表中的tuple插入到执行器的slot中,作为函数ExecQual的输入使用
            inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),
                                             hjstate->hj_HashTupleSlot,
                                             false);    
            econtext->ecxt_innertuple = inntuple;//赋值

            if (ExecQualAndReset(hjclauses, econtext))//判断连接条件是否满足
            {
                hjstate->hj_CurTuple = hashTuple;//满足,则赋值&返回T
                return true;
            }
        }

        hashTuple = hashTuple->next.unshared;//从Hash表中获取下一个tuple
    }

    
    return false;
}



TupleTableSlot *
ExecStoreMinimalTuple(MinimalTuple mtup,
                      TupleTableSlot *slot,
                      bool shouldFree)
{
    
    Assert(mtup != NULL);
    Assert(slot != NULL);
    Assert(slot->tts_tupleDescriptor != NULL);

    if (unlikely(!TTS_IS_MINIMALTUPLE(slot)))//类型检查
        elog(ERROR, "trying to store a minimal tuple into wrong type of slot");
    tts_minimal_store_tuple(slot, mtup, shouldFree);//存储

    return slot;//返回slot
}


static void
tts_minimal_store_tuple(TupleTableSlot *slot, MinimalTuple mtup, bool shouldFree)
{
    MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot;//获取slot

    tts_minimal_clear(slot);//清除原来的slot
    //安全检查
    Assert(!TTS_SHOULDFREE(slot));
    Assert(TTS_EMPTY(slot));
    //设置slot信息
    slot->tts_flags &= ~TTS_FLAG_EMPTY;
    slot->tts_nvalid = 0;
    mslot->off = 0;
    //存储到mslot中
    mslot->mintuple = mtup;
    Assert(mslot->tuple == &mslot->minhdr);
    mslot->minhdr.t_len = mtup->t_len + MINIMAL_TUPLE_OFFSET;
    mslot->minhdr.t_data = (HeapTupleHeader) ((char *) mtup - MINIMAL_TUPLE_OFFSET);
    
    //不需要设置t_sefl或者t_tableOid,因为不允许访问
    if (shouldFree)
        slot->tts_flags |= TTS_FLAG_SHOULDFREE;
    else
        Assert(!TTS_SHOULDFREE(slot));
}
 


#ifndef FRONTEND
static inline bool
ExecQualAndReset(ExprState *state, ExprContext *econtext)
{
    bool        ret = ExecQual(state, econtext);//调用ExecQual

    
    //内联ResetExprContext,避免在这个文件中的ordering
    MemoryContextReset(econtext->ecxt_per_tuple_memory);
    return ret;
}
#endif


#define HeapTupleHeaderSetMatch(tup) \
( \
  (tup)->t_infomask2 |= HEAP_TUPLE_HAS_MATCH \
)

三、跟踪分析

测试脚本如下

testdb=# set enable_nestloop=false;
SET
testdb=# set enable_mergejoin=false;
SET
testdb=# explain verbose select dw.*,grjf.grbh,grjf.xm,grjf.ny,grjf.je 
testdb-# from t_dwxx dw,lateral (select gr.grbh,gr.xm,jf.ny,jf.je 
testdb(#                         from t_grxx gr inner join t_jfxx jf 
testdb(#                                        on gr.dwbh = dw.dwbh 
testdb(#                                           and gr.grbh = jf.grbh) grjf
testdb-# order by dw.dwbh;
                                          QUERY PLAN                                           
-----------------------------------------------------------------------------------------------
 Sort  (cost=14828.83..15078.46 rows=99850 width=47)
   Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm, jf.ny, jf.je
   Sort Key: dw.dwbh
   ->  Hash Join  (cost=3176.00..6537.55 rows=99850 width=47)
         Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm, jf.ny, jf.je
         Hash Cond: ((gr.grbh)::text = (jf.grbh)::text)
         ->  Hash Join  (cost=289.00..2277.61 rows=99850 width=32)
               Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm
               Inner Unique: true
               Hash Cond: ((gr.dwbh)::text = (dw.dwbh)::text)
               ->  Seq Scan on public.t_grxx gr  (cost=0.00..1726.00 rows=100000 width=16)
                     Output: gr.dwbh, gr.grbh, gr.xm, gr.xb, gr.nl
               ->  Hash  (cost=164.00..164.00 rows=10000 width=20)
                     Output: dw.dwmc, dw.dwbh, dw.dwdz
                     ->  Seq Scan on public.t_dwxx dw  (cost=0.00..164.00 rows=10000 width=20)
                           Output: dw.dwmc, dw.dwbh, dw.dwdz
         ->  Hash  (cost=1637.00..1637.00 rows=100000 width=20)
               Output: jf.ny, jf.je, jf.grbh
               ->  Seq Scan on public.t_jfxx jf  (cost=0.00..1637.00 rows=100000 width=20)
                     Output: jf.ny, jf.je, jf.grbh
(20 rows)

启动gdb,设置断点

(gdb) b ExecScanHashBucket
Breakpoint 1 at 0x6ff25b: file nodeHash.c, line 1910.
(gdb) c
Continuing.

Breakpoint 1, ExecScanHashBucket (hjstate=0x2bb8738, econtext=0x2bb8950) at nodeHash.c:1910
1910        ExprState  *hjclauses = hjstate->hashclauses;

设置相关变量

1910        ExprState  *hjclauses = hjstate->hashclauses;
(gdb) n
1911        HashJoinTable hashtable = hjstate->hj_HashTable;
(gdb) 
1912        HashJoinTuple hashTuple = hjstate->hj_CurTuple;
(gdb) 
1913        uint32      hashvalue = hjstate->hj_CurHashValue;
(gdb) 
1922        if (hashTuple != NULL)

hash join连接条件

(gdb) p *hjclauses
$1 = {tag = {type = T_ExprState}, flags = 7 '\a', resnull = false, resvalue = 0, resultslot = 0x0, steps = 0x2bc4bc8, 
  evalfunc = 0x6d1a6e <ExecInterpExprStillValid>, expr = 0x2bb60c0, evalfunc_private = 0x6cf625 <ExecInterpExpr>, 
  steps_len = 7, steps_alloc = 16, parent = 0x2bb8738, ext_params = 0x0, innermost_caseval = 0x0, innermost_casenull = 0x0, 
  innermost_domainval = 0x0, innermost_domainnull = 0x0}

hash表

(gdb) p hashtable
$2 = (HashJoinTable) 0x2bc9de8
(gdb) p *hashtable
$3 = {nbuckets = 16384, log2_nbuckets = 14, nbuckets_original = 16384, nbuckets_optimal = 16384, 
  log2_nbuckets_optimal = 14, buckets = {unshared = 0x7f0fc1345050, shared = 0x7f0fc1345050}, keepNulls = false, 
  skewEnabled = false, skewBucket = 0x0, skewBucketLen = 0, nSkewBuckets = 0, skewBucketNums = 0x0, nbatch = 1, 
  curbatch = 0, nbatch_original = 1, nbatch_outstart = 1, growEnabled = true, totalTuples = 10000, partialTuples = 10000, 
  skewTuples = 0, innerBatchFile = 0x0, outerBatchFile = 0x0, outer_hashfunctions = 0x2bdc228, 
  inner_hashfunctions = 0x2bdc280, hashStrict = 0x2bdc2d8, spaceUsed = 677754, spaceAllowed = 16777216, spacePeak = 677754, 
  spaceUsedSkew = 0, spaceAllowedSkew = 335544, hashCxt = 0x2bdc110, batchCxt = 0x2bde120, chunks = 0x2c708f0, 
  current_chunk = 0x0, area = 0x0, parallel_state = 0x0, batches = 0x0, current_chunk_shared = 0}

hash桶中的元组&hash值

(gdb) p *hashTuple
Cannot access memory at address 0x0
(gdb) p hashvalue
$4 = 2324234220
(gdb)

从常规hash桶中获取hash元组

(gdb) n
1924        else if (hjstate->hj_CurSkewBucketNo != INVALID_SKEW_BUCKET_NO)
(gdb) p hjstate->hj_CurSkewBucketNo
$5 = -1
(gdb) n
1927            hashTuple = hashtable->buckets.unshared[hjstate->hj_CurBucketNo];
(gdb) 
1929        while (hashTuple != NULL)
(gdb) p hjstate->hj_CurBucketNo
$7 = 16364
(gdb) p *hashTuple
$6 = {next = {unshared = 0x0, shared = 0}, hashvalue = 1822113772}

判断hash值是否一致

(gdb) n
1931            if (hashTuple->hashvalue == hashvalue)
(gdb) p hashTuple->hashvalue
$8 = 1822113772
(gdb) p hashvalue
$9 = 2324234220
(gdb)

不一致,继续下一个元组

(gdb) n
1948            hashTuple = hashTuple->next.unshared;
(gdb) 
1929        while (hashTuple != NULL)

下一个元组为NULL,返回F,说明没有匹配的元组

(gdb) p *hashTuple
Cannot access memory at address 0x0
(gdb) n
1954        return false;

在ExecStoreMinimalTuple上设置断点(这时候Hash值是一致的)

(gdb) b ExecStoreMinimalTuple
Breakpoint 2 at 0x6e8cbf: file execTuples.c, line 427.
(gdb) c
Continuing.

Breakpoint 1, ExecScanHashBucket (hjstate=0x2bb8738, econtext=0x2bb8950) at nodeHash.c:1910
1910        ExprState  *hjclauses = hjstate->hashclauses;
(gdb) del 1
(gdb) c
Continuing.

Breakpoint 2, ExecStoreMinimalTuple (mtup=0x2be81b0, slot=0x2bb9c18, shouldFree=false) at execTuples.c:427
427     Assert(mtup != NULL);
(gdb) finish
Run till exit from #0  ExecStoreMinimalTuple (mtup=0x2be81b0, slot=0x2bb9c18, shouldFree=false) at execTuples.c:427
0x00000000006ff335 in ExecScanHashBucket (hjstate=0x2bb8738, econtext=0x2bb8950) at nodeHash.c:1936
1936                inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),
Value returned is $10 = (TupleTableSlot *) 0x2bb9c18
(gdb) n
1939                econtext->ecxt_innertuple = inntuple;

匹配成功,返回T

(gdb) n
1941                if (ExecQualAndReset(hjclauses, econtext))
(gdb) 
1943                    hjstate->hj_CurTuple = hashTuple;
(gdb) 
1944                    return true;
(gdb) 
1955    }
(gdb)

HJ_SCAN_BUCKET阶段,实现的逻辑是扫描Hash桶,寻找inner relation中与outer relation元组匹配的元组,如匹配,则把匹配的Tuple存储在hjstate->hj_CurTuple中.

“PostgreSQL的ExecHashJoin依赖其他函数的实现逻辑是什么”的内容就介绍到这里了，感谢大家的阅读。如果想了解更多行业相关的知识可以关注亿速云网站，小编将为大家输出更多高质量的实用文章！

免责声明：

① 本站未注明“稿件来源”的信息均来自网络整理。其文字、图片和音视频稿件的所属权归原作者所有。本站收集整理出于非商业性的教育和科研之目的，并不意味着本站赞同其观点或证实其内容的真实性。仅作为临时的测试数据，供内部测试之用。本站并未授权任何人以任何方式主动获取本站任何信息。

② 本站未注明“稿件来源”的临时测试数据将在测试完成后最终做删除处理。有问题或投稿请发送至: 邮箱/279061341@qq.com QQ/279061341

阅读原文内容投诉