How the Shared Buffer Works

Overview

stateDiagram-v2
    state Memory {
      shared_buffer --> buffered_io
      buffered_io --> shared_buffer

      buffered_io --> os_cache
      os_cache --> buffered_io

      os_cache --> data_file: fsync()
      data_file --> os_cache: fsync()

      shared_buffer: Shared Buffer
      buffered_io: Buffered I/O
      os_cache: OS Cache

      wal_buffers --> wal_file: fsync()

      wal_buffers: WAL BUFFERS
    }

    state Disk {
      data_file: DATA FILE
      wal_file: WAL FILE
    }

Three goals of the Shared Buffer for improving performance

  1. Access very large buffers (tens to hundreds of GB) quickly
  2. Minimize contention when many users access the buffer concurrently
  3. Keep frequently used blocks in the buffer for as long as possible

Shared Buffer structure

  1. Hash table
  2. Hash table elements and element keys
  3. Buffer descriptors: manage buffer state
  4. Buffer pool: holds the actual blocks (the sketch below shows how these pieces relate)
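
The relationship between the four structures can be pictured with the deliberately simplified declarations below. The names (TagSketch, HashEntrySketch, DescSketch, NBUFFERS) are made up for illustration only; the real PostgreSQL definitions appear in the sections that follow.

// Illustrative sketch only -- not PostgreSQL's actual declarations
#include <stdint.h>

#define NBUFFERS 32768                 /* e.g. shared_buffers = 256MB, 8KB blocks */
#define BLCKSZ   8192

typedef struct                         /* which disk block: relation + fork + block */
{
  uint32_t spcNode, dbNode, relNode;
  int      forkNum;
  uint32_t blockNum;
} TagSketch;

typedef struct                         /* hash table entry: tag -> buffer id */
{
  TagSketch key;
  int       buf_id;                    /* index into the two arrays below */
} HashEntrySketch;

typedef struct                         /* buffer descriptor: state of one buffer */
{
  TagSketch tag;
  uint32_t  state;                     /* refcount | usage count | flag bits */
} DescSketch;

static DescSketch descriptors[NBUFFERS];     /* 3. buffer descriptor array */
static char       pool[NBUFFERS][BLCKSZ];    /* 4. buffer pool: the 8KB blocks */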

Shared resources

Hash table

stateDiagram-v2
  bucket0 --> directory0
  bucket1 --> directory1
  bucket2 --> directory2
  bucket3 --> directory3
  bucketn_1 --> directoryn_1
  bucketn --> directoryn
  
  state HashTable {
    bucket0: Bucket 0 [0-255]
    bucket1: Bucket 1 [0-255]
    bucket2: Bucket 2 [0-255]
    bucket3: Bucket 3 [0-255]
    bucketn_1: Bucket N-1 [0-255]
    bucketn: Bucket N [0-255]
  }

  state Directory{
    directory0: Directory
    directory1: Directory
    directory2: Directory
    directory3: Directory
    directoryn_1: Directory
    directoryn: Directory
  }

Hash table size and hash segments

// src/backend/utils/hash/dynahash.c
#define DEF_SEGSIZE         256
#define DEF_SEGSIZE_SHIFT   8 /* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE         256
#define DEF_FFACTOR         1 /* default fill factor */

/* Number of freelists to be used for a partitioned hash table. */
#define NUM_FREELISTS       32

// src/include/storage/lwlock.h
/* Number of partitions of the shared buffer mapping hashtable */
#define NUM_BUFFER_PARTITIONS  128
// src/backend/utils/hash/dynahash.c
/*
 * Top control structure for a hashtable --- in a shared table, each backend
 * has its own copy (OK since no fields change at runtime)
 */
struct HTAB
{
  HASHHDR           *hctl;        /* => shared control information */
  HASHSEGMENT       *dir;         /* directory of segment starts */
  HashValueFunc     hash;         /* hash function */
  HashCompareFunc   match;        /* key comparison function */
  HashCopyFunc      keycopy;      /* key copying function */
  HashAllocFunc     alloc;        /* memory allocator */
  MemoryContext     hcxt;         /* memory context if default allocator used */
  char              *tabname;     /* table name (for error messages) */
  bool              isshared;     /* true if table is in shared memory */
  bool              isfixed;      /* if true, don't enlarge */

  /* freezing a shared table isn't allowed, so we can keep state here */
  bool              frozen;       /* true = no more inserts allowed */

  /* We keep local copies of these fixed values to reduce contention */
  Size              keysize;      /* hash key length in bytes */
  long              ssize;        /* segment size --- must be power of 2 */
  int               sshift;       /* segment shift = log2(ssize) */
};

/* calculate first power of 2 >= num, bounded to what will fit in an int */
static int
next_pow2_int(long num)
{
 if (num > INT_MAX / 2)
  num = INT_MAX / 2;
 return 1 << my_log2(num); /* my_log2: calculate ceil(log base 2) of num */
}

/*
 * Compute derived fields of hctl and build the initial directory/segment
 * arrays
 */
static bool
init_htab(HTAB *hashp, long nelem)
{
  HASHHDR     *hctl = hashp->hctl;
  HASHSEGMENT *segp;
  int         nbuckets;
  int         nsegs;
  int         i;

  /*
    * initialize mutexes if it's a partitioned table
    */
  if (IS_PARTITIONED(hctl))
    for (i = 0; i < NUM_FREELISTS; i++)
      SpinLockInit(&(hctl->freeList[i].mutex));

  /*
    * Divide number of elements by the fill factor to determine a desired
    * number of buckets.  Allocate space for the next greater power of two
    * number of buckets
    */
  nbuckets = next_pow2_int((nelem - 1) / hctl->ffactor + 1);

  /*
    * In a partitioned table, nbuckets must be at least equal to
    * num_partitions; were it less, keys with apparently different partition
    * numbers would map to the same bucket, breaking partition independence.
    * (Normally nbuckets will be much bigger; this is just a safety check.)
    */
  while (nbuckets < hctl->num_partitions)
    nbuckets <<= 1;

  hctl->max_bucket = hctl->low_mask = nbuckets - 1;
  hctl->high_mask = (nbuckets << 1) - 1;

  /*
    * Figure number of directory segments needed, round up to a power of 2
    */
  nsegs = (nbuckets - 1) / hctl->ssize + 1;
  nsegs = next_pow2_int(nsegs);

  /*
    * Make sure directory is big enough. If pre-allocated directory is too
    * small, choke (caller screwed up).
    */
  if (nsegs > hctl->dsize)
  {
    if (!(hashp->dir))
      hctl->dsize = nsegs;
    else
      return false;
  }

  /* Allocate a directory */
  if (!(hashp->dir))
  {
    CurrentDynaHashCxt = hashp->hcxt;
    hashp->dir = (HASHSEGMENT *)
      hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
    if (!hashp->dir)
      return false;
  }

  /* Allocate initial segments */
  for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
  {
    *segp = seg_alloc(hashp);
    if (*segp == NULL)
      return false;
  }

  /* Choose number of entries to allocate at a time */
  hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);

  return true;
}

Buffer partition?

Number of buffer partitions

Interim summary of hash table sizing

Hash elements

Components

  1. Element
  2. Element key

Element fields

/********** src/include/utils/hsearch.h **********/
/*
 * HASHELEMENT is the private part of a hashtable entry.  The caller's data
 * follows the HASHELEMENT structure (on a MAXALIGN'd boundary).  The hash key
 * is expected to be at the start of the caller's hash entry data structure.
 */
typedef struct HASHELEMENT
{
 struct HASHELEMENT *link; /* link to next entry in same bucket */
 uint32  hashvalue;  /* hash function result for this entry */
} HASHELEMENT;
/********** src/backend/utils/hash/dynahash.c **********/
/*
 * get_hash_value -- exported routine to calculate a key's hash value
 *
 * We export this because for partitioned tables, callers need to compute
 * the partition number (from the low-order bits of the hash value) before
 * searching.
 */
uint32
get_hash_value(HTAB *hashp, const void *keyPtr)
{
 return hashp->hash(keyPtr, hashp->keysize);
}

/********** src/backend/storage/buffer/buf_table.c **********/
/*
 * BufTableHashCode
 *  Compute the hash code associated with a BufferTag
 *
 * This must be passed to the lookup/insert/delete routines along with the
 * tag.  We do it like this because the callers need to know the hash code
 * in order to determine which buffer partition to lock, and we don't want
 * to do the hash computation twice (hash_any is a bit slow).
 */
uint32
BufTableHashCode(BufferTag *tagPtr)
{
 return get_hash_value(SharedBufHash, (void *) tagPtr);
}
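
The hash value returned here also decides which buffer partition, and therefore which LWLock, protects the tag. The mapping in buf_internals.h looks roughly like the following (abridged, so treat the exact form as approximate):

/***** src/include/storage/buf_internals.h (abridged) *****/
#define BufTableHashPartition(hashcode) \
 ((hashcode) % NUM_BUFFER_PARTITIONS)

#define BufMappingPartitionLock(hashcode) \
 (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
  BufTableHashPartition(hashcode)].lock)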

Element key fields

/***** src/backend/storage/buffer/buf_table.c *****/
/* entry for buffer lookup hashtable */
typedef struct
{
 BufferTag key;   /* Tag of a disk page */
 int   id;    /* Associated buffer ID */
} BufferLookupEnt;


/***** src/include/storage/buf_internals.h *****/
/*
 * Buffer tag identifies which disk block the buffer contains.
 *
 * Note: the BufferTag data must be sufficient to determine where to write the
 * block, without reference to pg_class or pg_tablespace entries.  It's
 * possible that the backend flushing the buffer doesn't even believe the
 * relation is visible yet (its xact may have started before the xact that
 * created the rel).  The storage manager must be able to cope anyway.
 *
 * Note: if there's any pad bytes in the struct, INIT_BUFFERTAG will have
 * to be fixed to zero them, since this struct is used as a hash key.
 */
typedef struct buftag
{
 RelFileNode rnode;   /* physical relation identifier */
 ForkNumber forkNum;
 BlockNumber blockNum;  /* blknum relative to begin of reln */
} BufferTag;


/***** src/include/storage/relfilenode.h *****/
/*
 * RelFileNode must provide all that we need to know to physically access
 * a relation, with the exception of the backend ID, which can be provided
 * separately. Note, however, that a "physical" relation is comprised of
 * multiple files on the filesystem, as each fork is stored as a separate
 * file, and each fork can be divided into multiple segments. See md.c.
 *
 * spcNode identifies the tablespace of the relation.  It corresponds to
 * pg_tablespace.oid.
 *
 * dbNode identifies the database of the relation.  It is zero for
 * "shared" relations (those common to all databases of a cluster).
 * Nonzero dbNode values correspond to pg_database.oid.
 *
 * relNode identifies the specific relation.  relNode corresponds to
 * pg_class.relfilenode (NOT pg_class.oid, because we need to be able
 * to assign new physical files to relations in some situations).
 * Notice that relNode is only unique within a database in a particular
 * tablespace.
 *
 * Note: spcNode must be GLOBALTABLESPACE_OID if and only if dbNode is
 * zero.  We support shared relations only in the "global" tablespace.
 *
 * Note: in pg_class we allow reltablespace == 0 to denote that the
 * relation is stored in its database's "default" tablespace (as
 * identified by pg_database.dattablespace).  However this shorthand
 * is NOT allowed in RelFileNode structs --- the real tablespace ID
 * must be supplied when setting spcNode.
 *
 * Note: in pg_class, relfilenode can be zero to denote that the relation
 * is a "mapped" relation, whose current true filenode number is available
 * from relmapper.c.  Again, this case is NOT allowed in RelFileNodes.
 *
 * Note: various places use RelFileNode in hashtable keys.  Therefore,
 * there *must not* be any unused padding bytes in this struct.  That
 * should be safe as long as all the fields are of type Oid.
 */
typedef struct RelFileNode
{
 Oid   spcNode;  /* tablespace */
 Oid   dbNode;   /* database */
 Oid   relNode;  /* relation */
} RelFileNode;

BufferTag?
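
A BufferTag is therefore just (RelFileNode, fork number, block number). Below is a minimal sketch of building one the way bufmgr.c does with the INIT_BUFFERTAG macro; the OIDs (1663, 16384, 16385) and the block number are made-up example values.

#include "storage/buf_internals.h"   /* BufferTag, INIT_BUFFERTAG, RelFileNode, MAIN_FORKNUM */

static void
build_buffer_tag_example(void)
{
  /* hypothetical identifiers, for illustration only */
  RelFileNode rnode = { .spcNode = 1663, .dbNode = 16384, .relNode = 16385 };
  BufferTag   tag;

  INIT_BUFFERTAG(tag, rnode, MAIN_FORKNUM, 0);   /* block 0 of the relation's main fork */
}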

How hash element memory is allocated

/***** src/backend/utils/hash/dynahash.c *****/
/*
 * Per-freelist data.
 *
 * In a partitioned hash table, each freelist is associated with a specific
 * set of hashcodes, as determined by the FREELIST_IDX() macro below.
 * nentries tracks the number of live hashtable entries having those hashcodes
 * (NOT the number of entries in the freelist, as you might expect).
 *
 * The coverage of a freelist might be more or less than one partition, so it
 * needs its own lock rather than relying on caller locking.  Relying on that
 * wouldn't work even if the coverage was the same, because of the occasional
 * need to "borrow" entries from another freelist; see get_hash_entry().
 *
 * Using an array of FreeListData instead of separate arrays of mutexes,
 * nentries and freeLists helps to reduce sharing of cache lines between
 * different mutexes.
 */
typedef struct
{
 slock_t  mutex;   /* spinlock for this freelist */
 long  nentries;  /* number of entries in associated buckets */
 HASHELEMENT *freeList;  /* chain of free elements */
} FreeListData;

/*
 * Header structure for a hash table --- contains all changeable info
 *
 * In a shared-memory hash table, the HASHHDR is in shared memory, while
 * each backend has a local HTAB struct.  For a non-shared table, there isn't
 * any functional difference between HASHHDR and HTAB, but we separate them
 * anyway to share code between shared and non-shared tables.
 */
struct HASHHDR
{
 /*
  * The freelist can become a point of contention in high-concurrency hash
  * tables, so we use an array of freelists, each with its own mutex and
  * nentries count, instead of just a single one.  Although the freelists
  * normally operate independently, we will scavenge entries from freelists
  * other than a hashcode's default freelist when necessary.
  *
  * If the hash table is not partitioned, only freeList[0] is used and its
  * spinlock is not used at all; callers' locking is assumed sufficient.
  */
 FreeListData freeList[NUM_FREELISTS];
 /* ... remaining fields omitted ... */
};

Multiple freeLists

[
  freeList0 => [
    [
      el4099_key,
      el4099 => [
        /* ... 4,100 elements in total ... */
        el0_key,
        el0
      ]
    ]
  ],
  /* ... 32 freelists in total ... */
  freeList31 => [
    [
      el4099_key,
      el4099 => [
        /* ... 4,100 elements in total ... */
        el0_key,
        el0
      ]
    ]
  ]
]
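
Which of the 32 freelists a given entry uses is decided by the FREELIST_IDX() macro referred to in the FreeListData comment above. In dynahash.c it is essentially the hash code modulo NUM_FREELISTS (shown slightly abridged):

/***** src/backend/utils/hash/dynahash.c (abridged) *****/
#define FREELIST_IDX(hctl, hashcode) \
 (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)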

Buffer descriptor

// src/include/storage/buf_internals.h
/*
 * BufferDesc -- shared descriptor/state data for a single shared buffer.
 *
 * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
 * the tag, state or wait_backend_pid fields.  In general, buffer header lock
 * is a spinlock which is combined with flags, refcount and usagecount into
 * single atomic variable.  This layout allow us to do some operations in a
 * single atomic operation, without actually acquiring and releasing spinlock;
 * for instance, increase or decrease refcount.  buf_id field never changes
 * after initialization, so does not need locking.  freeNext is protected by
 * the buffer_strategy_lock not buffer header lock.  The LWLock can take care
 * of itself.  The buffer header lock is *not* used to control access to the
 * data in the buffer!
 *
 * It's assumed that nobody changes the state field while buffer header lock
 * is held.  Thus buffer header lock holder can do complex updates of the
 * state variable in single write, simultaneously with lock release (cleaning
 * BM_LOCKED flag).  On the other hand, updating of state without holding
 * buffer header lock is restricted to CAS, which insure that BM_LOCKED flag
 * is not set.  Atomic increment/decrement, OR/AND etc. are not allowed.
 *
 * An exception is that if we have the buffer pinned, its tag can't change
 * underneath us, so we can examine the tag without locking the buffer header.
 * Also, in places we do one-time reads of the flags without bothering to
 * lock the buffer header; this is generally for situations where we don't
 * expect the flag bit being tested to be changing.
 *
 * We can't physically remove items from a disk page if another backend has
 * the buffer pinned.  Hence, a backend may need to wait for all other pins
 * to go away.  This is signaled by storing its own PID into
 * wait_backend_pid and setting flag bit BM_PIN_COUNT_WAITER.  At present,
 * there can be only one such waiter per buffer.
 *
 * We use this same struct for local buffer headers, but the locks are not
 * used and not all of the flag bits are useful either. To avoid unnecessary
 * overhead, manipulations of the state field should be done without actual
 * atomic operations (i.e. only pg_atomic_read_u32() and
 * pg_atomic_unlocked_write_u32()).
 *
 * Be careful to avoid increasing the size of the struct when adding or
 * reordering members.  Keeping it below 64 bytes (the most common CPU
 * cache line size) is fairly important for performance.
 */
typedef struct BufferDesc
{
 BufferTag tag;   /* ID of page contained in buffer */
 int   buf_id;   /* buffer's index number (from 0) */

 /* state of the tag, containing flags, refcount and usagecount */
 pg_atomic_uint32 state;

 int   wait_backend_pid; /* backend PID of pin-count waiter */
 int   freeNext;  /* link in freelist chain */

 LWLock  content_lock; /* to lock access to buffer contents */
} BufferDesc;


// src/include/storage/buf_internals.h
/*
 * Buffer tag identifies which disk block the buffer contains.
 *
 * Note: the BufferTag data must be sufficient to determine where to write the
 * block, without reference to pg_class or pg_tablespace entries.  It's
 * possible that the backend flushing the buffer doesn't even believe the
 * relation is visible yet (its xact may have started before the xact that
 * created the rel).  The storage manager must be able to cope anyway.
 *
 * Note: if there's any pad bytes in the struct, INIT_BUFFERTAG will have
 * to be fixed to zero them, since this struct is used as a hash key.
 */
typedef struct buftag
{
 RelFileNode rnode;   /* physical relation identifier */
 ForkNumber forkNum;
 BlockNumber blockNum;  /* blknum relative to begin of reln */
} BufferTag;

state

// src/include/storage/buf_internals.h
/*
 * Buffer state is a single 32-bit variable where following data is combined.
 *
 * - 18 bits refcount
 * - 4 bits usage count
 * - 10 bits of flags
 *
 * Combining these values allows to perform some operations without locking
 * the buffer header, by modifying them together with a CAS loop.
 *
 * The definition of buffer state components is below.
 */
#define BUF_REFCOUNT_ONE 1
#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
#define BUF_USAGECOUNT_MASK 0x003C0000U
#define BUF_USAGECOUNT_ONE (1U << 18)
#define BUF_USAGECOUNT_SHIFT 18
#define BUF_FLAG_MASK 0xFFC00000U

/* Get refcount and usagecount from buffer state */
#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
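
Since refcount, usage count, and flags are packed into a single 32-bit word, reading them is just masking and shifting. The following self-contained example applies the macros to a made-up state value (the macros are repeated here so the example compiles on its own):

#include <stdio.h>
#include <stdint.h>

#define BUF_REFCOUNT_MASK     ((1U << 18) - 1)
#define BUF_USAGECOUNT_MASK   0x003C0000U
#define BUF_USAGECOUNT_SHIFT  18
#define BUF_STATE_GET_REFCOUNT(state)   ((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)

int main(void)
{
 /* hypothetical state: refcount = 2, usagecount = 3, BM_TAG_VALID | BM_VALID set */
 uint32_t state = 2 | (3U << BUF_USAGECOUNT_SHIFT) | (1U << 25) | (1U << 24);

 printf("refcount   = %u\n", (unsigned) BUF_STATE_GET_REFCOUNT(state));   /* 2 */
 printf("usagecount = %u\n", (unsigned) BUF_STATE_GET_USAGECOUNT(state)); /* 3 */
 printf("dirty      = %s\n", (state & (1U << 23)) ? "yes" : "no");        /* no (BM_DIRTY not set) */
 return 0;
}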

flag

/*
 * Flags for buffer descriptors
 *
 * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
 * entry associated with the buffer's tag.
 */
#define BM_LOCKED    (1U << 22) /* buffer header is locked */
#define BM_DIRTY    (1U << 23) /* data needs writing */
#define BM_VALID    (1U << 24) /* data is valid */
#define BM_TAG_VALID   (1U << 25) /* tag is assigned */
#define BM_IO_IN_PROGRESS  (1U << 26) /* read or write in progress */
#define BM_IO_ERROR    (1U << 27) /* previous I/O failed */
#define BM_JUST_DIRTIED   (1U << 28) /* dirtied since write started */
#define BM_PIN_COUNT_WAITER  (1U << 29) /* have waiter for sole pin */
#define BM_CHECKPOINT_NEEDED (1U << 30) /* must write for checkpoint */
#define BM_PERMANENT   (1U << 31) /* permanent buffer (not unlogged, or init fork) */
|    | flag                 | Meaning                                       | Bit operation | Bit pattern                             | Decimal       |
|----|----------------------|-----------------------------------------------|---------------|-----------------------------------------|---------------|
| 1  | BM_LOCKED            | buffer header is locked                       | (1U << 22)    | 0000 0000 0100 0000 0000 0000 0000 0000 | 4,194,304     |
| 2  | BM_DIRTY             | data needs writing                            | (1U << 23)    | 0000 0000 1000 0000 0000 0000 0000 0000 | 8,388,608     |
| 3  | BM_VALID             | data is valid                                 | (1U << 24)    | 0000 0001 0000 0000 0000 0000 0000 0000 | 16,777,216    |
| 4  | BM_TAG_VALID         | tag is assigned                               | (1U << 25)    | 0000 0010 0000 0000 0000 0000 0000 0000 | 33,554,432    |
| 5  | BM_IO_IN_PROGRESS    | read or write in progress                     | (1U << 26)    | 0000 0100 0000 0000 0000 0000 0000 0000 | 67,108,864    |
| 6  | BM_IO_ERROR          | previous I/O failed                           | (1U << 27)    | 0000 1000 0000 0000 0000 0000 0000 0000 | 134,217,728   |
| 7  | BM_JUST_DIRTIED      | dirtied since write started                   | (1U << 28)    | 0001 0000 0000 0000 0000 0000 0000 0000 | 268,435,456   |
| 8  | BM_PIN_COUNT_WAITER  | a waiter exists for sole pin                  | (1U << 29)    | 0010 0000 0000 0000 0000 0000 0000 0000 | 536,870,912   |
| 9  | BM_CHECKPOINT_NEEDED | must be written out for the checkpoint        | (1U << 30)    | 0100 0000 0000 0000 0000 0000 0000 0000 | 1,073,741,824 |
| 10 | BM_PERMANENT         | permanent buffer (not unlogged, or init fork) | (1U << 31)    | 1000 0000 0000 0000 0000 0000 0000 0000 | 2,147,483,648 |

freeNext

Initial state at startup
graph LR
  subgraph BufferStrategyControl
  firstFreeBuffer[firstFreeBuffer = 0] --- lastFreeBuffer[lastFreeBuffer = 131,071]
  end
  0["[0]<br>freeNext 1"] --> 1["[1]<br>freeNext 2"] --> 2["[2]<br>freeNext 3"] --> id1[...] --> 131,070["[131,070]<br>freeNext 131,071"] --> 131,071["[131,071]<br>freeNext 131,072"]
  0 --> firstFreeBuffer
  131,071 --> lastFreeBuffer


After one buffer has been allocated
graph LR
  subgraph BufferStrategyControl
  firstFreeBuffer[firstFreeBuffer = 1] --- lastFreeBuffer[lastFreeBuffer = 131,071]
  end
  0["[0]"] 
  1["[1]<br>freeNext 2"] --> 2["[2]<br>freeNext 3"] --> id1[...] --> 131,070["[131,070]<br>freeNext 131,071"] --> 131,071["[131,071]<br>freeNext 131,072"]
  1 --> firstFreeBuffer
  131,071 --> lastFreeBuffer
After 131,071 buffers have been allocated
graph LR
  subgraph BufferStrategyControl
  firstFreeBuffer[firstFreeBuffer = 131,070] --- lastFreeBuffer[lastFreeBuffer = 131,071]
  end
  0["[0]"] 
  1["[1]"]
  2["[2]"]
  id1[...]
  131,070["[131,070]<br>freeNext 131,071"] --> 131,071["[131,071]<br>freeNext 131,072"]
  131,070 --> firstFreeBuffer
  131,071 --> lastFreeBuffer
After 131,072 buffers have been allocated
graph LR
  subgraph BufferStrategyControl
  firstFreeBuffer[firstFreeBuffer = -1] --- lastFreeBuffer[lastFreeBuffer = 131,071]
  end
  0["[0]"] 
  1["[1]"]
  2["[2]"]
  id1[...]
  131,070["[131,070]"]
  131,071["[131,071]"]
  131,071 --> lastFreeBuffer
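
The chain shown in these diagrams is built once at startup by InitBufferPool() in buf_init.c: every descriptor's freeNext points at the next buffer, and the last entry is corrected to FREENEXT_END_OF_LIST (-1). Roughly (abridged):

/***** src/backend/storage/buffer/buf_init.c, InitBufferPool() (abridged) *****/
for (i = 0; i < NBuffers; i++)
{
 BufferDesc *buf = GetBufferDescriptor(i);

 CLEAR_BUFFERTAG(buf->tag);
 pg_atomic_init_u32(&buf->state, 0);
 buf->buf_id = i;

 /* Initially link all the buffers together as unused. */
 buf->freeNext = i + 1;

 /* ... content/IO LWLock initialization omitted ... */
}

/* Correct last entry of linked list */
GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;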

Spinlock and light-weight lock

| PostgreSQL        | Oracle |
|-------------------|--------|
| spinlock          | mutex  |
| light-weight lock | latch  |

Differences between spinlock and light-weight lock

| Item              | spinlock                                 | light-weight lock              |
|-------------------|------------------------------------------|--------------------------------|
| Overhead          | extremely low                            | very low                       |
| Context switching | does not occur                           | can occur                      |
| Mechanism         | spinning                                 | wait queue & posting           |
| Typical use       | protecting a variable within a structure | protecting an entire structure |
| Lock modes        | EXCLUSIVE only                           | SHARE & EXCLUSIVE              |

spinlock

Spinlock implementation approaches

|   | Implementation                                 | DBMS       |
|---|------------------------------------------------|------------|
| 1 | Uses a mutex                                   | Oracle     |
| 2 | Implements TAS (Test & Set) in inline assembly | PostgreSQL |

// src/include/storage/s_lock.h
#ifdef __x86_64__  /* AMD Opteron, Intel EM64T */
#define HAS_TEST_AND_SET

typedef unsigned char slock_t;

#define TAS(lock) tas(lock)

/*
 * On Intel EM64T, it's a win to use a non-locking test before the xchg proper,
 * but only when spinning.
 *
 * See also Implementing Scalable Atomic Locks for Multi-Core Intel(tm) EM64T
 * and IA32, by Michael Chynoweth and Mary R. Lee. As of this writing, it is
 * available at:
 * http://software.intel.com/en-us/articles/implementing-scalable-atomic-locks-for-multi-core-intel-em64t-and-ia32-architectures
 */
#define TAS_SPIN(lock)    (*(lock) ? 1 : TAS(lock))

static __inline__ int
tas(volatile slock_t *lock)
{
 register slock_t _res = 1;

 __asm__ __volatile__(
  " lock   \n"
  " xchgb %0,%1 \n"
:  "+q"(_res), "+m"(*lock)
:  /* no inputs */
:  "memory", "cc");
 return (int) _res;
}

#define SPIN_DELAY() spin_delay()

static __inline__ void
spin_delay(void)
{
 /*
  * Adding a PAUSE in the spin delay loop is demonstrably a no-op on
  * Opteron, but it may be of some use on EM64T, so we keep it.
  */
 __asm__ __volatile__(
  " rep; nop   \n");
}

#endif  /* __x86_64__ */
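
Conceptually, a backend takes a spinlock by looping on TAS_SPIN() and executing SPIN_DELAY() between attempts. The real loop is s_lock() in src/backend/storage/lmgr/s_lock.c, which also adds backoff, sleeps, and a stuck-spinlock check, so the following is only a simplified sketch of the idea:

/* Simplified sketch -- the real implementation is s_lock() in s_lock.c */
static void
spin_acquire_sketch(volatile slock_t *lock)
{
 while (TAS_SPIN(lock))    /* cheap non-locking test first, then atomic xchg */
  SPIN_DELAY();            /* "rep; nop" (PAUSE) hint while busy-waiting */
}

static void
spin_release_sketch(volatile slock_t *lock)
{
 *lock = 0;                /* releasing is a plain store */
}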
Hardware Synchronization Algorithms : Unlock and Lock, Test and Set, Swap
// Shared lock variable, initialized to false
bool lock = false;

bool TestAndSet(bool *target)
{
    bool rv = *target;       /* remember the previous value         */
    *target = true;          /* unconditionally take the lock       */
    return rv;               /* false means the caller got the lock */
}

while (1)
{
    while (TestAndSet(&lock))
        ;                    /* spin until the lock becomes free */
    /* critical section */
    lock = false;            /* release the lock */
    /* remainder section */
}
Mutex lock for Linux Thread Synchronization

light-weight lock

Light-weight lock acquisition procedure
// src/backend/storage/lmgr/lwlock.c
/*
 * LWLockAcquire - acquire a lightweight lock in the specified mode
 *
 * If the lock is not available, sleep until it is.  Returns true if the lock
 * was available immediately, false if we had to sleep.
 *
 * Side effect: cancel/die interrupts are held off until lock release.
 */
bool
LWLockAcquire(LWLock *lock, LWLockMode mode)
{
 PGPROC    *proc = MyProc;
 bool  result = true;
 int   extraWaits = 0;
#ifdef LWLOCK_STATS
 lwlock_stats *lwstats;

 lwstats = get_lwlock_stats_entry(lock);
#endif

 AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE);

 PRINT_LWDEBUG("LWLockAcquire", lock, mode);

#ifdef LWLOCK_STATS
 /* Count lock acquisition attempts */
 if (mode == LW_EXCLUSIVE)
  lwstats->ex_acquire_count++;
 else
  lwstats->sh_acquire_count++;
#endif       /* LWLOCK_STATS */

 /*
  * We can't wait if we haven't got a PGPROC.  This should only occur
  * during bootstrap or shared memory initialization.  Put an Assert here
  * to catch unsafe coding practices.
  */
 Assert(!(proc == NULL && IsUnderPostmaster));

 /* Ensure we will have room to remember the lock */
 if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
  elog(ERROR, "too many LWLocks taken");

 /*
  * Lock out cancel/die interrupts until we exit the code section protected
  * by the LWLock.  This ensures that interrupts will not interfere with
  * manipulations of data structures in shared memory.
  */
 HOLD_INTERRUPTS();

 /*
  * Loop here to try to acquire lock after each time we are signaled by
  * LWLockRelease.
  *
  * NOTE: it might seem better to have LWLockRelease actually grant us the
  * lock, rather than retrying and possibly having to go back to sleep. But
  * in practice that is no good because it means a process swap for every
  * lock acquisition when two or more processes are contending for the same
  * lock.  Since LWLocks are normally used to protect not-very-long
  * sections of computation, a process needs to be able to acquire and
  * release the same lock many times during a single CPU time slice, even
  * in the presence of contention.  The efficiency of being able to do that
  * outweighs the inefficiency of sometimes wasting a process dispatch
  * cycle because the lock is not free when a released waiter finally gets
  * to run.  See pgsql-hackers archives for 29-Dec-01.
  */
 for (;;)
 {
  bool  mustwait;

  /*
   * Try to grab the lock the first time, we're not in the waitqueue
   * yet/anymore.
   */
  mustwait = LWLockAttemptLock(lock, mode);

  if (!mustwait)
  {
   LOG_LWDEBUG("LWLockAcquire", lock, "immediately acquired lock");
   break;    /* got the lock */
  }

  /*
   * Ok, at this point we couldn't grab the lock on the first try. We
   * cannot simply queue ourselves to the end of the list and wait to be
   * woken up because by now the lock could long have been released.
   * Instead add us to the queue and try to grab the lock again. If we
   * succeed we need to revert the queuing and be happy, otherwise we
   * recheck the lock. If we still couldn't grab it, we know that the
   * other locker will see our queue entries when releasing since they
   * existed before we checked for the lock.
   */

  /* add to the queue */
  LWLockQueueSelf(lock, mode);

  /* we're now guaranteed to be woken up if necessary */
  mustwait = LWLockAttemptLock(lock, mode);

  /* ok, grabbed the lock the second time round, need to undo queueing */
  if (!mustwait)
  {
   LOG_LWDEBUG("LWLockAcquire", lock, "acquired, undoing queue");

   LWLockDequeueSelf(lock);
   break;
  }

  /*
   * Wait until awakened.
   *
   * Since we share the process wait semaphore with the regular lock
   * manager and ProcWaitForSignal, and we may need to acquire an LWLock
   * while one of those is pending, it is possible that we get awakened
   * for a reason other than being signaled by LWLockRelease. If so,
   * loop back and wait again.  Once we've gotten the LWLock,
   * re-increment the sema by the number of additional signals received,
   * so that the lock manager or signal manager will see the received
   * signal when it next waits.
   */
  LOG_LWDEBUG("LWLockAcquire", lock, "waiting");

#ifdef LWLOCK_STATS
  lwstats->block_count++;
#endif

  LWLockReportWaitStart(lock);
  if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
   TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);

  for (;;)
  {
   PGSemaphoreLock(proc->sem);
   if (!proc->lwWaiting)
    break;
   extraWaits++;
  }

  /* Retrying, allow LWLockRelease to release waiters again. */
  pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);

#ifdef LOCK_DEBUG
  {
   /* not waiting anymore */
   uint32  nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);

   Assert(nwaiters < MAX_BACKENDS);
  }
#endif

  if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
   TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode);
  LWLockReportWaitEnd();

  LOG_LWDEBUG("LWLockAcquire", lock, "awakened");

  /* Now loop back and try to acquire lock again. */
  result = false;
 }

 if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_ENABLED())
  TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), mode);

 /* Add lock to list of locks held by this backend */
 held_lwlocks[num_held_lwlocks].lock = lock;
 held_lwlocks[num_held_lwlocks++].mode = mode;

 /*
  * Fix the process wait semaphore's count for any absorbed wakeups.
  */
 while (extraWaits-- > 0)
  PGSemaphoreUnlock(proc->sem);

 return result;
}

Reading a buffer from the Shared Buffer

Case: reading a block that is already in the Shared Buffer

The figures below assume shared_buffers = 256MB with the default 8KB block size.

| Item                          | Calculation                                                           | Value             |
|-------------------------------|-----------------------------------------------------------------------|-------------------|
| Number of buffers             | 256 MiB ($256 \times 1,024 \times 1,024$) / 8 KiB ($8 \times 1,024$)  | 32,768            |
| Number of buffer partitions   | #define NUM_BUFFER_PARTITIONS 128                                     | 128               |
| nelem                         | number of buffers + number of buffer partitions                       | 32,896            |
| hctl->ffactor                 | #define DEF_FFACTOR 1                                                 | 1                 |
| Number of hash table buckets  | nbuckets = next_pow2_int( (nelem - 1) / hctl->ffactor + 1 );          | 65,536 ($2^{16}$) |
| Hash segment size             | #define DEF_SEGSIZE 256                                               | 256               |
| Number of hash segments       | number of buckets $\div$ hash segment size $= 2^{16} \div 2^{8}$      | 256               |
| freeList                      | #define NUM_FREELISTS 32                                              | 32                |
| Hash element pool size        | nelem                                                                 | 32,896            |
| Buffer descriptor array size  | number of buffers                                                     | 32,768            |
| Buffer pool array size        | number of buffers                                                     | 32,768            |

next_pow2_int( (nelem - 1) / hctl->ffactor + 1 );
= next_pow2_int( ( (32896 - 1) / 1 ) + 1 )
= next_pow2_int( ( (32895) / 1 ) + 1 )
= next_pow2_int( ( 32895 ) + 1 )
= next_pow2_int( 32896 ) // 2^15 = 32,768, so the next power of two is 2^16
= 65,536
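
The derivation above can be checked with a few lines of C. my_log2() and next_pow2_int() are re-implemented here only to keep the example self-contained; they mirror the dynahash.c versions quoted earlier.

#include <stdio.h>

static int my_log2(long num)          /* ceil(log2(num)), as in dynahash.c */
{
 int  i = 0;
 long limit = 1;

 while (limit < num)
 {
  limit <<= 1;
  i++;
 }
 return i;
}

static int next_pow2_int(long num)    { return 1 << my_log2(num); }

int main(void)
{
 long nbuffers = (256L * 1024 * 1024) / (8 * 1024);        /* shared_buffers 256MB / 8KB */
 long nelem    = nbuffers + 128;                           /* + NUM_BUFFER_PARTITIONS    */
 int  nbuckets = next_pow2_int((nelem - 1) / 1 + 1);       /* DEF_FFACTOR = 1            */
 int  nsegs    = next_pow2_int((nbuckets - 1) / 256 + 1);  /* DEF_SEGSIZE = 256          */

 printf("buffers=%ld nelem=%ld nbuckets=%d nsegs=%d\n",
        nbuffers, nelem, nbuckets, nsegs);                 /* 32768 32896 65536 256 */
 return 0;
}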

Processing order

  1. Build a BufferTag
    1. Compute the hashvalue from the BufferTag
    2. Compute the buffer partition number from the hashvalue
  2. Acquire the LW lock on that buffer partition in SHARE mode
  3. Compute the hash table bucket number from the hashvalue (hash chain search)
    1. Convert the bucket number into a hash segment number and an index within the segment
    2. Follow the hash chain until the element for Tag_B is found
  4. The hash element for Tag_B holds the "ID of Tag_B", so set a PIN on buffer descriptor array['ID of Tag_B']
    1. At this point refcount + 1 and usagecount + 1
  5. Release the LW lock on that buffer partition
  6. Read the contents of buffer pool['ID of Tag_B'], the slot corresponding to that buffer descriptor
  7. Release the PIN and decrement refcount by 1 (a code sketch of these steps follows below)
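
A conceptual rendering of the seven steps in code, loosely modeled on BufferAlloc() and PinBuffer() in bufmgr.c. It shows only the cache-hit path; locking details, error handling, and the miss path (victim selection) are omitted, so read it as annotated pseudocode rather than real source.

/* Conceptual sketch of a shared-buffer hit -- not a compilable excerpt of bufmgr.c */
static Block
read_shared_block_sketch(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum)
{
 BufferTag   tag;
 uint32      hash;
 LWLock     *partitionLock;
 int         buf_id;
 BufferDesc *buf;
 Block       page = NULL;

 INIT_BUFFERTAG(tag, rnode, forkNum, blockNum);           /* 1.  build the BufferTag        */
 hash = BufTableHashCode(&tag);                           /* 1a. hashvalue                  */
 partitionLock = BufMappingPartitionLock(hash);           /* 1b. partition -> LWLock        */

 LWLockAcquire(partitionLock, LW_SHARED);                 /* 2.  lock the partition, SHARE  */
 buf_id = BufTableLookup(&tag, hash);                     /* 3.  walk the hash chain        */
 if (buf_id >= 0)
 {
  buf = GetBufferDescriptor(buf_id);                      /* 4.  descriptor for buf_id      */
  PinBuffer(buf, NULL);                                   /* 4a. refcount+1, usagecount+1   */
  LWLockRelease(partitionLock);                           /* 5.  release the partition lock */

  page = BufferGetBlock(BufferDescriptorGetBuffer(buf));  /* 6.  block in the buffer pool   */

  UnpinBuffer(buf, true);                                 /* 7.  drop the PIN, refcount-1   */
 }
 else
  LWLockRelease(partitionLock);                           /* miss: real code picks a victim buffer */

 return page;
}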
The calc_bucket function
// src/backend/utils/hash/dynahash.c
/* Convert a hash value to a bucket number */
static inline uint32
calc_bucket(HASHHDR *hctl, uint32 hash_val)
{
 uint32  bucket;

 bucket = hash_val & hctl->high_mask;
 if (bucket > hctl->max_bucket)
  bucket = bucket & hctl->low_mask;

 return bucket;
}
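
With the 256MB example (nbuckets = 65,536), init_htab() sets max_bucket = low_mask = 65,535 and high_mask = 131,071, so calc_bucket() effectively computes hash mod 65,536; hash_search_with_hash_value() then splits the bucket number into a segment number and an index within that segment. A self-contained sketch using a made-up hash value:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
 /* values produced by init_htab() for the 256MB example (nbuckets = 65,536) */
 uint32_t max_bucket = 65535;          /* nbuckets - 1        */
 uint32_t low_mask   = 65535;
 uint32_t high_mask  = 131071;         /* (nbuckets << 1) - 1 */
 uint32_t ssize      = 256;            /* DEF_SEGSIZE         */
 uint32_t sshift     = 8;              /* log2(DEF_SEGSIZE)   */

 uint32_t hash_val = 0xC0FFEE12;       /* made-up hash value  */

 /* calc_bucket() */
 uint32_t bucket = hash_val & high_mask;
 if (bucket > max_bucket)
  bucket = bucket & low_mask;

 /* hash_search_with_hash_value() then locates the chain */
 uint32_t segment_num = bucket >> sshift;        /* which hash segment      */
 uint32_t segment_ndx = bucket & (ssize - 1);    /* slot within the segment */

 printf("bucket=%u segment=%u index=%u\n",
        (unsigned) bucket, (unsigned) segment_num, (unsigned) segment_ndx);
 /* prints: bucket=60946 segment=238 index=18 */
 return 0;
}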

Miscellaneous

References