Bolt源码解析（四）：事务与数据一致性

1、创建事务

事务分为读写事务和只读事务：读写事务之间不能并发（同一时刻只能有一个读写事务），只读事务可以并发。

// Begin starts a new transaction. When writable is true the transaction is
// created by beginRWTx; otherwise a read-only transaction is created by
// beginTx.
func (db *DB) Begin(writable bool) (*Tx, error) {
	if !writable {
		// Read-only transaction.
		return db.beginTx()
	}
	// Read-write transaction.
	return db.beginRWTx()
}
 // beginRWTx creates the read-write transaction. It returns
// ErrDatabaseReadOnly when db.readOnly is set and ErrDatabaseNotOpen when the
// database is not open. On the success path db.rwlock is still held when this
// function returns; it is only unlocked here on the not-open failure path.
func (db *DB) beginRWTx() (*Tx, error) {
	// A database opened with Options.ReadOnly cannot host a writer.
	if db.readOnly {
		return nil, ErrDatabaseReadOnly
	}

	// Take the writer lock.
	db.rwlock.Lock()

	// Hold the meta lock for the remainder of the call while the transaction
	// is being set up.
	db.metalock.Lock()
	defer db.metalock.Unlock()

	// Give the writer lock back and bail out if the database is not open.
	if !db.opened {
		db.rwlock.Unlock()
		return nil, ErrDatabaseNotOpen
	}

	// Build the transaction and register it as the database's writer.
	rw := &Tx{writable: true}
	rw.init(db)
	db.rwtx = rw

	// Find the smallest txid among the transactions tracked in db.txs.
	var oldest txid = 0xFFFFFFFFFFFFFFFF
	for _, rtx := range db.txs {
		if id := rtx.meta.txid; id < oldest {
			oldest = id
		}
	}

	// Pending pages belonging to transactions older than that one can be
	// moved to the free list.
	if oldest > 0 {
		db.freelist.release(oldest - 1)
	}

	return rw, nil
}

// init binds the transaction to db and snapshots the state it will work
// against: a private copy of the database's current meta and a root bucket
// whose header is copied from that meta. Writable transactions additionally
// get a page map and a txid one higher than the copied one.
func (tx *Tx) init(db *DB) {
	tx.db = db
	tx.pages = nil

	// Take a private copy of the meta page, since the writer can change it.
	tx.meta = &meta{}
	db.meta().copy(tx.meta)

	// Create the root bucket and give it its own copy of the root header.
	tx.root = newBucket(tx)
	rootHeader := tx.meta.root
	tx.root.bucket = &rootHeader

	// Read-only transactions are done at this point.
	if !tx.writable {
		return
	}

	// Writable transactions track their pages in a map and get the next txid.
	tx.pages = make(map[pgid]*page)
	tx.meta.txid++
}

创建事务就是把db中的元数据复制一份到tx中。这些元数据在写的过程中会变化，为了保证之前的数据一致性不被破坏，这里需要拷贝一份新数据；事务提交之后，再用tx中的数据把db中的数据覆盖一遍。

2、事务提交

// Commit persists the changes of a writable transaction and closes it.
//
// It returns ErrTxClosed when tx.db is nil and ErrTxNotWritable when the
// transaction is not writable. The steps, in order, are: rebalance the root
// bucket, spill it, move the old freelist page to pending and write a new
// freelist page, grow the file if the page high-water mark increased, write
// the dirty pages, write the meta page, close the transaction and finally run
// the registered commit handlers. Any failing step rolls the transaction back
// and returns that step's error.
func (tx *Tx) Commit() error {
    _assert(!tx.managed, "managed tx commit not allowed")
    if tx.db == nil {
        return ErrTxClosed
    } else if !tx.writable {
        return ErrTxNotWritable
    }
var startTime = time.Now()
    // Per the article: deletions can leave nodes that no longer satisfy the
    // B+tree constraints; rebalance deals with such nodes (e.g. by merging).

tx.root.rebalance()
if tx.stats.Rebalance > 0 {
        tx.stats.RebalanceTime += time.Since(startTime)
    }

    // spill data onto dirty pages.
startTime = time.Now()
    // Per the article: inserts can make nodes too large; spill splits them so
    // that they satisfy the B+tree rules again.
    if err := tx.root.spill(); err != nil {
        tx.rollback()
        return err
    }
    tx.stats.SpillTime += time.Since(startTime)

    // Update the meta with the root bucket's (possibly new) root page id.
    tx.meta.root.root = tx.root.root
    // Remember the current value of tx.meta.pgid so growth can be detected below.
    opgid := tx.meta.pgid

    // Free the page that currently holds the freelist. free only records it
    // under this txid's pending list; it is not reusable yet.
tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist))
    // Allocate enough pages to hold the serialized freelist.
    p, err := tx.allocate((tx.db.freelist.size() / tx.db.pageSize) + 1)
    if err != nil {
        tx.rollback()
        return err
}
    // Serialize the freelist into the newly allocated page(s).
    if err := tx.db.freelist.write(p); err != nil {
        tx.rollback()
        return err
}
    // Point the meta at the new freelist page.
    tx.meta.freelist = p.id

    // If tx.meta.pgid increased during this commit, grow the database file.
    if tx.meta.pgid > opgid {
        if err := tx.db.grow(int(tx.meta.pgid+1) * tx.db.pageSize); err != nil {
            tx.rollback()
            return err
        }
    }

    // Write dirty pages to disk.
startTime = time.Now()
    // Flush the dirty pages recorded in tx.pages to the database file.
    if err := tx.write(); err != nil {
        tx.rollback()
        return err
    }

    // Strict-mode consistency check: the body is omitted in this excerpt.
    if tx.db.StrictMode {
        // (check elided by the article)
    }

    // Write the meta page to disk; it carries the root page id set above.
    if err := tx.writeMeta(); err != nil {
        tx.rollback()
        return err
    }
    tx.stats.WriteTime += time.Since(startTime)

    // Finalize the transaction.
    tx.close()

    // Execute commit handlers now that the locks have been removed.
    for _, fn := range tx.commitHandlers {
        fn()
    }

    return nil
}

2.1节点拆分

spill负责把过大的节点拆分并写入新的page。拆分会使节点数变多，为了符合b+tree的规则，父节点也可能需要随之拆分，因此按照从leaf向root节点的顺序依次拆分。

// spill writes this bucket and all of its child buckets out, children first.
//
// For every child in b.buckets it produces the byte value representing the
// child (either the inline serialization or a copy of the child's bucket
// header after the child itself has been spilled) and stores that value under
// the child's name in this bucket. Afterwards the bucket's own root node is
// spilled and b.root is updated to the resulting root page id. It returns the
// first error reported by a nested spill.
func (b *Bucket) spill() error {
    // Spill all child buckets first.
    for name, child := range b.buckets { // visit every child bucket

        var value []byte
        if child.inlineable() { // inlineable child: free() it and use write()'s bytes as its value
            child.free()
            value = child.write()
        } else {
            // Spill the child bucket itself first.
            if err := child.spill(); err != nil {
                return err
            }
            // Spilling updates the child's root, so capture the child's
            // current bucket header as the value to store in this bucket.
            value = make([]byte, unsafe.Sizeof(bucket{}))
            var bucket = (*bucket)(unsafe.Pointer(&value[0]))
            *bucket = *child.bucket
        }

        // Skip writing the bucket if there are no materialized nodes.
        if child.rootNode == nil {
            continue
        }

        // Locate the child's entry in this bucket; it must exist under the
        // exact name and carry the bucket leaf flag.
        var c = b.Cursor()
        k, _, flags := c.seek([]byte(name))
        if !bytes.Equal([]byte(name), k) {
            panic(fmt.Sprintf("misplaced bucket header: %x -> %x", []byte(name), k))
        }
        if flags&bucketLeafFlag == 0 {
            panic(fmt.Sprintf("unexpected bucket header flag: %x", flags))
        }
        // Overwrite the entry with the freshly produced value.
        c.node().put([]byte(name), []byte(name), value, 0, bucketLeafFlag)
    }

    // Ignore if there's not a materialized root node.
    if b.rootNode == nil {
        return nil
    }

    // Spill (split and write) the node tree rooted at this bucket's root node.
    if err := b.rootNode.spill(); err != nil {
        return err
}
    // The root may have changed during the spill; re-resolve it.
    b.rootNode = b.rootNode.root()

    // Update the root node for this bucket.
    if b.rootNode.pgid >= b.tx.meta.pgid {
        panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", b.rootNode.pgid, b.tx.meta.pgid))
}
    // Keep the bucket's root page id in sync with the new root node.
    b.root = b.rootNode.pgid

    return nil
}

具体的spill执行还是在node中：

// spill writes this node to newly allocated pages, splitting it first when
// necessary. Children are spilled before the node itself; a node that is
// already marked as spilled is skipped. It returns the first error reported
// by a child spill or by page allocation.
func (n *node) spill() error {
    var tx = n.bucket.tx
    if n.spilled {
        return nil
    }

    // Spill the children first (bottom-up), recursing into spill.
    // NOTE(review): the index loop re-reads len(n.children) on every
    // iteration, presumably because a child's split can append siblings to
    // this list (splitTwo appends to n.parent.children) — confirm.
    sort.Sort(n.children)
    for i := 0; i < len(n.children); i++ {
        if err := n.children[i].spill(); err != nil {
            return err
        }
    }
    n.children = nil

    // Split this node into a group of nodes based on the page size.
    var nodes = n.split(tx.db.pageSize)
    for _, node := range nodes { // handle every node produced by the split
        // Add node's page to the freelist if it's not new.
        if node.pgid > 0 { // the node already has a page: free it
            tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
            node.pgid = 0
        }

        // Allocate new page(s) sized for this node.
        p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
        if err != nil {
            return err
        }

        // Write the node.
        if p.id >= tx.meta.pgid {
            panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
        }
        node.pgid = p.id // remember which page the node now lives on
        node.write(p) // serialize the node into the page
        node.spilled = true // mark the node as handled

        // Update the parent so it references the node's new page.
        if node.parent != nil {
            var key = node.key
            if key == nil {
                key = node.inodes[0].key
            }
            // Put a key -> pgid entry for this node into its parent, replacing
            // the entry under the old key with the node's current first key.
            node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
            node.key = node.inodes[0].key
            _assert(len(node.key) > 0, "spill: zero-length node key")
        }

        // Update the statistics.
        tx.stats.Spill++
    }

    // If the parent has no page yet (pgid == 0), e.g. a new root created by
    // the split, spill it as well.
    if n.parent != nil && n.parent.pgid == 0 {
        n.children = nil
        return n.parent.spill()
    }

    return nil
}

继续向下看具体的node节点是如何拆分的：

// splitTwo divides n into two nodes when a split is warranted. It returns
// (n, nil) when n has at most 2*minKeysPerPage inodes or sizeLessThan(pageSize)
// reports true. Otherwise n keeps the inodes before the split index, a new
// sibling with the same parent receives the rest, and (n, sibling) is returned.
func (n *node) splitTwo(pageSize int) (*node, *node) {
	// Too few keys to split.
	if len(n.inodes) <= minKeysPerPage*2 {
		return n, nil
	}
	// Small enough already.
	if n.sizeLessThan(pageSize) {
		return n, nil
	}

	// Clamp the bucket's fill percent into [minFillPercent, maxFillPercent]
	// and turn it into a byte threshold.
	fill := n.bucket.FillPercent
	switch {
	case fill < minFillPercent:
		fill = minFillPercent
	case fill > maxFillPercent:
		fill = maxFillPercent
	}
	threshold := int(float64(pageSize) * fill)

	// Ask the node where to cut for that threshold.
	at, _ := n.splitIndex(threshold)

	// A node that gets split needs a parent to hold both halves; create one
	// if n does not have one yet.
	if n.parent == nil {
		n.parent = &node{bucket: n.bucket, children: []*node{n}}
	}

	// Create the sibling and register it with the parent.
	sibling := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
	n.parent.children = append(n.parent.children, sibling)

	// Hand the tail of the inodes to the sibling and keep the head.
	sibling.inodes, n.inodes = n.inodes[at:], n.inodes[:at]

	// Update the statistics.
	n.bucket.tx.stats.Split++
	return n, sibling
}

总结下spill，一个原则就是自下而上，层层拆分：Bucket拆分完后，再更新bucket所在的node。其实这是两棵树，一棵bucket树，一棵node树。看下图即可明白：

2.2 freelist 管理

free：给定 txid，释放 page 及其 overflow 的所有页，将这些页的 pgid 追加到 f.pending[txid] 数组中，并记录到 cache map 中。

// page returns a reference to the page with the given id inside the mmap
// (db.data). The byte offset of the page is id multiplied by the page size.
func (db *DB) page(id pgid) *page {
	return (*page)(unsafe.Pointer(&db.data[id*pgid(db.pageSize)]))
}
// free records page p and all of its overflow pages as freed by transaction
// txid: the ids p.id through p.id+overflow are appended to f.pending[txid]
// and flagged in f.cache. It panics when p.id is 0 or 1, or when one of the
// ids is already flagged in f.cache.
func (f *freelist) free(txid txid, p *page) {
	if p.id <= 1 {
		panic(fmt.Sprintf("cannot free page 0 or 1: %d", p.id))
	}

	// The page spans ids p.id..last inclusive.
	last := p.id + pgid(p.overflow)
	pending := f.pending[txid]
	for cur := p.id; cur <= last; cur++ {
		// An id that is already in the cache has been freed before.
		if f.cache[cur] {
			panic(fmt.Sprintf("page %d already freed", cur))
		}

		// Flag the id and queue it under this transaction.
		f.cache[cur] = true
		pending = append(pending, cur)
	}
	f.pending[txid] = pending
}

release：指定最大 txid，释放全部无效事务引用的页。遍历 pending map，把事务 id 小于等于给定 txid 的所有待释放页取出并从 pending 中删除，排序后再与 f.ids 归并。

// release moves the pending pages of every transaction whose id is less than
// or equal to txid into the available list: the collected ids are sorted and
// merged into f.ids, and the transactions' pending entries are deleted.
func (f *freelist) release(txid txid) {
	freed := make(pgids, 0)
	for owner, pageIDs := range f.pending {
		if owner > txid {
			continue
		}
		// The ids are not removed from the cache since the pages are still
		// free; they only stop being pending.
		freed = append(freed, pageIDs...)
		delete(f.pending, owner)
	}
	sort.Sort(freed)
	f.ids = pgids(f.ids).merge(freed)
}

从 freelist 分配页：分配时返回 n 个连续可用页的首个 id。这么做是有好处的，对这片磁盘的写都是顺序写，减少随机写。但是这里有个问题：如果一直找不到足够长的连续空闲页，就会大量地从物理空间申请新页，会有碎片化问题。

扫描二维码关注公众号，回复： 11121832 查看本文章

// allocate scans f.ids for a run of n consecutive page ids. When one is found
// the run is removed from f.ids, its ids are deleted from f.cache and the
// first id of the run is returned. It returns 0 when f.ids is empty or holds
// no such run, and panics when it encounters an id of 0 or 1 while scanning.
func (f *freelist) allocate(n int) pgid {
	if len(f.ids) == 0 {
		return 0
	}

	var runStart, prev pgid
	for idx, cur := range f.ids {
		if cur <= 1 {
			panic(fmt.Sprintf("invalid page allocation: %d", cur))
		}

		// The first id, or any id that does not directly follow the previous
		// one, starts a new candidate run.
		if prev == 0 || cur-prev != 1 {
			runStart = cur
		}
		prev = cur

		// Keep scanning until the current run is exactly n ids long.
		if (cur-runStart)+1 != pgid(n) {
			continue
		}

		// Remove the run, which occupies indexes idx-n+1..idx, from f.ids.
		if idx+1 == n {
			// The run is the head of the list: just drop the prefix.
			f.ids = f.ids[idx+1:]
		} else {
			// Shift the tail down over the run and shrink the slice.
			copy(f.ids[idx-n+1:], f.ids[idx+1:])
			f.ids = f.ids[:len(f.ids)-n]
		}

		// Remove from the free cache.
		for off := pgid(0); off < pgid(n); off++ {
			delete(f.cache, runStart+off)
		}

		return runStart
	}
	return 0
}

2.3 脏page刷盘

func (tx *Tx) write() error {
    // Sort pages by id.
    pages := make(pages, 0, len(tx.pages))
    for _, p := range tx.pages {
        pages = append(pages, p)
    }
    // Clear out page cache early.
    tx.pages = make(map[pgid]*page)
    sort.Sort(pages)

    // 遍历所有的脏page
    for _, p := range pages {
        size := (int(p.overflow) + 1) * tx.db.pageSize
        offset := int64(p.id) * int64(tx.db.pageSize)//写入offset
        ptr := (*[maxAllocSize]byte)(unsafe.Pointer(p))
        for {
                       //将p写入文件偏移offset
            sz := size
            if sz > maxAllocSize-1 {
                sz = maxAllocSize - 1
            }
            buf := ptr[:sz]
            if _, err := tx.db.ops.writeAt(buf, offset); err != nil {
                return err
            }
            tx.stats.Write++
            size -= sz
            if size == 0 {
                break
            }
            offset += int64(sz)
            ptr = (*[maxAllocSize]byte)(unsafe.Pointer(&ptr[sz]))
        }
    }

    // Ignore file sync if flag is set on DB.
    if !tx.db.NoSync || IgnoreNoSync {
        if err := fdatasync(tx.db); err != nil {
            return err
        }
    }

    // 将释放的page归还到内存池中
    for _, p := range pages {
        if int(p.overflow) != 0 {
            continue
        }

        buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:tx.db.pageSize]        for i := range buf {
            buf[i] = 0
        }
        tx.db.pagePool.Put(buf)
    }

    return nil
}

2.4 meta刷盘

// writeMeta serializes the transaction's meta into a temporary one-page
// buffer, writes that buffer to the file at the offset derived from the
// page's id, and syncs the file with fdatasync unless tx.db.NoSync is set and
// IgnoreNoSync is false. The write counter is incremented on success.
func (tx *Tx) writeMeta() error {
	db := tx.db

	// Temporary buffer holding exactly one page for the meta.
	scratch := make([]byte, db.pageSize)
	metaPage := db.pageInBuffer(scratch, 0)
	tx.meta.write(metaPage)

	// The page id is read after meta.write has filled in the page.
	offset := int64(metaPage.id) * int64(db.pageSize)
	if _, err := db.ops.writeAt(scratch, offset); err != nil {
		return err
	}

	// Sync unless syncing has been disabled on the DB.
	if needSync := !db.NoSync || IgnoreNoSync; needSync {
		if err := fdatasync(db); err != nil {
			return err
		}
	}

	// Update statistics.
	tx.stats.Write++

	return nil
}

Meta写入文件比较简单，不再详细介绍。

Rebalance 没有详细介绍，理解了spill流程Rebalance应该很容易理解。

3、回滚

回滚操作做两件事：1：把本事务加入pending、cache中的page再删除掉；2：把本事务从ids中取走的空闲page再加回去。数据本身不需要回滚，因为有cow可以保证数据一致性。

// rollback abandons the transaction. It does nothing when tx.db is nil. For a
// writable transaction it first undoes the frees recorded under this
// transaction's txid and then reloads the freelist from the page referenced
// by the database's current meta. The transaction is closed in every case.
func (tx *Tx) rollback() {
	db := tx.db
	if db == nil {
		return
	}

	if tx.writable {
		// Drop the pages this transaction added to pending/cache.
		db.freelist.rollback(tx.meta.txid)

		// Restore the list of free pages from the current meta's freelist page.
		persisted := db.page(db.meta().freelist)
		db.freelist.reload(persisted)
	}

	tx.close()
}

// rollback undoes the frees recorded for transaction txid: every page id in
// f.pending[txid] is removed from f.cache, then the pending entry itself is
// deleted.
func (f *freelist) rollback(txid txid) {
	// Un-flag the pages this transaction had marked as freed.
	pending := f.pending[txid]
	for i := range pending {
		delete(f.cache, pending[i])
	}

	// Remove pages from pending list.
	delete(f.pending, txid)
}

// reload re-reads the freelist from page p, then drops from f.ids every id
// that still appears in some transaction's pending list, and finally calls
// reindex.
func (f *freelist) reload(p *page) {
	f.read(p)

	// Build a lookup of every page id that is pending for any transaction.
	stillPending := make(map[pgid]bool)
	for _, ids := range f.pending {
		for _, id := range ids {
			stillPending[id] = true
		}
	}

	// Rebuild f.ids from the ids that are not pending.
	var available []pgid
	for _, id := range f.ids {
		if stillPending[id] {
			continue
		}
		available = append(available, id)
	}
	f.ids = available
	f.reindex()
}

游侠souy

发布了48 篇原创文章 · 获赞 9 · 访问量 1万+

私信关注