// pkg/xfs/xfs.go (github.com/vorteil/vorteil)

package xfs

import (
    "bytes"
    "context"
    "encoding/binary"
    "errors"
    "fmt"
    "io"
    "io/ioutil"
    "strings"

    "github.com/davidminor/uint128"
    "github.com/vorteil/vorteil/pkg/elog"
    "github.com/vorteil/vorteil/pkg/vcfg"
    "github.com/vorteil/vorteil/pkg/vio"
)

type precompilerArgs struct {
    Logger   elog.Logger
    Contents vio.FileTree
    Options  struct {
        MinimumFreeInodes int64
        MinimumFreeSpace  int64
    }
}

type constants struct {
    exponents struct {
        sectorSize          uint8
        blockSize           uint8
        blocksPerAllocGroup uint8
        inodeSize           uint8
        directoryBlockSize  uint8
    }
    targetAllocGroupCount int64
    pageSize              int64
    inodeDataCapacity     int64
    allocGroups           int64
    journalBlocks         int64
    treeBlocks            int64

    usedInodes  int64
    freeInodes  int64
    totalInodes int64

    usedBlocks  int64
    freeBlocks  int64
    totalBlocks int64
}

// NOTE: these translation functions are a post-hoc hack to fix a bug I caused by not understanding XFS.
func (c *constants) translateRelativeInodeNumber(x int64) uint64 {
    ag := x / c.inodesPerAllocGroup()
    rel := x % c.inodesPerAllocGroup()
    rel += c.inodesPerBlock() * (c.superMetaBlocks() + 7)
    if ag == 0 {
        rel += c.journalBlocks * c.inodesPerBlock()
    }
    return uint64(rel)
}

var overrideInodeTranslator func(x int64) uint64

func (c *constants) translateAbsoluteInodeNumber(x int64) uint64 {

    if overrideInodeTranslator != nil {
        return overrideInodeTranslator(x)
    }

    ag := x / c.inodesPerAllocGroup()
    rel := x % c.inodesPerAllocGroup()
    rel += c.inodesPerBlock() * (c.superMetaBlocks() + 7)
    if ag == 0 {
        rel += c.journalBlocks * c.inodesPerBlock()
    }
    return c.inodeNumber(ag, rel)
}

func (c *constants) inodeNumber(ag int64, rel int64) uint64 {
    bits := c.exponents.blocksPerAllocGroup + (c.exponents.blockSize - c.exponents.inodeSize)
    return (uint64(ag) << bits) | uint64(rel)
}
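
// As an illustration of the packing above: an inode number is the alloc group
// index shifted past the per-group inode bits, OR'd with the group-relative
// inode index. This inverse is a sketch added for clarity only; it is not
// called by the build path.
func (c *constants) splitInodeNumber(ino uint64) (ag int64, rel int64) {
    bits := c.exponents.blocksPerAllocGroup + (c.exponents.blockSize - c.exponents.inodeSize)
    ag = int64(ino >> bits)
    rel = int64(ino & ((1 << bits) - 1))
    return ag, rel
}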

func (c *constants) translateRelativeBlockNumber(x int64) uint64 {
    return uint64(x % c.blocksPerAllocGroup())
}

func (c *constants) translateAbsoluteBlockNumber(x int64) uint64 {
    ag := x / c.blocksPerAllocGroup()
    rel := x % c.blocksPerAllocGroup()
    return c.blockNumber(ag, rel)
}

func (c *constants) blockNumber(ag int64, rel int64) uint64 {
    bits := c.exponents.blocksPerAllocGroup
    return (uint64(ag) << bits) | uint64(rel)
}
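
// For example, with the default 2^12 blocks per alloc group:
// blockNumber(2, 5) = (2 << 12) | 5 = 0x2005.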

func (c *constants) sectorSize() int64 {
    return 1 << c.exponents.sectorSize
}

func (c *constants) blockSize() int64 {
    return 1 << c.exponents.blockSize
}

func (c *constants) inodeSize() int64 {
    return 1 << c.exponents.inodeSize
}

func (c *constants) inodesPerBlock() int64 {
    return 1 << (c.exponents.blockSize - c.exponents.inodeSize)
}

func (c *constants) blocksPerAllocGroup() int64 {
    return 1 << c.exponents.blocksPerAllocGroup
}

func (c *constants) inodeBlocksPerAllocGroup() int64 { // TODO: unit test if it's possible to get an odd number out of this (I'm suspicious...)
    inodesPerAllocGroup := divide(c.totalInodes, c.allocGroups)
    inodesPerAllocGroup = align(inodesPerAllocGroup, 64) // round up to nearest 64
    return divide(inodesPerAllocGroup, c.inodesPerBlock())
}
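
// A sketch of the unit test the TODO above asks for (it would normally live in
// a _test.go file): with inodes aligned to 64 per chunk and inodesPerBlock a
// power of two no larger than 64, the result should always come out even. The
// constants used here mirror the defaults set in newBuild.
func checkInodeBlocksPerAllocGroupParity(totals []int64) error {
    c := &constants{allocGroups: 3}
    c.exponents.blockSize = 12 // 4 KiB blocks
    c.exponents.inodeSize = 9  // 512-byte inodes -> 8 inodes per block
    for _, total := range totals {
        c.totalInodes = total
        if c.inodeBlocksPerAllocGroup()%2 != 0 {
            return fmt.Errorf("odd inode block count for %d total inodes", total)
        }
    }
    return nil
}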

func (c *constants) inodesPerAllocGroup() int64 {
    return c.inodeBlocksPerAllocGroup() * c.inodesPerBlock()
}

func (c *constants) directoryBlockSize() int64 {
    return c.blockSize() * (1 << c.exponents.directoryBlockSize)
}

func (c *constants) calculateInodes(used, minimumFree int64) {
    c.usedInodes = used
    c.freeInodes = minimumFree
    c.totalInodes = c.usedInodes + c.freeInodes             // preliminary total, read by inodesPerAllocGroup below
    c.totalInodes = c.inodesPerAllocGroup() * c.allocGroups // then rounded up so every alloc group holds whole 64-inode chunks
    c.freeInodes = c.totalInodes - c.usedInodes
}

func (c *constants) superMetaBlocks() int64 {
    return divide(4*c.sectorSize(), c.blockSize()) // 4 sectors
}
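
// For example, with the defaults set in newBuild (512-byte sectors, 4 KiB
// blocks): divide(4*512, 4096) = 1, so the superblock, AGF, AGI, and AGFL
// sectors share a single block.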

func (c *constants) metadataBlocksPerAllocGroup() int64 {
    return c.superMetaBlocks() + 7 + c.inodeBlocksPerAllocGroup() // + 7 blocks for b+ trees and the free list
}

func (c *constants) partialCalculateSpace() {
    c.usedBlocks = c.treeBlocks + c.journalBlocks
    metadataBlocks := c.metadataBlocksPerAllocGroup() * c.allocGroups
    c.usedBlocks += metadataBlocks
}
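
// In other words, usedBlocks covers file and directory data (treeBlocks), the
// journal, and the per-group metadata; the free-space allowance is layered on
// top of this by the callers.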

type precompiler struct {
    log elog.Logger
    constants
    args       precompilerArgs
    tree       vio.FileTree
    nodeBlocks []uint32 // uint32 gives us a file-size limit of 16 TiB with 4 KiB blocks
}

func (p *precompiler) setBlockSize(size int64) error {

    if size == 0 {
        p.exponents.blockSize = 12 // 4 KiB
        return nil
    }

    if size < p.sectorSize() {
        return fmt.Errorf("minimum block size is %d", p.sectorSize())
    }

    if size > p.pageSize {
        return fmt.Errorf("maximum block size is %d", p.pageSize)
    }

    for x, y := p.exponents.sectorSize, p.sectorSize(); y <= p.pageSize; x, y = x+1, 2*y {
        if y == size {
            p.exponents.blockSize = x
            break
        }
    }

    if p.exponents.blockSize == 0 {
        return errors.New("block size must be a power of two")
    }

    return nil
}
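
// A quick sketch of the validation behaviour, assuming the defaults set in
// newBuild (512-byte sectors, 64 KiB pages):
//
//     p.setBlockSize(0)       // nil; defaults to 4 KiB
//     p.setBlockSize(256)     // error: minimum block size is 512
//     p.setBlockSize(0x20000) // error: maximum block size is 65536
//     p.setBlockSize(3000)    // error: block size must be a power of two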

func (p *precompiler) calculateMinimumSize(ctx context.Context) error {

    var err error

    p.allocGroups = 1
    p.exponents.blocksPerAllocGroup = 12 // This makes for 16 MiB groups when using 4 KiB blocks. Big enough for a small journal log.

    // scan file tree to calculate space & inode requirements
    var blocks int64
    var k int
    treeNodes := p.tree.NodeCount()

    // reserve two extra nodes for the realtime bitmap and realtime summary inodes
    var root *vio.TreeNode
    treeNodes += 2
    p.nodeBlocks = make([]uint32, treeNodes)

    err = p.tree.WalkNode(func(path string, node *vio.TreeNode) error {

        if err := ctx.Err(); err != nil {
            return err
        }

        if root == nil {
            root = node
            root.Parent = root
        } else {
            node.NodeSequenceNumber += 2 // offset for realtime devices
            k = int(node.NodeSequenceNumber)
        }

        var x int64
        f := node.File

        if f.IsDir() {

            var ls int64 // length (short form)
            var lb int64 // length (block form)
            var ll int64 // length (leaf form)
            // TODO: var ln int64 // length (node form)

            ls = 6 - 17 // 6-byte header, minus 17 to cancel out the "." & ".." entries added via grow below. NOTE: the 6 could be 10 if we ever expect to have a huge number of inodes
            lb = 24     // Magic number, best free array, and block tail.
            ll = 16     // Magic number and best free array.

            grow := func(child string) {
                l := int64(len(child))
                ls += l + 7 // NOTE: this could be +11 if we ever expect to have a huge number of inodes
                l = 11 + l
                l = align(l, 8) // round up to nearest 8
                lb += l + 8     // +8: leaf entry

                // TODO: shuffle entries to optimize used space

                delta := align(ll, p.blockSize()) - ll
                if delta < l || delta-l < 16 {
                    ll += delta + 16
                }

                ll += l
            }

            grow(".")
            grow("..")

            for _, child := range node.Children {
                grow(child.File.Name())
            }

            entries := int64(2 + len(node.Children))

            if ls < p.inodeDataCapacity {
                // directory can be stored in short form
                x = 0
            } else if lb < p.directoryBlockSize() {
                // directory can be stored in block form
                x = 1
            } else {
                // determine how many extents are necessary to store the data
                next := divide(ll, p.blockSize()) // number of extents for the directory data
                for {
                    lfll := 16 + 4 + 2*next + (8 * entries) // leaf-form leaf length
                    ddl := ll                               // directory data length
                    ddb := divide(ddl, p.directoryBlockSize())
                    // if ddb > 1 && next == 1 {
                    //     next = 2
                    //     continue
                    //     // NOTE: This is a hack for leaf-form directories. A leaf form directory
                    //     // cannot conceivably be part of more than two allocation groups in this
                    //     // implementation, but calculating how many extents are genuinely needed
                    //     // is challenging. The solution here is to assume it's spread over the
                    //     // maximum number of alloc groups.
                    // }
                    if lfll > p.directoryBlockSize() {
                        // directory must be stored in node form
                        x = ddb // data blocks
                        x += 1  // node block

                        leafBlocksBytes := 16 + (8 * entries)
                        headerSize := int64(16)
                        leafBlocks := divide(leafBlocksBytes, p.directoryBlockSize()-headerSize)
                        // leafBlocksBytes += leafBlocks * headerSize
                        x += leafBlocks // leaf blocks

                        freeIndexBytes := 16 + 2*next
                        freeIndexBlocks := divide(freeIndexBytes, p.directoryBlockSize())
                        x += freeIndexBlocks // freeindex blocks
                        break
                    }
                    x = 1 + ddb
                    break
                }
            }
        } else if f.IsSymlink() {
            x = int64(f.Size())
            if f.SymlinkIsCached() && x < p.inodeDataCapacity {
                x = 0
            }
            x = divide(x, p.blockSize())
        } else {
            x = int64(f.Size())
            x = divide(x, p.blockSize())
        }

        if x >= 0x100000000 {
            return errors.New("object too large to include")
        }

        p.nodeBlocks[k] = uint32(x)
        blocks += x

        return nil
    })
    if err != nil {
        goto fail
    }

    p.treeBlocks = blocks
    p.journalBlocks = 1368 // no idea why, but this seems to be the smallest number the reference implementation uses

    k = -1

    for {
        k++
        if k > 16 {
            err = errors.New("failed to pre-calculate minimum file-system size within a reasonable number of iterations")
            goto fail
        }

        err = ctx.Err()
        if err != nil {
            goto cancel
        }

        // inodes
        p.calculateInodes(int64(treeNodes), p.args.Options.MinimumFreeInodes)

        // space
        minFreeBlocks := divide(p.args.Options.MinimumFreeSpace, p.blockSize())
        p.freeBlocks = minFreeBlocks
        p.partialCalculateSpace()
        p.totalBlocks = p.usedBlocks + p.freeBlocks

        minBlocks := (p.allocGroups-1)*p.blocksPerAllocGroup() + p.metadataBlocksPerAllocGroup()
        if p.totalBlocks < minBlocks {
            p.totalBlocks = minBlocks
            p.freeBlocks = p.totalBlocks - p.usedBlocks
        }

        //
        // reiteration checks
        //

        if p.totalBlocks > p.allocGroups*p.blocksPerAllocGroup() {
            // We need to grow the disk. We can achieve this by adding more alloc groups or increasing the size of alloc groups.
            // This logic attempts to strike a balance between the two, since larger alloc groups allow for bigger journals and
            // wastes less disk space on metadata, while more numerous alloc groups increases parallel performance.
            x := divide(p.totalBlocks, p.blocksPerAllocGroup())
            if x >= p.targetAllocGroupCount*2 {
                p.exponents.blocksPerAllocGroup++
                x = divide(p.totalBlocks, p.blocksPerAllocGroup())
            }
            p.allocGroups = x
            continue
        }

        // NOTE: If we want to ensure a good ratio between inode metadata and data blocks we should do it here.
        //        I have avoided doing so here because I think it might be counter-intuitive with Vorteil's "+X MiB" approach.

        if p.journalBlocks < p.blocksPerAllocGroup()-p.metadataBlocksPerAllocGroup() {
            // Check if we can grow the journal.
            growth := (p.blocksPerAllocGroup() - p.metadataBlocksPerAllocGroup()) - p.journalBlocks
            if x := p.freeBlocks - minFreeBlocks; x < growth {
                growth = x
            }
            if growth > 0 {
                p.journalBlocks += growth
                continue
            }
        }

        break
    }

    return nil

fail:
    if err == context.Canceled || err == context.DeadlineExceeded {
        goto cancel
    }

    return err

cancel:
    return err

}

func newBuild(ctx context.Context, args *precompilerArgs) (*precompiler, error) {

    var err error

    p := &precompiler{
        log: args.Logger,
        constants: constants{
            pageSize:              0x10000, // 64 KiB
            targetAllocGroupCount: 8,       // larger numbers might increase parallel performance, smaller numbers might increase journal size (which can also increase performance)
        },
        args: *args,
        tree: args.Contents,
    }
    p.exponents.sectorSize = 9 // 512 bytes
    p.exponents.inodeSize = 9  // 512 bytes
    p.inodeDataCapacity = p.inodeSize() - 100
    p.exponents.directoryBlockSize = 0

    err = p.setBlockSize(0x1000) // 4 KiB
    if err != nil {
        goto fail
    }

    if err = ctx.Err(); err != nil {
        goto cancel
    }

    err = p.calculateMinimumSize(ctx)
    if err != nil {
        goto fail
    }

    return p, nil

fail:
    if err == context.Canceled || err == context.DeadlineExceeded {
        goto cancel
    }

    return nil, err

cancel:
    return nil, err

}

func (p *precompiler) MinimumSize() int64 {
    return p.totalBlocks * p.blockSize()
}
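
// A minimal sketch of the intended end-to-end flow, assuming a caller inside
// this package that already holds a vio.FileTree and an io.WriteSeeker (the
// names logger, tree, and w are hypothetical):
//
//     p, err := newBuild(ctx, &precompilerArgs{Logger: logger, Contents: tree})
//     if err != nil { ... }
//     c, err := p.Precompile(ctx, p.MinimumSize()) // any multiple of the block size >= the minimum works
//     if err != nil { ... }
//     err = c.Compile(ctx, w)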

type compiler struct {
    log elog.Logger
    constants
    args                      precompilerArgs
    tree                      vio.FileTree
    data, nodes               chan *vio.TreeNode
    dataError, nodesError     error
    dataReader                io.Reader
    dataReaderBlocksRemaining int64
    nodeCounter               int

    bitmap               []uint64
    nodeBlocks           []uint32
    nodeExtents          [][]*extent
    lastCalculatedNode   int64
    allocGroupFreeBlocks []int64
    allocGroupFreeInodes []int64

    countedNodeBlocks int64
}

func (p *precompiler) Precompile(ctx context.Context, fsSize int64) (*compiler, error) {

    var err error

    c := &compiler{
        log:                p.log,
        constants:          p.constants,
        args:               p.args,
        tree:               p.tree,
        nodeBlocks:         p.nodeBlocks,
        nodeExtents:        make([][]*extent, len(p.nodeBlocks)),
        lastCalculatedNode: -1,
    }

    if fsSize%p.blockSize() != 0 {
        return nil, fmt.Errorf("file-system must be a multiple of the block size (%v)", p.blockSize())
    }

    if fsSize < p.MinimumSize() {
        return nil, fmt.Errorf("file-system needs at least %v more space", vcfg.Bytes(p.MinimumSize()-fsSize))
    }

    if err = ctx.Err(); err != nil {
        goto cancel
    }

    c.totalBlocks = fsSize / p.blockSize()
    err = c.precompile(ctx)
    if err != nil {
        goto fail
    }

    return c, nil

fail:
    if err == context.Canceled || err == context.DeadlineExceeded {
        goto cancel
    }

    return nil, err

cancel:
    return nil, err

}

func (c *compiler) precompile(ctx context.Context) error {

    var err error

    // alloc group sizes
    for {
        c.allocGroups = divide(c.totalBlocks, c.blocksPerAllocGroup())
        if c.allocGroups < 2*c.targetAllocGroupCount {
            break
        }
        c.exponents.blocksPerAllocGroup++
    }

    c.calculateInodes(c.usedInodes, c.args.Options.MinimumFreeInodes)

    minFreeBlocks := divide(c.args.Options.MinimumFreeSpace, c.blockSize())
    k := -1

    for {
        k++
        if k > 16 {
            err = errors.New("failed to calculate space in a reasonable number of iterations")
            goto fail
        }

        c.partialCalculateSpace()
        c.freeBlocks = c.totalBlocks - c.usedBlocks

        // Check if we can grow the journal.
        if c.journalBlocks+1 < c.blocksPerAllocGroup()-c.metadataBlocksPerAllocGroup() {
            growth := (c.blocksPerAllocGroup() - c.metadataBlocksPerAllocGroup()) - c.journalBlocks
            if x := c.freeBlocks - minFreeBlocks; x < growth {
                growth = x
            }

            // NOTE: this prevents the inodes section from becoming misaligned
            if growth%2 > 0 {
                growth--
            }

            if growth > 0 {
                c.journalBlocks += growth
                continue
            }
        }

        break
    }

    if err = ctx.Err(); err != nil {
        goto cancel
    }

    // calculate use bitmap
    c.buildBitmap()

    return nil

fail:
    if err == context.Canceled || err == context.DeadlineExceeded {
        goto cancel
    }

    return err

cancel:
    return err

}

func (c *compiler) buildBitmap() {
    c.allocGroupFreeBlocks = make([]int64, c.allocGroups)
    c.allocGroupFreeInodes = make([]int64, c.allocGroups)

    for i := int64(0); i < c.allocGroups; i++ {
        c.allocGroupFreeBlocks[i] = c.blocksPerAllocGroup() - c.metadataBlocksPerAllocGroup()
        c.allocGroupFreeInodes[i] = c.inodesPerAllocGroup()
    }

    // trim space off the last alloc group
    if c.totalBlocks != c.blocksPerAllocGroup()*c.allocGroups {
        c.allocGroupFreeBlocks[c.allocGroups-1] = (c.totalBlocks % c.blocksPerAllocGroup()) - c.metadataBlocksPerAllocGroup()
    }

    // subtract space from first alloc group for the journal
    c.allocGroupFreeBlocks[0] -= c.journalBlocks

    // distribute nodes amongst alloc groups TODO: see if files can be spread out better amongst the alloc groups
    inodes := int64(len(c.nodeBlocks))
    idx := int64(0)
    for {
        delta := c.inodesPerAllocGroup()
        if delta >= inodes {
            c.allocGroupFreeInodes[idx] -= inodes
            inodes = 0
            break
        }
        c.allocGroupFreeInodes[idx] = 0
        inodes -= delta
        idx++
        if idx >= c.allocGroups {
            panic(errors.New("failed to distribute inode but it should have been possible"))
        }
    }

    idx = 0
    for _, nodeBlocks := range c.nodeBlocks {
        blocks := int64(nodeBlocks)
        for {
            delta := c.allocGroupFreeBlocks[idx]
            if delta >= blocks {
                c.allocGroupFreeBlocks[idx] -= blocks
                blocks = 0
                break
            }
            c.allocGroupFreeBlocks[idx] = 0
            blocks -= delta
            idx++
            if idx >= c.allocGroups {
                panic(errors.New("failed to distribute blocks but it should have been possible"))
            }
        }
    }

    // build the bitmap
    c.bitmap = make([]uint64, divide(c.totalBlocks, 64))
    for ag := int64(0); ag < c.allocGroups; ag++ {
        l := c.allocGroupFreeBlocks[ag]
        first := c.blocksPerAllocGroup() - l

        // exception for the final alloc group
        if ag == c.allocGroups-1 && c.totalBlocks != c.blocksPerAllocGroup()*c.allocGroups {
            first -= c.blocksPerAllocGroup() - (c.totalBlocks % c.blocksPerAllocGroup())
        }

        // TODO: improve performance here by treating uint64s instead of bits
        for i := int64(0); i < l; i++ {
            block := first + i
            chunk := block / 64
            bit := block % 64
            c.bitmap[chunk] |= (0x1 << bit)
        }
    }

}
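
// A small helper sketch (not used elsewhere) showing how the bitmap built
// above reads back: a set bit marks a free block.
func (c *compiler) blockIsFree(block int64) bool {
    return c.bitmap[block/64]&(1<<(block%64)) != 0
}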

func (c *compiler) Size() int64 {
    return c.blockSize() * c.totalBlocks
}

func (c *compiler) Compile(ctx context.Context, w io.WriteSeeker) error {

    var err error

    c.data = make(chan *vio.TreeNode)  // TODO: close this safely
    c.nodes = make(chan *vio.TreeNode) // TODO: close this safely
    go c.treeWalkers()

    defer func() {
        c.tree.Close()
        for {
            if _, more := <-c.data; !more {
                break
            }
        }
        for {
            if _, more := <-c.nodes; !more {
                break
            }
        }
    }()

    err = c.writeAllocGroups(ctx, w)
    if err != nil {
        goto fail
    }

    return nil

fail:
    if err == context.Canceled || err == context.DeadlineExceeded {
        goto cancel
    }

    return err

cancel:
    return err
}

func (c *compiler) writeAllocGroups(ctx context.Context, w io.WriteSeeker) error {
    var err error
    for ag := int64(0); ag < c.allocGroups; ag++ {
        err = c.writeAllocGroup(ctx, w, ag)
        if err != nil {
            return err
        }
    }
    return nil
}

func (c *compiler) writeAllocGroup(ctx context.Context, w io.WriteSeeker, ag int64) error {
    var err error

    allocGroupOffset := ag * c.blocksPerAllocGroup() * c.blockSize()
    _, err = w.Seek(allocGroupOffset, io.SeekStart)
    if err != nil {
        return err
    }

    uuid, err := generateUID()
    if err != nil {
        return err
    }

    // superblock
    sb := &SuperBlock{
        MagicNumber: SBMagicNumber,
        BlockSize:   uint32(c.blockSize()),
        DataBlocks:  uint64(c.totalBlocks),
        // UUID:                       [16]byte{0x65, 0x7b, 0x9b, 0x73, 0xfe, 0xa4, 0x44, 0x3e, 0x98, 0x51, 0x6b, 0x01, 0x55, 0x7f, 0xea, 0xab}, // should this be generated?
        UUID:                       uuid,
        LogStart:                   uint64(c.superMetaBlocks() + 7), // first non-metadata thing on the first alloc group
        RootInode:                  c.translateAbsoluteInodeNumber(0),
        RealtimeBitmapInode:        c.translateAbsoluteInodeNumber(1),
        RealtimeSummaryInode:       c.translateAbsoluteInodeNumber(2),
        RealtimeExtentBlocks:       1,
        AGBlocks:                   uint32(c.blocksPerAllocGroup()),
        AGCount:                    uint32(c.allocGroups),
        LogBlocks:                  uint32(c.journalBlocks),
        VersionNum:                 VersionNumber | VersionAlignBit | VersionNlinkBit | VersionLogV2Bit | VersionExtFlgBit | VersionDirV2Bit | VersionMoreBitsBit,
        SectorSize:                 SectorSize,
        InodeSize:                  uint16(c.inodeSize()),
        InodesPerBlock:             uint16(c.inodesPerBlock()),
        FSName:                     [12]byte{'x', 'f', 's', 0, 0, 0, 0, 0, 0, 0, 0, 0}, // should this be configurable?
        BlockSizeLogarithmic:       uint8(c.exponents.blockSize),
        SectorSizeLogarithmic:      sectorSizeLog,
        InodeSizeLogarithmic:       c.exponents.inodeSize,
        InodesPerBlockLogarithmic:  uint8(c.exponents.blockSize - c.exponents.inodeSize),
        AGBlocksLogarithmic:        uint8(c.exponents.blocksPerAllocGroup),
        InodesMaxPercentage:        inodesPercentage,
        InodesAllocated:            uint64(c.allocGroups * c.inodesPerAllocGroup()),
        InodesFree:                 uint64(c.freeInodes),
        DataFree:                   uint64(c.freeBlocks + 4*c.allocGroups), // NOTE: +4/ag is for the reserved free-list blocks, I think.
        DirectoryBlocksLogarithmic: dirBlockAllocLog,
        LogSectorSizeLogarithmic:   0,
        LogSectorSize:              0,
        MoreFeatures:               Version2LazySBCountBit,
        BadFeatures:                Version2LazySBCountBit,
        // UserQuotasInode:            0xffffffff,
        // GroupQuotasInode:           0xffffffff,
        InodeChunkAlignment: 2,
        LogStripeUnit:       1,
    }

    err = binary.Write(w, binary.BigEndian, sb)
    if err != nil {
        return err
    }

    // alloc group free block info
    _, err = w.Seek(allocGroupOffset+c.sectorSize(), io.SeekStart)
    if err != nil {
        return err
    }

    abtbAddr := uint32(c.superMetaBlocks()) + 1
    abtcAddr := abtbAddr + 1
    freeBlocks := uint32(c.allocGroupFreeBlocks[ag])

    agf := &AGF{
        Magic:       AGFMagicNumber,
        Version:     AGFVersion,
        SeqNo:       uint32(ag),
        Length:      uint32(c.blocksPerAllocGroup()),
        Roots:       [2]uint32{abtbAddr, abtcAddr},
        Levels:      [2]uint32{1, 1},
        FLFirst:     0,
        FLLast:      3,
        FLCount:     4,
        FreeBlocks:  freeBlocks,
        Longest:     uint32(c.allocGroupFreeBlocks[ag]),
        BTreeBlocks: 0,
    }

    if ag == c.allocGroups-1 && c.totalBlocks%c.blocksPerAllocGroup() != 0 {
        agf.Length = uint32(c.totalBlocks % c.blocksPerAllocGroup())
    }

    err = binary.Write(w, binary.BigEndian, agf)
    if err != nil {
        return err
    }

    // inode b+ tree info
    _, err = w.Seek(allocGroupOffset+c.sectorSize()*2, io.SeekStart)
    if err != nil {
        return err
    }

    inodeChunks := c.inodesPerAllocGroup() / 64
    chunksPerLeaf := (c.blockSize() - 16) / 16 // -16 for header, /16 for record size
    leavesPerNode := (c.blockSize() - 16) / 8  // -16 for header, /8 for key/pointer pairs
    leavesNeeded := divide(inodeChunks, chunksPerLeaf) // round up: a partially-filled leaf still occupies a whole block
    level := uint32(1)
    if leavesNeeded > 1 {
        level++
        if leavesNeeded > leavesPerNode {
            return errors.New("deep b+ trees unimplemented") // this will trigger if we try to have more than 8323200 inodes per alloc group
        }
        return errors.New("deep b+ trees unimplemented") // this will trigger if we try to have more than 16320 inodes per alloc group
    }

    agi := &AGI{
        Magic:     AGIMagicNumber,
        Version:   AGIVersion,
        SeqNo:     uint32(ag),
        Length:    uint32(c.blocksPerAllocGroup()),
        Count:     uint32(c.inodesPerAllocGroup()), // TODO: - c.allocGroupFreeInodes[ag]), -- is it meant to be this way?
        Root:      uint32(c.superMetaBlocks()),
        Level:     level,
        FreeCount: uint32(c.allocGroupFreeInodes[ag]),
        NewIno:    uint32(c.translateRelativeInodeNumber(ag * c.inodesPerAllocGroup())),
        DirIno:    uint32(0xFFFFFFFF), // NOTE: according to the spec this is NULL (-1)
    }

    for i := 0; i < 64; i++ {
        agi.Unlinked[i] = 0xFFFFFFFF // NOTE: according to the spec this is NULL (-1)
    }

    if ag == c.allocGroups-1 && c.totalBlocks%c.blocksPerAllocGroup() != 0 {
        agi.Length = uint32(c.totalBlocks % c.blocksPerAllocGroup())
    }

    err = binary.Write(w, binary.BigEndian, agi)
    if err != nil {
        return err
    }

    // internal free list info
    _, err = w.Seek(allocGroupOffset+c.sectorSize()*3, io.SeekStart)
    if err != nil {
        return err
    }

    agfl := []uint32{uint32(c.superMetaBlocks()) + 3, uint32(c.superMetaBlocks()) + 4, uint32(c.superMetaBlocks()) + 5, uint32(c.superMetaBlocks()) + 6}

    err = binary.Write(w, binary.BigEndian, agfl)
    if err != nil {
        return err
    }

    err = binary.Write(w, binary.BigEndian, bytes.Repeat([]byte{0xFF}, int(c.sectorSize()-16)))
    if err != nil {
        return err
    }

    // write inode b+ tree
    ibt := &BTreeSBlock{
        Magic:    IBTMagicNumber,
        Level:    0,
        NumRecs:  uint16(inodeChunks),
        LeftSIB:  0xFFFFFFFF,
        RightSIB: 0xFFFFFFFF,
    }

    _, err = w.Seek(allocGroupOffset+int64(agi.Root)*c.blockSize(), io.SeekStart)
    if err != nil {
        return err
    }

    err = binary.Write(w, binary.BigEndian, ibt)
    if err != nil {
        return err
    }

    for i := int64(0); i < inodeChunks; i++ {
        var used int64
        used = c.usedInodes - (ag*c.inodesPerAllocGroup() + i*64)
        if used < 0 {
            used = 0
        } else if used > 64 {
            used = 64
        }
        rec := &InodeBTRecord{
            StartIno:  uint32(c.translateRelativeInodeNumber(i*64 + ag*c.inodesPerAllocGroup())),
            FreeCount: uint32(64 - used),
            Free:      0xFFFFFFFFFFFFFFFF << used,
        }
        err = binary.Write(w, binary.BigEndian, rec)
        if err != nil {
            return err
        }
    }

    // write free space b+ trees
    abtb := &BTreeSBlock{
        Magic:    ABTBMagicNumber,
        Level:    0,
        NumRecs:  1,
        LeftSIB:  0xFFFFFFFF,
        RightSIB: 0xFFFFFFFF,
    }

    firstFreeBlock := agf.Length - freeBlocks
    var ar *AllocRecord

    if c.allocGroupFreeBlocks[ag] > 0 {
        ar = &AllocRecord{
            StartBlock: firstFreeBlock,
            BlockCount: freeBlocks,
        }
    } else {
        abtb.NumRecs = 0
    }

    _, err = w.Seek(allocGroupOffset+int64(agf.Roots[0])*c.blockSize(), io.SeekStart)
    if err != nil {
        return err
    }

    err = binary.Write(w, binary.BigEndian, abtb)
    if err != nil {
        return err
    }

    if ar != nil {
        err = binary.Write(w, binary.BigEndian, ar)
        if err != nil {
            return err
        }
    }

    _, err = w.Seek(allocGroupOffset+int64(agf.Roots[1])*c.blockSize(), io.SeekStart)
    if err != nil {
        return err
    }

    abtc := &BTreeSBlock{
        Magic:    ABTCMagicNumber,
        Level:    0,
        NumRecs:  abtb.NumRecs,
        LeftSIB:  0xFFFFFFFF,
        RightSIB: 0xFFFFFFFF,
    }

    err = binary.Write(w, binary.BigEndian, abtc)
    if err != nil {
        return err
    }

    if ar != nil {
        err = binary.Write(w, binary.BigEndian, ar)
        if err != nil {
            return err
        }
    }

    // free list
    // NOTE: free list should be completely empty for us so we can ignore it

    // journal
    remainder := int64(agf.Length) - c.metadataBlocksPerAllocGroup() // this is how many blocks we have left for copying data
    if ag == 0 {
        _, err = w.Seek(int64(sb.LogStart)*c.blockSize(), io.SeekStart)
        if err != nil {
            return err
        }

        recHeader := &XLogRecHeader{
            Magic:     XLogMagicNumber,
            Cycle:     1,
            Version:   2,
            Len:       uint32(c.sectorSize()),
            LSN:       0x100000000,
            TailLSN:   0x100000000,
            CRC:       0,
            PrevBlock: 0xFFFFFFFF,
            NumLogOps: 1,
            Fmt:       1,
            FSUUID:    sb.UUID,
            Size:      0x8000,
        }
        recHeader.CycleData[0] = 0xB0C0D0D0 // TODO: figure out what this is supposed to mean.

        err = binary.Write(w, binary.BigEndian, recHeader)
        if err != nil {
            return err
        }

        rec := &XLogRecord{
            TransactionID: 1,
            Length:        8,
            ClientID:      0xAA,   // client id (XFS_LOG)
            Flags:         0x20,   // flags (XLOG_UNMOUNT_TRANS)
            Unknown:       0x6E55, // TODO: Figure out what this is supposed to mean.
        }

        err = binary.Write(w, binary.BigEndian, rec)
        if err != nil {
            return err
        }

        remainder -= c.journalBlocks
    }

    // inodes
    for ino := int64(0); ino < c.inodesPerAllocGroup(); ino++ {

        inodeNumber := c.translateRelativeInodeNumber(ino + c.inodesPerAllocGroup()*ag)

        _, err = w.Seek(allocGroupOffset+int64(inodeNumber)*c.inodeSize(), io.SeekStart)
        if err != nil {
            return err
        }

        if ino < c.inodesPerAllocGroup()-c.allocGroupFreeInodes[ag] {
            rdr := c.popInode()
            if rdr == nil {
                return c.nodesError
            }

            _, err = io.CopyN(w, rdr, c.inodeSize())
            if err != nil && err != io.EOF {
                return err
            }
        } else {
            err = binary.Write(w, binary.BigEndian, &InodeCore{
                Magic:        InodeMagicNumber,
                Format:       uint8(ino),
                Version:      2,
                NextUnlinked: 0xFFFFFFFF,
            })
            if err != nil {
                return err
            }
        }
    }

    if err = ctx.Err(); err != nil {
        return err
    }

    _, err = w.Seek(allocGroupOffset+(int64(agf.Length)-remainder)*c.blockSize(), io.SeekStart)
    if err != nil {
        return err
    }

    // data & metadata
    for remainder > int64(freeBlocks) {
        rdr := c.popDataBlock()
        if rdr == nil {
            return c.dataError
        }

        _, err = io.CopyN(w, rdr, c.blockSize())
        if err != nil && err != io.EOF {
            return err
        }
        remainder--
    }

    if remainder != 0 { // force a write to the end of the alloc group just in case that matters
        _, err = w.Seek(allocGroupOffset+int64(agf.Length)*c.blockSize()-1, io.SeekStart)
        if err != nil {
            return err
        }
        _, err = io.Copy(w, bytes.NewReader([]byte{0}))
        if err != nil {
            return err
        }
    }

    return nil
}
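
// For reference, the per-alloc-group layout produced above, in block order:
// the four metadata sectors (superblock, AGF, AGI, AGFL) packed into
// superMetaBlocks(); the inode b+ tree root, the two free-space b+ tree roots,
// and the four reserved free-list blocks (the "+ 7" in
// metadataBlocksPerAllocGroup); the journal (alloc group 0 only); the inode
// chunks; and finally file data.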

func (c *compiler) inodeNumberFromNode(n *vio.TreeNode) uint64 {
    return c.translateAbsoluteInodeNumber(n.NodeSequenceNumber)
}

func (c *compiler) popDataBlock() io.Reader {

    for {
        if c.dataReaderBlocksRemaining > 0 {
            c.dataReaderBlocksRemaining--
            return c.dataReader
        }

        n, more := <-c.data
        if !more {
            return nil
        }

        c.dataReaderBlocksRemaining = int64(c.nodeBlocks[c.nodeCounter])

        c.nodeCounter++

        if c.dataReaderBlocksRemaining == 0 {
            continue
        }

        if n.File.IsDir() {
            c.dataReader = c.generateDirectoryBlockData(n, c.dataReaderBlocksRemaining)
        } else {
            // symlinks and regular files are both written out as raw data padded with
            // zeroes to a whole number of blocks (TODO: verify this is sufficient for symlinks)
            _ = c.computeNodeExtents(n.NodeSequenceNumber, &dataRange{blocks: c.dataReaderBlocksRemaining}) // called here to ensure extents are computed in order
            c.dataReader = io.MultiReader(n.File, io.LimitReader(vio.Zeroes, c.dataReaderBlocksRemaining*c.blockSize()-int64(n.File.Size())))
        }
    }
}

type extent struct {
    first  uint64
    length int64
    offset int64
}

// translateBlock maps the x'th logical data block onto its on-disk address,
// skipping over each alloc group's metadata region. It returns the translated
// block address and the number of data blocks remaining in that alloc group
// (the maximum length of an extent starting at the returned address).
func (c *compiler) translateBlock(x int64) (uint64, int64) {
    block := int64(0)
    block += c.metadataBlocksPerAllocGroup()
    block += c.journalBlocks
    delta := c.blocksPerAllocGroup() - block
    for {
        if x < delta && delta > 0 {
            delta -= x
            block += x
            x = 0
            break
        } else {
            block += delta
            x -= delta
            block += c.metadataBlocksPerAllocGroup()
            delta = c.blocksPerAllocGroup() - c.metadataBlocksPerAllocGroup()
        }
    }

    return c.translateAbsoluteBlockNumber(block), delta
}

func (c *compiler) computeExtents(x int64, commit bool) []*extent {
    var extents = []*extent{}
    first := c.countedNodeBlocks
    length := x
    offset := int64(0)

    for length > 0 {
        e := &extent{}
        e.first, e.length = c.translateBlock(first)
        if e.length > length {
            e.length = length
        }
        first += e.length
        length -= e.length
        e.offset = offset
        offset += e.length
        extents = append(extents, e)
    }

    if commit {
        c.countedNodeBlocks += x
    }
    return extents
}
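
// For instance, a 3-block file whose data starts 2 blocks before the end of an
// alloc group's data area would come back as two extents: one of length 2 at
// offset 0 in that group, and one of length 1 at offset 2 starting just past
// the next group's metadata.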

type dataRange struct {
    blocks int64
    offset int64
}

func (c *compiler) computeNodeExtents(ino int64, fragments ...*dataRange) []*extent {

    var e []*extent

    if e = c.nodeExtents[ino]; e != nil {
        c.nodeExtents[ino] = nil // free memory (this should never be needed more than twice)
        return e
    }

    if c.lastCalculatedNode >= ino {
        panic(errors.New("went backwards calculating node extents"))
    }

    if len(fragments) == 0 {
        panic(errors.New("must compute at least one extent fragment"))
    }

    for _, frag := range fragments {
        x := c.computeExtents(frag.blocks, true)
        offset := frag.offset
        for i := 0; i < len(x); i++ {
            x[i].offset = offset
            offset += x[i].length
        }
        e = append(e, x...)
    }

    c.lastCalculatedNode = ino
    c.nodeExtents[ino] = e
    return e
}

func (c *compiler) computeNodeExtentsDryrun(ino int64, fragments ...int64) []*extent {

    var e []*extent

    if e = c.nodeExtents[ino]; e != nil {
        return e
    }

    if c.lastCalculatedNode >= ino {
        panic(errors.New("went backwards calculating node extents"))
    }

    if len(fragments) == 0 {
        panic(errors.New("must compute at least one extent fragment"))
    }

    for _, frag := range fragments {
        e = append(e, c.computeExtents(frag, false)...)
    }

    // c.lastCalculatedNode = ino
    // c.nodeExtents[ino] = e
    return e
}

func (c *compiler) popInode() io.Reader {

    n, more := <-c.nodes
    if !more {
        return nil
    }

    var format uint8
    var mode uint16
    var nextents int32
    var size int64
    var nblocks uint64

    extents := make([]*extent, 0)
    nblocks = uint64(c.nodeBlocks[n.NodeSequenceNumber])
    var data []byte

    if n.File.IsDir() {
        mode = 0x4000 | 0700
        format = InodeFormatLocal
        if nblocks > 0 {
            format = InodeFormatExtents
        }
        size, data, extents = c.generateDirectory(n)
    } else if n.File.IsSymlink() {
        mode = 0xA000 | 0700
        format = InodeFormatLocal
        if nblocks > 0 {
            extents = c.computeNodeExtents(n.NodeSequenceNumber, &dataRange{blocks: int64(nblocks)})
            format = InodeFormatExtents
        }
        size = int64(n.File.Size())
    } else { // regular file
        mode = 0x8000 | 0700
        format = InodeFormatExtents
        size = int64(n.File.Size())
        if size > 0 {
            extents = c.computeNodeExtents(n.NodeSequenceNumber, &dataRange{blocks: int64(nblocks)})
        }
    }

    nextents = int32(len(extents))

    core := &InodeCore{
        Magic:   InodeMagicNumber,
        Mode:    mode,
        Version: 2,
        Format:  format,
        // Onlink:  uint16(n.Links),
        UID:     1000,
        GID:     1000,
        Nlink:   uint32(n.Links),
        Size:    size,
        NBlocks: nblocks,
        // ExtSize TODO: is this necessary?
        NExtents:     nextents,
        AFormat:      InodeFormatExtents,
        NextUnlinked: 0xFFFFFFFF,
    }

    buf := new(bytes.Buffer)
    err := binary.Write(buf, binary.BigEndian, core)
    if err != nil {
        panic(err)
    }

    // write data fork

    if format == InodeFormatExtents {
        maxExtents := c.inodeDataCapacity / 16
        if int64(nextents) > maxExtents {
            panic(errors.New("super nested extents not yet supported")) // TODO
        }
        for _, e := range extents {

            var xe, xoffset, xnumber, xblocks uint128.Uint128

            xblocks.L = uint64(e.length)
            xblocks.L &= 0x1FFFFF
            xblocks = xblocks.ShiftLeft(0)

            xnumber.L = uint64(e.first)
            xnumber.L &= 0x0FFFFFFFFFFFFF
            xnumber = xnumber.ShiftLeft(21)

            xoffset.L = uint64(e.offset)
            xoffset.L &= 0x3FFFFFFFFFFFFF
            xoffset = xoffset.ShiftLeft(73)

            xe = xblocks.Or(xnumber).Or(xoffset)

            err = binary.Write(buf, binary.BigEndian, xe)
            if err != nil {
                panic(err)
            }

            // TODO: check this extent packing (I'm suspicious of the quality of my work when I wrote it)
        }
    } else if format == InodeFormatLocal {
        if n.File.IsDir() {
            _, err = io.Copy(buf, bytes.NewReader(data))
            if err != nil {
                panic(err)
            }
        } else if n.File.IsSymlink() {
            _, err := io.Copy(buf, strings.NewReader(n.File.Symlink()))
            if err != nil {
                panic(err) // NOTE: this is only supported if the filetree supports looking up and caching this value in advance, which should make errors impossible
            }
        } else {
            panic(errors.New("attempted to write local-format inode for an unsupported file-type"))
        }
    } else {
        panic(errors.New("attempted to write an unsupported inode format"))
    }

    return bytes.NewReader(buf.Bytes())

}
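
// The 128-bit packing above follows the on-disk extent record layout: bits
// 0-20 hold the block count, bits 21-72 the absolute start block, and bits
// 73-126 the logical file offset (bit 127, the unwritten-extent flag, is never
// set here). A hypothetical decoder, included only to document the layout and
// assuming the library's exported H/L halves:
func decodePackedExtent(xe uint128.Uint128) (offset int64, first uint64, length int64) {
    length = int64(xe.L & 0x1FFFFF)
    first = (xe.L >> 21) | ((xe.H & 0x1FF) << 43)
    offset = int64((xe.H >> 9) & 0x3FFFFFFFFFFFFF)
    return offset, first, length
}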

func (c *compiler) treeWalkers() {

    // realtime bitmap
    rtbmap := &vio.TreeNode{
        File: vio.CustomFile(vio.CustomFileArgs{
            Size:       0,
            ReadCloser: io.NopCloser(bytes.NewReader([]byte{})),
        }),
        NodeSequenceNumber: 1,
        Links:              1,
    }

    // realtime summary
    rtsummary := &vio.TreeNode{
        File: vio.CustomFile(vio.CustomFileArgs{
            Size:       0,
            ReadCloser: io.NopCloser(bytes.NewReader([]byte{})),
        }),
        NodeSequenceNumber: 2,
        Links:              1,
    }

    // one walker for data blocks
    go func() {
        var previous *vio.TreeNode
        c.dataError = c.tree.WalkNode(func(path string, n *vio.TreeNode) error {
            c.data <- n
            if previous != nil && previous.File != nil {
                err := previous.File.Close()
                if err != nil {
                    return err
                }
            }
            previous = n

            // insert realtime nodes
            if path == "." {
                c.data <- rtbmap
                c.data <- rtsummary
            }

            return nil
        })
        close(c.data)
    }()

    // one walker for inode information
    go func() {
        c.nodesError = c.tree.WalkNode(func(path string, n *vio.TreeNode) error {
            c.nodes <- n

            // insert realtime inodes
            if path == "." {
                c.nodes <- rtbmap
                c.nodes <- rtsummary
            }

            return nil
        })
        close(c.nodes)
    }()

}