tools/wrk/deps/luajit/src/vm_x86.dasc

Summary

Maintainability
Test Coverage
|// Low-level VM code for x86 CPUs.
|// Bytecode interpreter, fast functions and helper functions.
|// Copyright (C) 2005-2013 Mike Pall. See Copyright Notice in luajit.h
|
|.if P64
|.arch x64
|.else
|.arch x86
|.endif
|.section code_op, code_sub
|
|.actionlist build_actionlist
|.globals GLOB_
|.globalnames globnames
|.externnames extnames
|
|//-----------------------------------------------------------------------
|
|.if P64
|.define X64, 1
|.define SSE, 1
|.if WIN
|.define X64WIN, 1
|.endif
|.endif
|
|// Fixed register assignments for the interpreter.
|// This is very fragile and has many dependencies. Caveat emptor.
|.define BASE,        edx        // Not C callee-save, refetched anyway.
|.if not X64
|.define KBASE,        edi        // Must be C callee-save.
|.define KBASEa,    KBASE
|.define PC,        esi        // Must be C callee-save.
|.define PCa,        PC
|.define DISPATCH,    ebx        // Must be C callee-save.
|.elif X64WIN
|.define KBASE,        edi        // Must be C callee-save.
|.define KBASEa,    rdi
|.define PC,        esi        // Must be C callee-save.
|.define PCa,        rsi
|.define DISPATCH,    ebx        // Must be C callee-save.
|.else
|.define KBASE,        r15d        // Must be C callee-save.
|.define KBASEa,    r15
|.define PC,        ebx        // Must be C callee-save.
|.define PCa,        rbx
|.define DISPATCH,    r14d        // Must be C callee-save.
|.endif
|
|.define RA,        ecx
|.define RAH,        ch
|.define RAL,        cl
|.define RB,        ebp        // Must be ebp (C callee-save).
|.define RC,        eax        // Must be eax.
|.define RCW,        ax
|.define RCH,        ah
|.define RCL,        al
|.define OP,        RB
|.define RD,        RC
|.define RDW,        RCW
|.define RDL,        RCL
|.if X64
|.define RAa, rcx
|.define RBa, rbp
|.define RCa, rax
|.define RDa, rax
|.else
|.define RAa, RA
|.define RBa, RB
|.define RCa, RC
|.define RDa, RD
|.endif
|
|.if not X64
|.define FCARG1,    ecx        // x86 fastcall arguments.
|.define FCARG2,    edx
|.elif X64WIN
|.define CARG1,        rcx        // x64/WIN64 C call arguments.
|.define CARG2,        rdx
|.define CARG3,        r8
|.define CARG4,        r9
|.define CARG1d,    ecx
|.define CARG2d,    edx
|.define CARG3d,    r8d
|.define CARG4d,    r9d
|.define FCARG1,    CARG1d        // Upwards compatible to x86 fastcall.
|.define FCARG2,    CARG2d
|.else
|.define CARG1,        rdi        // x64/POSIX C call arguments.
|.define CARG2,        rsi
|.define CARG3,        rdx
|.define CARG4,        rcx
|.define CARG5,        r8
|.define CARG6,        r9
|.define CARG1d,    edi
|.define CARG2d,    esi
|.define CARG3d,    edx
|.define CARG4d,    ecx
|.define CARG5d,    r8d
|.define CARG6d,    r9d
|.define FCARG1,    CARG1d        // Simulate x86 fastcall.
|.define FCARG2,    CARG2d
|.endif
|
|// Type definitions. Some of these are only used for documentation.
|.type L,        lua_State
|.type GL,        global_State
|.type TVALUE,        TValue
|.type GCOBJ,        GCobj
|.type STR,        GCstr
|.type TAB,        GCtab
|.type LFUNC,        GCfuncL
|.type CFUNC,        GCfuncC
|.type PROTO,        GCproto
|.type UPVAL,        GCupval
|.type NODE,        Node
|.type NARGS,        int
|.type TRACE,        GCtrace
|
|// Stack layout while in interpreter. Must match with lj_frame.h.
|//-----------------------------------------------------------------------
|.if not X64        // x86 stack layout.
|
|.define CFRAME_SPACE,    aword*7            // Delta for esp (see <--).
|.macro saveregs_
|  push edi; push esi; push ebx
|  sub esp, CFRAME_SPACE
|.endmacro
|.macro saveregs
|  push ebp; saveregs_
|.endmacro
|.macro restoreregs
|  add esp, CFRAME_SPACE
|  pop ebx; pop esi; pop edi; pop ebp
|.endmacro
|
|.define SAVE_ERRF,    aword [esp+aword*15]    // vm_pcall/vm_cpcall only.
|.define SAVE_NRES,    aword [esp+aword*14]
|.define SAVE_CFRAME,    aword [esp+aword*13]
|.define SAVE_L,    aword [esp+aword*12]
|//----- 16 byte aligned, ^^^ arguments from C caller
|.define SAVE_RET,    aword [esp+aword*11]    //<-- esp entering interpreter.
|.define SAVE_R4,    aword [esp+aword*10]
|.define SAVE_R3,    aword [esp+aword*9]
|.define SAVE_R2,    aword [esp+aword*8]
|//----- 16 byte aligned
|.define SAVE_R1,    aword [esp+aword*7]    //<-- esp after register saves.
|.define SAVE_PC,    aword [esp+aword*6]
|.define TMP2,        aword [esp+aword*5]
|.define TMP1,        aword [esp+aword*4]
|//----- 16 byte aligned
|.define ARG4,        aword [esp+aword*3]
|.define ARG3,        aword [esp+aword*2]
|.define ARG2,        aword [esp+aword*1]
|.define ARG1,        aword [esp]        //<-- esp while in interpreter.
|//----- 16 byte aligned, ^^^ arguments for C callee
|
|// FPARGx overlaps ARGx and ARG(x+1) on x86.
|.define FPARG3,    qword [esp+qword*1]
|.define FPARG1,    qword [esp]
|// TMPQ overlaps TMP1/TMP2. ARG5/MULTRES overlap TMP1/TMP2 (and TMPQ).
|.define TMPQ,        qword [esp+aword*4]
|.define TMP3,        ARG4
|.define ARG5,        TMP1
|.define TMPa,        TMP1
|.define MULTRES,    TMP2
|
|// Arguments for vm_call and vm_pcall.
|.define INARG_BASE,    SAVE_CFRAME        // Overwritten by SAVE_CFRAME!
|
|// Arguments for vm_cpcall.
|.define INARG_CP_CALL,    SAVE_ERRF
|.define INARG_CP_UD,    SAVE_NRES
|.define INARG_CP_FUNC,    SAVE_CFRAME
|
|//-----------------------------------------------------------------------
|.elif X64WIN        // x64/Windows stack layout
|
|.define CFRAME_SPACE,    aword*5            // Delta for rsp (see <--).
|.macro saveregs_
|  push rdi; push rsi; push rbx
|  sub rsp, CFRAME_SPACE
|.endmacro
|.macro saveregs
|  push rbp; saveregs_
|.endmacro
|.macro restoreregs
|  add rsp, CFRAME_SPACE
|  pop rbx; pop rsi; pop rdi; pop rbp
|.endmacro
|
|.define SAVE_CFRAME,    aword [rsp+aword*13]
|.define SAVE_PC,    dword [rsp+dword*25]
|.define SAVE_L,    dword [rsp+dword*24]
|.define SAVE_ERRF,    dword [rsp+dword*23]
|.define SAVE_NRES,    dword [rsp+dword*22]
|.define TMP2,        dword [rsp+dword*21]
|.define TMP1,        dword [rsp+dword*20]
|//----- 16 byte aligned, ^^^ 32 byte register save area, owned by interpreter
|.define SAVE_RET,    aword [rsp+aword*9]    //<-- rsp entering interpreter.
|.define SAVE_R4,    aword [rsp+aword*8]
|.define SAVE_R3,    aword [rsp+aword*7]
|.define SAVE_R2,    aword [rsp+aword*6]
|.define SAVE_R1,    aword [rsp+aword*5]    //<-- rsp after register saves.
|.define ARG5,        aword [rsp+aword*4]
|.define CSAVE_4,    aword [rsp+aword*3]
|.define CSAVE_3,    aword [rsp+aword*2]
|.define CSAVE_2,    aword [rsp+aword*1]
|.define CSAVE_1,    aword [rsp]        //<-- rsp while in interpreter.
|//----- 16 byte aligned, ^^^ 32 byte register save area, owned by callee
|
|// TMPQ overlaps TMP1/TMP2. MULTRES overlaps TMP2 (and TMPQ).
|.define TMPQ,        qword [rsp+aword*10]
|.define MULTRES,    TMP2
|.define TMPa,        ARG5
|.define ARG5d,        dword [rsp+aword*4]
|.define TMP3,        ARG5d
|
|//-----------------------------------------------------------------------
|.else            // x64/POSIX stack layout
|
|.define CFRAME_SPACE,    aword*5            // Delta for rsp (see <--).
|.macro saveregs_
|  push rbx; push r15; push r14
|  sub rsp, CFRAME_SPACE
|.endmacro
|.macro saveregs
|  push rbp; saveregs_
|.endmacro
|.macro restoreregs
|  add rsp, CFRAME_SPACE
|  pop r14; pop r15; pop rbx; pop rbp
|.endmacro
|
|//----- 16 byte aligned,
|.define SAVE_RET,    aword [rsp+aword*9]    //<-- rsp entering interpreter.
|.define SAVE_R4,    aword [rsp+aword*8]
|.define SAVE_R3,    aword [rsp+aword*7]
|.define SAVE_R2,    aword [rsp+aword*6]
|.define SAVE_R1,    aword [rsp+aword*5]    //<-- rsp after register saves.
|.define SAVE_CFRAME,    aword [rsp+aword*4]
|.define SAVE_PC,    dword [rsp+dword*7]
|.define SAVE_L,    dword [rsp+dword*6]
|.define SAVE_ERRF,    dword [rsp+dword*5]
|.define SAVE_NRES,    dword [rsp+dword*4]
|.define TMPa,        aword [rsp+aword*1]
|.define TMP2,        dword [rsp+dword*1]
|.define TMP1,        dword [rsp]        //<-- rsp while in interpreter.
|//----- 16 byte aligned
|
|// TMPQ overlaps TMP1/TMP2. MULTRES overlaps TMP2 (and TMPQ).
|.define TMPQ,        qword [rsp]
|.define TMP3,        dword [rsp+aword*1]
|.define MULTRES,    TMP2
|
|.endif
|
|//-----------------------------------------------------------------------
|
|// Instruction headers.
|.macro ins_A; .endmacro
|.macro ins_AD; .endmacro
|.macro ins_AJ; .endmacro
|.macro ins_ABC; movzx RB, RCH; movzx RC, RCL; .endmacro
|.macro ins_AB_; movzx RB, RCH; .endmacro
|.macro ins_A_C; movzx RC, RCL; .endmacro
|.macro ins_AND; not RDa; .endmacro
|
|// Instruction decode+dispatch. Carefully tuned (nope, lodsd is not faster).
|.macro ins_NEXT
|  mov RC, [PC]
|  movzx RA, RCH
|  movzx OP, RCL
|  add PC, 4
|  shr RC, 16
|.if X64
|  jmp aword [DISPATCH+OP*8]
|.else
|  jmp aword [DISPATCH+OP*4]
|.endif
|.endmacro
|
|// Instruction footer.
|.if 1
|  // Replicated dispatch. Less unpredictable branches, but higher I-Cache use.
|  .define ins_next, ins_NEXT
|  .define ins_next_, ins_NEXT
|.else
|  // Common dispatch. Lower I-Cache use, only one (very) unpredictable branch.
|  // Affects only certain kinds of benchmarks (and only with -j off).
|  // Around 10%-30% slower on Core2, a lot more slower on P4.
|  .macro ins_next
|    jmp ->ins_next
|  .endmacro
|  .macro ins_next_
|  ->ins_next:
|    ins_NEXT
|  .endmacro
|.endif
|
|// Call decode and dispatch.
|.macro ins_callt
|  // BASE = new base, RB = LFUNC, RD = nargs+1, [BASE-4] = PC
|  mov PC, LFUNC:RB->pc
|  mov RA, [PC]
|  movzx OP, RAL
|  movzx RA, RAH
|  add PC, 4
|.if X64
|  jmp aword [DISPATCH+OP*8]
|.else
|  jmp aword [DISPATCH+OP*4]
|.endif
|.endmacro
|
|.macro ins_call
|  // BASE = new base, RB = LFUNC, RD = nargs+1
|  mov [BASE-4], PC
|  ins_callt
|.endmacro
|
|//-----------------------------------------------------------------------
|
|// Macros to test operand types.
|.macro checktp, reg, tp;  cmp dword [BASE+reg*8+4], tp; .endmacro
|.macro checknum, reg, target; checktp reg, LJ_TISNUM; jae target; .endmacro
|.macro checkint, reg, target; checktp reg, LJ_TISNUM; jne target; .endmacro
|.macro checkstr, reg, target; checktp reg, LJ_TSTR; jne target; .endmacro
|.macro checktab, reg, target; checktp reg, LJ_TTAB; jne target; .endmacro
|
|// These operands must be used with movzx.
|.define PC_OP, byte [PC-4]
|.define PC_RA, byte [PC-3]
|.define PC_RB, byte [PC-1]
|.define PC_RC, byte [PC-2]
|.define PC_RD, word [PC-2]
|
|.macro branchPC, reg
|  lea PC, [PC+reg*4-BCBIAS_J*4]
|.endmacro
|
|// Assumes DISPATCH is relative to GL.
#define DISPATCH_GL(field)    (GG_DISP2G + (int)offsetof(global_State, field))
#define DISPATCH_J(field)    (GG_DISP2J + (int)offsetof(jit_State, field))
|
#define PC2PROTO(field)  ((int)offsetof(GCproto, field)-(int)sizeof(GCproto))
|
|// Decrement hashed hotcount and trigger trace recorder if zero.
|.macro hotloop, reg
|  mov reg, PC
|  shr reg, 1
|  and reg, HOTCOUNT_PCMASK
|  sub word [DISPATCH+reg+GG_DISP2HOT], HOTCOUNT_LOOP
|  jb ->vm_hotloop
|.endmacro
|
|.macro hotcall, reg
|  mov reg, PC
|  shr reg, 1
|  and reg, HOTCOUNT_PCMASK
|  sub word [DISPATCH+reg+GG_DISP2HOT], HOTCOUNT_CALL
|  jb ->vm_hotcall
|.endmacro
|
|// Set current VM state.
|.macro set_vmstate, st
|  mov dword [DISPATCH+DISPATCH_GL(vmstate)], ~LJ_VMST_..st
|.endmacro
|
|// x87 compares.
|.macro fcomparepp            // Compare and pop st0 >< st1.
|  fucomip st1
|  fpop
|.endmacro
|
|.macro fdup; fld st0; .endmacro
|.macro fpop1; fstp st1; .endmacro
|
|// Synthesize SSE FP constants.
|.macro sseconst_abs, reg, tmp        // Synthesize abs mask.
|.if X64
|  mov64 tmp, U64x(7fffffff,ffffffff); movd reg, tmp
|.else
|  pxor reg, reg; pcmpeqd reg, reg; psrlq reg, 1
|.endif
|.endmacro
|
|.macro sseconst_hi, reg, tmp, val    // Synthesize hi-32 bit const.
|.if X64
|  mov64 tmp, U64x(val,00000000); movd reg, tmp
|.else
|  mov tmp, 0x .. val; movd reg, tmp; pshufd reg, reg, 0x51
|.endif
|.endmacro
|
|.macro sseconst_sign, reg, tmp        // Synthesize sign mask.
|  sseconst_hi reg, tmp, 80000000
|.endmacro
|.macro sseconst_1, reg, tmp        // Synthesize 1.0.
|  sseconst_hi reg, tmp, 3ff00000
|.endmacro
|.macro sseconst_m1, reg, tmp        // Synthesize -1.0.
|  sseconst_hi reg, tmp, bff00000
|.endmacro
|.macro sseconst_2p52, reg, tmp        // Synthesize 2^52.
|  sseconst_hi reg, tmp, 43300000
|.endmacro
|.macro sseconst_tobit, reg, tmp    // Synthesize 2^52 + 2^51.
|  sseconst_hi reg, tmp, 43380000
|.endmacro
|
|// Move table write barrier back. Overwrites reg.
|.macro barrierback, tab, reg
|  and byte tab->marked, (uint8_t)~LJ_GC_BLACK    // black2gray(tab)
|  mov reg, [DISPATCH+DISPATCH_GL(gc.grayagain)]
|  mov [DISPATCH+DISPATCH_GL(gc.grayagain)], tab
|  mov tab->gclist, reg
|.endmacro
|
|//-----------------------------------------------------------------------

/* Generate subroutines used by opcodes and other parts of the VM. */
/* The .code_sub section should be last to help static branch prediction. */
static void build_subroutines(BuildCtx *ctx)
{
  |.code_sub
  |
  |//-----------------------------------------------------------------------
  |//-- Return handling ----------------------------------------------------
  |//-----------------------------------------------------------------------
  |
  |->vm_returnp:
  |  test PC, FRAME_P
  |  jz ->cont_dispatch
  |
  |  // Return from pcall or xpcall fast func.
  |  and PC, -8
  |  sub BASE, PC            // Restore caller base.
  |  lea RAa, [RA+PC-8]            // Rebase RA and prepend one result.
  |  mov PC, [BASE-4]            // Fetch PC of previous frame.
  |  // Prepending may overwrite the pcall frame, so do it at the end.
  |  mov dword [BASE+RA+4], LJ_TTRUE    // Prepend true to results.
  |
  |->vm_returnc:
  |  add RD, 1                // RD = nresults+1
  |  jz ->vm_unwind_yield
  |  mov MULTRES, RD
  |  test PC, FRAME_TYPE
  |  jz ->BC_RET_Z            // Handle regular return to Lua.
  |
  |->vm_return:
  |  // BASE = base, RA = resultofs, RD = nresults+1 (= MULTRES), PC = return
  |  xor PC, FRAME_C
  |  test PC, FRAME_TYPE
  |  jnz ->vm_returnp
  |
  |  // Return to C.
  |  set_vmstate C
  |  and PC, -8
  |  sub PC, BASE
  |  neg PC                // Previous base = BASE - delta.
  |
  |  sub RD, 1
  |  jz >2
  |1:  // Move results down.
  |.if X64
  |  mov RBa, [BASE+RA]
  |  mov [BASE-8], RBa
  |.else
  |  mov RB, [BASE+RA]
  |  mov [BASE-8], RB
  |  mov RB, [BASE+RA+4]
  |  mov [BASE-4], RB
  |.endif
  |  add BASE, 8
  |  sub RD, 1
  |  jnz <1
  |2:
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, PC
  |3:
  |  mov RD, MULTRES
  |  mov RA, SAVE_NRES            // RA = wanted nresults+1
  |4:
  |  cmp RA, RD
  |  jne >6                // More/less results wanted?
  |5:
  |  sub BASE, 8
  |  mov L:RB->top, BASE
  |
  |->vm_leave_cp:
  |  mov RAa, SAVE_CFRAME        // Restore previous C frame.
  |  mov L:RB->cframe, RAa
  |  xor eax, eax            // Ok return status for vm_pcall.
  |
  |->vm_leave_unw:
  |  restoreregs
  |  ret
  |
  |6:
  |  jb >7                // Less results wanted?
  |  // More results wanted. Check stack size and fill up results with nil.
  |  cmp BASE, L:RB->maxstack
  |  ja >8
  |  mov dword [BASE-4], LJ_TNIL
  |  add BASE, 8
  |  add RD, 1
  |  jmp <4
  |
  |7:  // Less results wanted.
  |  test RA, RA
  |  jz <5                // But check for LUA_MULTRET+1.
  |  sub RA, RD                // Negative result!
  |  lea BASE, [BASE+RA*8]        // Correct top.
  |  jmp <5
  |
  |8:  // Corner case: need to grow stack for filling up results.
  |  // This can happen if:
  |  // - A C function grows the stack (a lot).
  |  // - The GC shrinks the stack in between.
  |  // - A return back from a lua_call() with (high) nresults adjustment.
  |  mov L:RB->top, BASE        // Save current top held in BASE (yes).
  |  mov MULTRES, RD            // Need to fill only remainder with nil.
  |  mov FCARG2, RA
  |  mov FCARG1, L:RB
  |  call extern lj_state_growstack@8    // (lua_State *L, int n)
  |  mov BASE, L:RB->top        // Need the (realloced) L->top in BASE.
  |  jmp <3
  |
  |->vm_unwind_yield:
  |  mov al, LUA_YIELD
  |  jmp ->vm_unwind_c_eh
  |
  |->vm_unwind_c@8:            // Unwind C stack, return from vm_pcall.
  |  // (void *cframe, int errcode)
  |.if X64
  |  mov eax, CARG2d            // Error return status for vm_pcall.
  |  mov rsp, CARG1
  |.else
  |  mov eax, FCARG2            // Error return status for vm_pcall.
  |  mov esp, FCARG1
  |.endif
  |->vm_unwind_c_eh:            // Landing pad for external unwinder.
  |  mov L:RB, SAVE_L
  |  mov GL:RB, L:RB->glref
  |  mov dword GL:RB->vmstate, ~LJ_VMST_C
  |  jmp ->vm_leave_unw
  |
  |->vm_unwind_rethrow:
  |.if X64 and not X64WIN
  |  mov FCARG1, SAVE_L
  |  mov FCARG2, eax
  |  restoreregs
  |  jmp extern lj_err_throw@8        // (lua_State *L, int errcode)
  |.endif
  |
  |->vm_unwind_ff@4:            // Unwind C stack, return from ff pcall.
  |  // (void *cframe)
  |.if X64
  |  and CARG1, CFRAME_RAWMASK
  |  mov rsp, CARG1
  |.else
  |  and FCARG1, CFRAME_RAWMASK
  |  mov esp, FCARG1
  |.endif
  |->vm_unwind_ff_eh:            // Landing pad for external unwinder.
  |  mov L:RB, SAVE_L
  |  mov RAa, -8            // Results start at BASE+RA = BASE-8.
  |  mov RD, 1+1            // Really 1+2 results, incr. later.
  |  mov BASE, L:RB->base
  |  mov DISPATCH, L:RB->glref        // Setup pointer to dispatch table.
  |  add DISPATCH, GG_G2DISP
  |  mov PC, [BASE-4]            // Fetch PC of previous frame.
  |  mov dword [BASE-4], LJ_TFALSE    // Prepend false to error message.
  |  set_vmstate INTERP
  |  jmp ->vm_returnc            // Increments RD/MULTRES and returns.
  |
  |//-----------------------------------------------------------------------
  |//-- Grow stack for calls -----------------------------------------------
  |//-----------------------------------------------------------------------
  |
  |->vm_growstack_c:            // Grow stack for C function.
  |  mov FCARG2, LUA_MINSTACK
  |  jmp >2
  |
  |->vm_growstack_v:            // Grow stack for vararg Lua function.
  |  sub RD, 8
  |  jmp >1
  |
  |->vm_growstack_f:            // Grow stack for fixarg Lua function.
  |  // BASE = new base, RD = nargs+1, RB = L, PC = first PC
  |  lea RD, [BASE+NARGS:RD*8-8]
  |1:
  |  movzx RA, byte [PC-4+PC2PROTO(framesize)]
  |  add PC, 4                // Must point after first instruction.
  |  mov L:RB->base, BASE
  |  mov L:RB->top, RD
  |  mov SAVE_PC, PC
  |  mov FCARG2, RA
  |2:
  |  // RB = L, L->base = new base, L->top = top
  |  mov FCARG1, L:RB
  |  call extern lj_state_growstack@8    // (lua_State *L, int n)
  |  mov BASE, L:RB->base
  |  mov RD, L:RB->top
  |  mov LFUNC:RB, [BASE-8]
  |  sub RD, BASE
  |  shr RD, 3
  |  add NARGS:RD, 1
  |  // BASE = new base, RB = LFUNC, RD = nargs+1
  |  ins_callt                // Just retry the call.
  |
  |//-----------------------------------------------------------------------
  |//-- Entry points into the assembler VM ---------------------------------
  |//-----------------------------------------------------------------------
  |
  |->vm_resume:                // Setup C frame and resume thread.
  |  // (lua_State *L, TValue *base, int nres1 = 0, ptrdiff_t ef = 0)
  |  saveregs
  |.if X64
  |  mov L:RB, CARG1d            // Caveat: CARG1d may be RA.
  |  mov SAVE_L, CARG1d
  |  mov RA, CARG2d
  |.else
  |  mov L:RB, SAVE_L
  |  mov RA, INARG_BASE            // Caveat: overlaps SAVE_CFRAME!
  |.endif
  |  mov PC, FRAME_CP
  |  xor RD, RD
  |  lea KBASEa, [esp+CFRAME_RESUME]
  |  mov DISPATCH, L:RB->glref        // Setup pointer to dispatch table.
  |  add DISPATCH, GG_G2DISP
  |  mov L:RB->cframe, KBASEa
  |  mov SAVE_PC, RD            // Any value outside of bytecode is ok.
  |  mov SAVE_CFRAME, RDa
  |.if X64
  |  mov SAVE_NRES, RD
  |  mov SAVE_ERRF, RD
  |.endif
  |  cmp byte L:RB->status, RDL
  |  je >3                // Initial resume (like a call).
  |
  |  // Resume after yield (like a return).
  |  set_vmstate INTERP
  |  mov byte L:RB->status, RDL
  |  mov BASE, L:RB->base
  |  mov RD, L:RB->top
  |  sub RD, RA
  |  shr RD, 3
  |  add RD, 1                // RD = nresults+1
  |  sub RA, BASE            // RA = resultofs
  |  mov PC, [BASE-4]
  |  mov MULTRES, RD
  |  test PC, FRAME_TYPE
  |  jz ->BC_RET_Z
  |  jmp ->vm_return
  |
  |->vm_pcall:                // Setup protected C frame and enter VM.
  |  // (lua_State *L, TValue *base, int nres1, ptrdiff_t ef)
  |  saveregs
  |  mov PC, FRAME_CP
  |.if X64
  |  mov SAVE_ERRF, CARG4d
  |.endif
  |  jmp >1
  |
  |->vm_call:                // Setup C frame and enter VM.
  |  // (lua_State *L, TValue *base, int nres1)
  |  saveregs
  |  mov PC, FRAME_C
  |
  |1:  // Entry point for vm_pcall above (PC = ftype).
  |.if X64
  |  mov SAVE_NRES, CARG3d
  |  mov L:RB, CARG1d            // Caveat: CARG1d may be RA.
  |  mov SAVE_L, CARG1d
  |  mov RA, CARG2d
  |.else
  |  mov L:RB, SAVE_L
  |  mov RA, INARG_BASE            // Caveat: overlaps SAVE_CFRAME!
  |.endif
  |
  |  mov KBASEa, L:RB->cframe        // Add our C frame to cframe chain.
  |  mov SAVE_CFRAME, KBASEa
  |  mov SAVE_PC, L:RB            // Any value outside of bytecode is ok.
  |.if X64
  |  mov L:RB->cframe, rsp
  |.else
  |  mov L:RB->cframe, esp
  |.endif
  |
  |2:  // Entry point for vm_cpcall below (RA = base, RB = L, PC = ftype).
  |  mov DISPATCH, L:RB->glref        // Setup pointer to dispatch table.
  |  add DISPATCH, GG_G2DISP
  |
  |3:  // Entry point for vm_resume above (RA = base, RB = L, PC = ftype).
  |  set_vmstate INTERP
  |  mov BASE, L:RB->base        // BASE = old base (used in vmeta_call).
  |  add PC, RA
  |  sub PC, BASE            // PC = frame delta + frame type
  |
  |  mov RD, L:RB->top
  |  sub RD, RA
  |  shr NARGS:RD, 3
  |  add NARGS:RD, 1            // RD = nargs+1
  |
  |->vm_call_dispatch:
  |  mov LFUNC:RB, [RA-8]
  |  cmp dword [RA-4], LJ_TFUNC
  |  jne ->vmeta_call            // Ensure KBASE defined and != BASE.
  |
  |->vm_call_dispatch_f:
  |  mov BASE, RA
  |  ins_call
  |  // BASE = new base, RB = func, RD = nargs+1, PC = caller PC
  |
  |->vm_cpcall:                // Setup protected C frame, call C.
  |  // (lua_State *L, lua_CFunction func, void *ud, lua_CPFunction cp)
  |  saveregs
  |.if X64
  |  mov L:RB, CARG1d            // Caveat: CARG1d may be RA.
  |  mov SAVE_L, CARG1d
  |.else
  |  mov L:RB, SAVE_L
  |  // Caveat: INARG_CP_* and SAVE_CFRAME/SAVE_NRES/SAVE_ERRF overlap!
  |  mov RC, INARG_CP_UD        // Get args before they are overwritten.
  |  mov RA, INARG_CP_FUNC
  |  mov BASE, INARG_CP_CALL
  |.endif
  |  mov SAVE_PC, L:RB            // Any value outside of bytecode is ok.
  |
  |  mov KBASE, L:RB->stack        // Compute -savestack(L, L->top).
  |  sub KBASE, L:RB->top
  |  mov SAVE_ERRF, 0            // No error function.
  |  mov SAVE_NRES, KBASE        // Neg. delta means cframe w/o frame.
  |  // Handler may change cframe_nres(L->cframe) or cframe_errfunc(L->cframe).
  |
  |.if X64
  |  mov KBASEa, L:RB->cframe        // Add our C frame to cframe chain.
  |  mov SAVE_CFRAME, KBASEa
  |  mov L:RB->cframe, rsp
  |
  |  call CARG4            // (lua_State *L, lua_CFunction func, void *ud)
  |.else
  |  mov ARG3, RC            // Have to copy args downwards.
  |  mov ARG2, RA
  |  mov ARG1, L:RB
  |
  |  mov KBASE, L:RB->cframe        // Add our C frame to cframe chain.
  |  mov SAVE_CFRAME, KBASE
  |  mov L:RB->cframe, esp
  |
  |  call BASE            // (lua_State *L, lua_CFunction func, void *ud)
  |.endif
  |  // TValue * (new base) or NULL returned in eax (RC).
  |  test RC, RC
  |  jz ->vm_leave_cp            // No base? Just remove C frame.
  |  mov RA, RC
  |  mov PC, FRAME_CP
  |  jmp <2                // Else continue with the call.
  |
  |//-----------------------------------------------------------------------
  |//-- Metamethod handling ------------------------------------------------
  |//-----------------------------------------------------------------------
  |
  |//-- Continuation dispatch ----------------------------------------------
  |
  |->cont_dispatch:
  |  // BASE = meta base, RA = resultofs, RD = nresults+1 (also in MULTRES)
  |  add RA, BASE
  |  and PC, -8
  |  mov RB, BASE
  |  sub BASE, PC            // Restore caller BASE.
  |  mov dword [RA+RD*8-4], LJ_TNIL    // Ensure one valid arg.
  |  mov RC, RA                // ... in [RC]
  |  mov PC, [RB-12]            // Restore PC from [cont|PC].
  |.if X64
  |  movsxd RAa, dword [RB-16]        // May be negative on WIN64 with debug.
  |.if FFI
  |  cmp RA, 1
  |  jbe >1
  |.endif
  |  lea KBASEa, qword [=>0]
  |  add RAa, KBASEa
  |.else
  |  mov RA, dword [RB-16]
  |.if FFI
  |  cmp RA, 1
  |  jbe >1
  |.endif
  |.endif
  |  mov LFUNC:KBASE, [BASE-8]
  |  mov KBASE, LFUNC:KBASE->pc
  |  mov KBASE, [KBASE+PC2PROTO(k)]
  |  // BASE = base, RC = result, RB = meta base
  |  jmp RAa                // Jump to continuation.
  |
  |.if FFI
  |1:
  |  je ->cont_ffi_callback        // cont = 1: return from FFI callback.
  |  // cont = 0: Tail call from C function.
  |  sub RB, BASE
  |  shr RB, 3
  |  lea RD, [RB-1]
  |  jmp ->vm_call_tail
  |.endif
  |
  |->cont_cat:                // BASE = base, RC = result, RB = mbase
  |  movzx RA, PC_RB
  |  sub RB, 16
  |  lea RA, [BASE+RA*8]
  |  sub RA, RB
  |  je ->cont_ra
  |  neg RA
  |  shr RA, 3
  |.if X64WIN
  |  mov CARG3d, RA
  |  mov L:CARG1d, SAVE_L
  |  mov L:CARG1d->base, BASE
  |  mov RCa, [RC]
  |  mov [RB], RCa
  |  mov CARG2d, RB
  |.elif X64
  |  mov L:CARG1d, SAVE_L
  |  mov L:CARG1d->base, BASE
  |  mov CARG3d, RA
  |  mov RAa, [RC]
  |  mov [RB], RAa
  |  mov CARG2d, RB
  |.else
  |  mov ARG3, RA
  |  mov RA, [RC+4]
  |  mov RC, [RC]
  |  mov [RB+4], RA
  |  mov [RB], RC
  |  mov ARG2, RB
  |.endif
  |  jmp ->BC_CAT_Z
  |
  |//-- Table indexing metamethods -----------------------------------------
  |
  |->vmeta_tgets:
  |  mov TMP1, RC            // RC = GCstr *
  |  mov TMP2, LJ_TSTR
  |  lea RCa, TMP1            // Store temp. TValue in TMP1/TMP2.
  |  cmp PC_OP, BC_GGET
  |  jne >1
  |  lea RA, [DISPATCH+DISPATCH_GL(tmptv)]  // Store fn->l.env in g->tmptv.
  |  mov [RA], TAB:RB            // RB = GCtab *
  |  mov dword [RA+4], LJ_TTAB
  |  mov RB, RA
  |  jmp >2
  |
  |->vmeta_tgetb:
  |  movzx RC, PC_RC
  |.if DUALNUM
  |  mov TMP2, LJ_TISNUM
  |  mov TMP1, RC
  |.elif SSE
  |  cvtsi2sd xmm0, RC
  |  movsd TMPQ, xmm0
  |.else
  |  mov ARG4, RC
  |  fild ARG4
  |  fstp TMPQ
  |.endif
  |  lea RCa, TMPQ            // Store temp. TValue in TMPQ.
  |  jmp >1
  |
  |->vmeta_tgetv:
  |  movzx RC, PC_RC            // Reload TValue *k from RC.
  |  lea RC, [BASE+RC*8]
  |1:
  |  movzx RB, PC_RB            // Reload TValue *t from RB.
  |  lea RB, [BASE+RB*8]
  |2:
  |.if X64
  |  mov L:CARG1d, SAVE_L
  |  mov L:CARG1d->base, BASE        // Caveat: CARG2d/CARG3d may be BASE.
  |  mov CARG2d, RB
  |  mov CARG3, RCa            // May be 64 bit ptr to stack.
  |  mov L:RB, L:CARG1d
  |.else
  |  mov ARG2, RB
  |  mov L:RB, SAVE_L
  |  mov ARG3, RC
  |  mov ARG1, L:RB
  |  mov L:RB->base, BASE
  |.endif
  |  mov SAVE_PC, PC
  |  call extern lj_meta_tget        // (lua_State *L, TValue *o, TValue *k)
  |  // TValue * (finished) or NULL (metamethod) returned in eax (RC).
  |  mov BASE, L:RB->base
  |  test RC, RC
  |  jz >3
  |->cont_ra:                // BASE = base, RC = result
  |  movzx RA, PC_RA
  |.if X64
  |  mov RBa, [RC]
  |  mov [BASE+RA*8], RBa
  |.else
  |  mov RB, [RC+4]
  |  mov RC, [RC]
  |  mov [BASE+RA*8+4], RB
  |  mov [BASE+RA*8], RC
  |.endif
  |  ins_next
  |
  |3:  // Call __index metamethod.
  |  // BASE = base, L->top = new base, stack = cont/func/t/k
  |  mov RA, L:RB->top
  |  mov [RA-12], PC            // [cont|PC]
  |  lea PC, [RA+FRAME_CONT]
  |  sub PC, BASE
  |  mov LFUNC:RB, [RA-8]        // Guaranteed to be a function here.
  |  mov NARGS:RD, 2+1            // 2 args for func(t, k).
  |  jmp ->vm_call_dispatch_f
  |
  |//-----------------------------------------------------------------------
  |
  |->vmeta_tsets:
  |  mov TMP1, RC            // RC = GCstr *
  |  mov TMP2, LJ_TSTR
  |  lea RCa, TMP1            // Store temp. TValue in TMP1/TMP2.
  |  cmp PC_OP, BC_GSET
  |  jne >1
  |  lea RA, [DISPATCH+DISPATCH_GL(tmptv)]  // Store fn->l.env in g->tmptv.
  |  mov [RA], TAB:RB            // RB = GCtab *
  |  mov dword [RA+4], LJ_TTAB
  |  mov RB, RA
  |  jmp >2
  |
  |->vmeta_tsetb:
  |  movzx RC, PC_RC
  |.if DUALNUM
  |  mov TMP2, LJ_TISNUM
  |  mov TMP1, RC
  |.elif SSE
  |  cvtsi2sd xmm0, RC
  |  movsd TMPQ, xmm0
  |.else
  |  mov ARG4, RC
  |  fild ARG4
  |  fstp TMPQ
  |.endif
  |  lea RCa, TMPQ            // Store temp. TValue in TMPQ.
  |  jmp >1
  |
  |->vmeta_tsetv:
  |  movzx RC, PC_RC            // Reload TValue *k from RC.
  |  lea RC, [BASE+RC*8]
  |1:
  |  movzx RB, PC_RB            // Reload TValue *t from RB.
  |  lea RB, [BASE+RB*8]
  |2:
  |.if X64
  |  mov L:CARG1d, SAVE_L
  |  mov L:CARG1d->base, BASE        // Caveat: CARG2d/CARG3d may be BASE.
  |  mov CARG2d, RB
  |  mov CARG3, RCa            // May be 64 bit ptr to stack.
  |  mov L:RB, L:CARG1d
  |.else
  |  mov ARG2, RB
  |  mov L:RB, SAVE_L
  |  mov ARG3, RC
  |  mov ARG1, L:RB
  |  mov L:RB->base, BASE
  |.endif
  |  mov SAVE_PC, PC
  |  call extern lj_meta_tset        // (lua_State *L, TValue *o, TValue *k)
  |  // TValue * (finished) or NULL (metamethod) returned in eax (RC).
  |  mov BASE, L:RB->base
  |  test RC, RC
  |  jz >3
  |  // NOBARRIER: lj_meta_tset ensures the table is not black.
  |  movzx RA, PC_RA
  |.if X64
  |  mov RBa, [BASE+RA*8]
  |  mov [RC], RBa
  |.else
  |  mov RB, [BASE+RA*8+4]
  |  mov RA, [BASE+RA*8]
  |  mov [RC+4], RB
  |  mov [RC], RA
  |.endif
  |->cont_nop:                // BASE = base, (RC = result)
  |  ins_next
  |
  |3:  // Call __newindex metamethod.
  |  // BASE = base, L->top = new base, stack = cont/func/t/k/(v)
  |  mov RA, L:RB->top
  |  mov [RA-12], PC            // [cont|PC]
  |  movzx RC, PC_RA
  |  // Copy value to third argument.
  |.if X64
  |  mov RBa, [BASE+RC*8]
  |  mov [RA+16], RBa
  |.else
  |  mov RB, [BASE+RC*8+4]
  |  mov RC, [BASE+RC*8]
  |  mov [RA+20], RB
  |  mov [RA+16], RC
  |.endif
  |  lea PC, [RA+FRAME_CONT]
  |  sub PC, BASE
  |  mov LFUNC:RB, [RA-8]        // Guaranteed to be a function here.
  |  mov NARGS:RD, 3+1            // 3 args for func(t, k, v).
  |  jmp ->vm_call_dispatch_f
  |
  |//-- Comparison metamethods ---------------------------------------------
  |
  |->vmeta_comp:
  |.if X64
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, BASE        // Caveat: CARG2d/CARG3d == BASE.
  |.if X64WIN
  |  lea CARG3d, [BASE+RD*8]
  |  lea CARG2d, [BASE+RA*8]
  |.else
  |  lea CARG2d, [BASE+RA*8]
  |  lea CARG3d, [BASE+RD*8]
  |.endif
  |  mov CARG1d, L:RB            // Caveat: CARG1d/CARG4d == RA.
  |  movzx CARG4d, PC_OP
  |.else
  |  movzx RB, PC_OP
  |  lea RD, [BASE+RD*8]
  |  lea RA, [BASE+RA*8]
  |  mov ARG4, RB
  |  mov L:RB, SAVE_L
  |  mov ARG3, RD
  |  mov ARG2, RA
  |  mov ARG1, L:RB
  |  mov L:RB->base, BASE
  |.endif
  |  mov SAVE_PC, PC
  |  call extern lj_meta_comp    // (lua_State *L, TValue *o1, *o2, int op)
  |  // 0/1 or TValue * (metamethod) returned in eax (RC).
  |3:
  |  mov BASE, L:RB->base
  |  cmp RC, 1
  |  ja ->vmeta_binop
  |4:
  |  lea PC, [PC+4]
  |  jb >6
  |5:
  |  movzx RD, PC_RD
  |  branchPC RD
  |6:
  |  ins_next
  |
  |->cont_condt:            // BASE = base, RC = result
  |  add PC, 4
  |  cmp dword [RC+4], LJ_TISTRUECOND    // Branch if result is true.
  |  jb <5
  |  jmp <6
  |
  |->cont_condf:            // BASE = base, RC = result
  |  cmp dword [RC+4], LJ_TISTRUECOND    // Branch if result is false.
  |  jmp <4
  |
  |->vmeta_equal:
  |  sub PC, 4
  |.if X64WIN
  |  mov CARG3d, RD
  |  mov CARG4d, RB
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, BASE        // Caveat: CARG2d == BASE.
  |  mov CARG2d, RA
  |  mov CARG1d, L:RB            // Caveat: CARG1d == RA.
  |.elif X64
  |  mov CARG2d, RA
  |  mov CARG4d, RB            // Caveat: CARG4d == RA.
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, BASE        // Caveat: CARG3d == BASE.
  |  mov CARG3d, RD
  |  mov CARG1d, L:RB
  |.else
  |  mov ARG4, RB
  |  mov L:RB, SAVE_L
  |  mov ARG3, RD
  |  mov ARG2, RA
  |  mov ARG1, L:RB
  |  mov L:RB->base, BASE
  |.endif
  |  mov SAVE_PC, PC
  |  call extern lj_meta_equal    // (lua_State *L, GCobj *o1, *o2, int ne)
  |  // 0/1 or TValue * (metamethod) returned in eax (RC).
  |  jmp <3
  |
  |->vmeta_equal_cd:
  |.if FFI
  |  sub PC, 4
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, BASE
  |  mov FCARG1, L:RB
  |  mov FCARG2, dword [PC-4]
  |  mov SAVE_PC, PC
  |  call extern lj_meta_equal_cd@8    // (lua_State *L, BCIns ins)
  |  // 0/1 or TValue * (metamethod) returned in eax (RC).
  |  jmp <3
  |.endif
  |
  |//-- Arithmetic metamethods ---------------------------------------------
  |
  |->vmeta_arith_vno:
  |.if DUALNUM
  |  movzx RB, PC_RB
  |.endif
  |->vmeta_arith_vn:
  |  lea RC, [KBASE+RC*8]
  |  jmp >1
  |
  |->vmeta_arith_nvo:
  |.if DUALNUM
  |  movzx RC, PC_RC
  |.endif
  |->vmeta_arith_nv:
  |  lea RC, [KBASE+RC*8]
  |  lea RB, [BASE+RB*8]
  |  xchg RB, RC
  |  jmp >2
  |
  |->vmeta_unm:
  |  lea RC, [BASE+RD*8]
  |  mov RB, RC
  |  jmp >2
  |
  |->vmeta_arith_vvo:
  |.if DUALNUM
  |  movzx RB, PC_RB
  |.endif
  |->vmeta_arith_vv:
  |  lea RC, [BASE+RC*8]
  |1:
  |  lea RB, [BASE+RB*8]
  |2:
  |  lea RA, [BASE+RA*8]
  |.if X64WIN
  |  mov CARG3d, RB
  |  mov CARG4d, RC
  |  movzx RC, PC_OP
  |  mov ARG5d, RC
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, BASE        // Caveat: CARG2d == BASE.
  |  mov CARG2d, RA
  |  mov CARG1d, L:RB            // Caveat: CARG1d == RA.
  |.elif X64
  |  movzx CARG5d, PC_OP
  |  mov CARG2d, RA
  |  mov CARG4d, RC            // Caveat: CARG4d == RA.
  |  mov L:CARG1d, SAVE_L
  |  mov L:CARG1d->base, BASE        // Caveat: CARG3d == BASE.
  |  mov CARG3d, RB
  |  mov L:RB, L:CARG1d
  |.else
  |  mov ARG3, RB
  |  mov L:RB, SAVE_L
  |  mov ARG4, RC
  |  movzx RC, PC_OP
  |  mov ARG2, RA
  |  mov ARG5, RC
  |  mov ARG1, L:RB
  |  mov L:RB->base, BASE
  |.endif
  |  mov SAVE_PC, PC
  |  call extern lj_meta_arith    // (lua_State *L, TValue *ra,*rb,*rc, BCReg op)
  |  // NULL (finished) or TValue * (metamethod) returned in eax (RC).
  |  mov BASE, L:RB->base
  |  test RC, RC
  |  jz ->cont_nop
  |
  |  // Call metamethod for binary op.
  |->vmeta_binop:
  |  // BASE = base, RC = new base, stack = cont/func/o1/o2
  |  mov RA, RC
  |  sub RC, BASE
  |  mov [RA-12], PC            // [cont|PC]
  |  lea PC, [RC+FRAME_CONT]
  |  mov NARGS:RD, 2+1            // 2 args for func(o1, o2).
  |  jmp ->vm_call_dispatch
  |
  |->vmeta_len:
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, BASE
  |  lea FCARG2, [BASE+RD*8]        // Caveat: FCARG2 == BASE
  |  mov L:FCARG1, L:RB
  |  mov SAVE_PC, PC
  |  call extern lj_meta_len@8        // (lua_State *L, TValue *o)
  |  // NULL (retry) or TValue * (metamethod) returned in eax (RC).
  |  mov BASE, L:RB->base
#if LJ_52
  |  test RC, RC
  |  jne ->vmeta_binop            // Binop call for compatibility.
  |  movzx RD, PC_RD
  |  mov TAB:FCARG1, [BASE+RD*8]
  |  jmp ->BC_LEN_Z
#else
  |  jmp ->vmeta_binop            // Binop call for compatibility.
#endif
  |
  |//-- Call metamethod ----------------------------------------------------
  |
  |->vmeta_call_ra:
  |  lea RA, [BASE+RA*8+8]
  |->vmeta_call:            // Resolve and call __call metamethod.
  |  // BASE = old base, RA = new base, RC = nargs+1, PC = return
  |  mov TMP2, RA            // Save RA, RC for us.
  |  mov TMP1, NARGS:RD
  |  sub RA, 8
  |.if X64
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, BASE        // Caveat: CARG2d/CARG3d may be BASE.
  |  mov CARG2d, RA
  |  lea CARG3d, [RA+NARGS:RD*8]
  |  mov CARG1d, L:RB            // Caveat: CARG1d may be RA.
  |.else
  |  lea RC, [RA+NARGS:RD*8]
  |  mov L:RB, SAVE_L
  |  mov ARG2, RA
  |  mov ARG3, RC
  |  mov ARG1, L:RB
  |  mov L:RB->base, BASE        // This is the callers base!
  |.endif
  |  mov SAVE_PC, PC
  |  call extern lj_meta_call    // (lua_State *L, TValue *func, TValue *top)
  |  mov BASE, L:RB->base
  |  mov RA, TMP2
  |  mov NARGS:RD, TMP1
  |  mov LFUNC:RB, [RA-8]
  |  add NARGS:RD, 1
  |  // This is fragile. L->base must not move, KBASE must always be defined.
  |  cmp KBASE, BASE            // Continue with CALLT if flag set.
  |  je ->BC_CALLT_Z
  |  mov BASE, RA
  |  ins_call                // Otherwise call resolved metamethod.
  |
  |//-- Argument coercion for 'for' statement ------------------------------
  |
  |->vmeta_for:
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, BASE
  |  mov FCARG2, RA            // Caveat: FCARG2 == BASE
  |  mov L:FCARG1, L:RB            // Caveat: FCARG1 == RA
  |  mov SAVE_PC, PC
  |  call extern lj_meta_for@8    // (lua_State *L, TValue *base)
  |  mov BASE, L:RB->base
  |  mov RC, [PC-4]
  |  movzx RA, RCH
  |  movzx OP, RCL
  |  shr RC, 16
  |.if X64
  |  jmp aword [DISPATCH+OP*8+GG_DISP2STATIC]    // Retry FORI or JFORI.
  |.else
  |  jmp aword [DISPATCH+OP*4+GG_DISP2STATIC]    // Retry FORI or JFORI.
  |.endif
  |
  |//-----------------------------------------------------------------------
  |//-- Fast functions -----------------------------------------------------
  |//-----------------------------------------------------------------------
  |
  |.macro .ffunc, name
  |->ff_ .. name:
  |.endmacro
  |
  |.macro .ffunc_1, name
  |->ff_ .. name:
  |  cmp NARGS:RD, 1+1;  jb ->fff_fallback
  |.endmacro
  |
  |.macro .ffunc_2, name
  |->ff_ .. name:
  |  cmp NARGS:RD, 2+1;  jb ->fff_fallback
  |.endmacro
  |
  |.macro .ffunc_n, name
  |  .ffunc_1 name
  |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
  |  fld qword [BASE]
  |.endmacro
  |
  |.macro .ffunc_n, name, op
  |  .ffunc_1 name
  |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
  |  op
  |  fld qword [BASE]
  |.endmacro
  |
  |.macro .ffunc_nsse, name, op
  |  .ffunc_1 name
  |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
  |  op xmm0, qword [BASE]
  |.endmacro
  |
  |.macro .ffunc_nsse, name
  |  .ffunc_nsse name, movsd
  |.endmacro
  |
  |.macro .ffunc_nn, name
  |  .ffunc_2 name
  |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
  |  cmp dword [BASE+12], LJ_TISNUM;  jae ->fff_fallback
  |  fld qword [BASE]
  |  fld qword [BASE+8]
  |.endmacro
  |
  |.macro .ffunc_nnsse, name
  |  .ffunc_2 name
  |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
  |  cmp dword [BASE+12], LJ_TISNUM;  jae ->fff_fallback
  |  movsd xmm0, qword [BASE]
  |  movsd xmm1, qword [BASE+8]
  |.endmacro
  |
  |.macro .ffunc_nnr, name
  |  .ffunc_2 name
  |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
  |  cmp dword [BASE+12], LJ_TISNUM;  jae ->fff_fallback
  |  fld qword [BASE+8]
  |  fld qword [BASE]
  |.endmacro
  |
  |// Inlined GC threshold check. Caveat: uses label 1.
  |.macro ffgccheck
  |  mov RB, [DISPATCH+DISPATCH_GL(gc.total)]
  |  cmp RB, [DISPATCH+DISPATCH_GL(gc.threshold)]
  |  jb >1
  |  call ->fff_gcstep
  |1:
  |.endmacro
  |
  |//-- Base library: checks -----------------------------------------------
  |
  |.ffunc_1 assert
  |  mov RB, [BASE+4]
  |  cmp RB, LJ_TISTRUECOND;  jae ->fff_fallback
  |  mov PC, [BASE-4]
  |  mov MULTRES, RD
  |  mov [BASE-4], RB
  |  mov RB, [BASE]
  |  mov [BASE-8], RB
  |  sub RD, 2
  |  jz >2
  |  mov RA, BASE
  |1:
  |  add RA, 8
  |.if X64
  |  mov RBa, [RA]
  |  mov [RA-8], RBa
  |.else
  |  mov RB, [RA+4]
  |  mov [RA-4], RB
  |  mov RB, [RA]
  |  mov [RA-8], RB
  |.endif
  |  sub RD, 1
  |  jnz <1
  |2:
  |  mov RD, MULTRES
  |  jmp ->fff_res_
  |
  |.ffunc_1 type
  |  mov RB, [BASE+4]
  |.if X64
  |  mov RA, RB
  |  sar RA, 15
  |  cmp RA, -2
  |  je >3
  |.endif
  |  mov RC, ~LJ_TNUMX
  |  not RB
  |  cmp RC, RB
  |  cmova RC, RB
  |2:
  |  mov CFUNC:RB, [BASE-8]
  |  mov STR:RC, [CFUNC:RB+RC*8+((char *)(&((GCfuncC *)0)->upvalue))]
  |  mov PC, [BASE-4]
  |  mov dword [BASE-4], LJ_TSTR
  |  mov [BASE-8], STR:RC
  |  jmp ->fff_res1
  |.if X64
  |3:
  |  mov RC, ~LJ_TLIGHTUD
  |  jmp <2
  |.endif
  |
  |//-- Base library: getters and setters ---------------------------------
  |
  |.ffunc_1 getmetatable
  |  mov RB, [BASE+4]
  |  mov PC, [BASE-4]
  |  cmp RB, LJ_TTAB;  jne >6
  |1:  // Field metatable must be at same offset for GCtab and GCudata!
  |  mov TAB:RB, [BASE]
  |  mov TAB:RB, TAB:RB->metatable
  |2:
  |  test TAB:RB, TAB:RB
  |  mov dword [BASE-4], LJ_TNIL
  |  jz ->fff_res1
  |  mov STR:RC, [DISPATCH+DISPATCH_GL(gcroot)+4*(GCROOT_MMNAME+MM_metatable)]
  |  mov dword [BASE-4], LJ_TTAB    // Store metatable as default result.
  |  mov [BASE-8], TAB:RB
  |  mov RA, TAB:RB->hmask
  |  and RA, STR:RC->hash
  |  imul RA, #NODE
  |  add NODE:RA, TAB:RB->node
  |3:  // Rearranged logic, because we expect _not_ to find the key.
  |  cmp dword NODE:RA->key.it, LJ_TSTR
  |  jne >4
  |  cmp dword NODE:RA->key.gcr, STR:RC
  |  je >5
  |4:
  |  mov NODE:RA, NODE:RA->next
  |  test NODE:RA, NODE:RA
  |  jnz <3
  |  jmp ->fff_res1            // Not found, keep default result.
  |5:
  |  mov RB, [RA+4]
  |  cmp RB, LJ_TNIL;  je ->fff_res1    // Ditto for nil value.
  |  mov RC, [RA]
  |  mov [BASE-4], RB            // Return value of mt.__metatable.
  |  mov [BASE-8], RC
  |  jmp ->fff_res1
  |
  |6:
  |  cmp RB, LJ_TUDATA;  je <1
  |.if X64
  |  cmp RB, LJ_TNUMX;  ja >8
  |  cmp RB, LJ_TISNUM;  jbe >7
  |  mov RB, LJ_TLIGHTUD
  |  jmp >8
  |7:
  |.else
  |  cmp RB, LJ_TISNUM;  ja >8
  |.endif
  |  mov RB, LJ_TNUMX
  |8:
  |  not RB
  |  mov TAB:RB, [DISPATCH+RB*4+DISPATCH_GL(gcroot[GCROOT_BASEMT])]
  |  jmp <2
  |
  |.ffunc_2 setmetatable
  |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
  |  // Fast path: no mt for table yet and not clearing the mt.
  |  mov TAB:RB, [BASE]
  |  cmp dword TAB:RB->metatable, 0;  jne ->fff_fallback
  |  cmp dword [BASE+12], LJ_TTAB;  jne ->fff_fallback
  |  mov TAB:RC, [BASE+8]
  |  mov TAB:RB->metatable, TAB:RC
  |  mov PC, [BASE-4]
  |  mov dword [BASE-4], LJ_TTAB        // Return original table.
  |  mov [BASE-8], TAB:RB
  |  test byte TAB:RB->marked, LJ_GC_BLACK    // isblack(table)
  |  jz >1
  |  // Possible write barrier. Table is black, but skip iswhite(mt) check.
  |  barrierback TAB:RB, RC
  |1:
  |  jmp ->fff_res1
  |
  |.ffunc_2 rawget
  |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
  |.if X64WIN
  |  mov RB, BASE            // Save BASE.
  |  lea CARG3d, [BASE+8]
  |  mov CARG2d, [BASE]            // Caveat: CARG2d == BASE.
  |  mov CARG1d, SAVE_L
  |.elif X64
  |  mov RB, BASE            // Save BASE.
  |  mov CARG2d, [BASE]
  |  lea CARG3d, [BASE+8]        // Caveat: CARG3d == BASE.
  |  mov CARG1d, SAVE_L
  |.else
  |  mov TAB:RD, [BASE]
  |  mov L:RB, SAVE_L
  |  mov ARG2, TAB:RD
  |  mov ARG1, L:RB
  |  mov RB, BASE            // Save BASE.
  |  add BASE, 8
  |  mov ARG3, BASE
  |.endif
  |  call extern lj_tab_get    // (lua_State *L, GCtab *t, cTValue *key)
  |  // cTValue * returned in eax (RD).
  |  mov BASE, RB            // Restore BASE.
  |  // Copy table slot.
  |.if X64
  |  mov RBa, [RD]
  |  mov PC, [BASE-4]
  |  mov [BASE-8], RBa
  |.else
  |  mov RB, [RD]
  |  mov RD, [RD+4]
  |  mov PC, [BASE-4]
  |  mov [BASE-8], RB
  |  mov [BASE-4], RD
  |.endif
  |  jmp ->fff_res1
  |
  |//-- Base library: conversions ------------------------------------------
  |
  |.ffunc tonumber
  |  // Only handles the number case inline (without a base argument).
  |  cmp NARGS:RD, 1+1;  jne ->fff_fallback    // Exactly one argument.
  |  cmp dword [BASE+4], LJ_TISNUM
  |.if DUALNUM
  |  jne >1
  |  mov RB, dword [BASE]; jmp ->fff_resi
  |1:
  |  ja ->fff_fallback
  |.else
  |  jae ->fff_fallback
  |.endif
  |.if SSE
  |  movsd xmm0, qword [BASE]; jmp ->fff_resxmm0
  |.else
  |  fld qword [BASE]; jmp ->fff_resn
  |.endif
  |
  |.ffunc_1 tostring
  |  // Only handles the string or number case inline.
  |  mov PC, [BASE-4]
  |  cmp dword [BASE+4], LJ_TSTR;  jne >3
  |  // A __tostring method in the string base metatable is ignored.
  |  mov STR:RD, [BASE]
  |2:
  |  mov dword [BASE-4], LJ_TSTR
  |  mov [BASE-8], STR:RD
  |  jmp ->fff_res1
  |3:  // Handle numbers inline, unless a number base metatable is present.
  |  cmp dword [BASE+4], LJ_TISNUM;  ja ->fff_fallback
  |  cmp dword [DISPATCH+DISPATCH_GL(gcroot[GCROOT_BASEMT_NUM])], 0
  |  jne ->fff_fallback
  |  ffgccheck                // Caveat: uses label 1.
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, BASE        // Add frame since C call can throw.
  |  mov SAVE_PC, PC            // Redundant (but a defined value).
  |.if X64 and not X64WIN
  |  mov FCARG2, BASE            // Otherwise: FCARG2 == BASE
  |.endif
  |  mov L:FCARG1, L:RB
  |.if DUALNUM
  |  call extern lj_str_fromnumber@8    // (lua_State *L, cTValue *o)
  |.else
  |  call extern lj_str_fromnum@8    // (lua_State *L, lua_Number *np)
  |.endif
  |  // GCstr returned in eax (RD).
  |  mov BASE, L:RB->base
  |  jmp <2
  |
  |//-- Base library: iterators -------------------------------------------
  |
  |.ffunc_1 next
  |  je >2                // Missing 2nd arg?
  |1:
  |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, BASE        // Add frame since C call can throw.
  |  mov L:RB->top, BASE        // Dummy frame length is ok.
  |  mov PC, [BASE-4]
  |.if X64WIN
  |  lea CARG3d, [BASE+8]
  |  mov CARG2d, [BASE]            // Caveat: CARG2d == BASE.
  |  mov CARG1d, L:RB
  |.elif X64
  |  mov CARG2d, [BASE]
  |  lea CARG3d, [BASE+8]        // Caveat: CARG3d == BASE.
  |  mov CARG1d, L:RB
  |.else
  |  mov TAB:RD, [BASE]
  |  mov ARG2, TAB:RD
  |  mov ARG1, L:RB
  |  add BASE, 8
  |  mov ARG3, BASE
  |.endif
  |  mov SAVE_PC, PC            // Needed for ITERN fallback.
  |  call extern lj_tab_next    // (lua_State *L, GCtab *t, TValue *key)
  |  // Flag returned in eax (RD).
  |  mov BASE, L:RB->base
  |  test RD, RD;  jz >3        // End of traversal?
  |  // Copy key and value to results.
  |.if X64
  |  mov RBa, [BASE+8]
  |  mov RDa, [BASE+16]
  |  mov [BASE-8], RBa
  |  mov [BASE], RDa
  |.else
  |  mov RB, [BASE+8]
  |  mov RD, [BASE+12]
  |  mov [BASE-8], RB
  |  mov [BASE-4], RD
  |  mov RB, [BASE+16]
  |  mov RD, [BASE+20]
  |  mov [BASE], RB
  |  mov [BASE+4], RD
  |.endif
  |->fff_res2:
  |  mov RD, 1+2
  |  jmp ->fff_res
  |2:  // Set missing 2nd arg to nil.
  |  mov dword [BASE+12], LJ_TNIL
  |  jmp <1
  |3:  // End of traversal: return nil.
  |  mov dword [BASE-4], LJ_TNIL
  |  jmp ->fff_res1
  |
  |.ffunc_1 pairs
  |  mov TAB:RB, [BASE]
  |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
#if LJ_52
  |  cmp dword TAB:RB->metatable, 0; jne ->fff_fallback
#endif
  |  mov CFUNC:RB, [BASE-8]
  |  mov CFUNC:RD, CFUNC:RB->upvalue[0]
  |  mov PC, [BASE-4]
  |  mov dword [BASE-4], LJ_TFUNC
  |  mov [BASE-8], CFUNC:RD
  |  mov dword [BASE+12], LJ_TNIL
  |  mov RD, 1+3
  |  jmp ->fff_res
  |
  |.ffunc_1 ipairs_aux
  |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
  |  cmp dword [BASE+12], LJ_TISNUM
  |.if DUALNUM
  |  jne ->fff_fallback
  |.else
  |  jae ->fff_fallback
  |.endif
  |  mov PC, [BASE-4]
  |.if DUALNUM
  |  mov RD, dword [BASE+8]
  |  add RD, 1
  |  mov dword [BASE-4], LJ_TISNUM
  |  mov dword [BASE-8], RD
  |.elif SSE
  |  movsd xmm0, qword [BASE+8]
  |  sseconst_1 xmm1, RBa
  |  addsd xmm0, xmm1
  |  cvtsd2si RD, xmm0
  |  movsd qword [BASE-8], xmm0
  |.else
  |  fld qword [BASE+8]
  |  fld1
  |  faddp st1
  |  fist ARG1
  |  fstp qword [BASE-8]
  |  mov RD, ARG1
  |.endif
  |  mov TAB:RB, [BASE]
  |  cmp RD, TAB:RB->asize;  jae >2    // Not in array part?
  |  shl RD, 3
  |  add RD, TAB:RB->array
  |1:
  |  cmp dword [RD+4], LJ_TNIL;  je ->fff_res0
  |  // Copy array slot.
  |.if X64
  |  mov RBa, [RD]
  |  mov [BASE], RBa
  |.else
  |  mov RB, [RD]
  |  mov RD, [RD+4]
  |  mov [BASE], RB
  |  mov [BASE+4], RD
  |.endif
  |  jmp ->fff_res2
  |2:  // Check for empty hash part first. Otherwise call C function.
  |  cmp dword TAB:RB->hmask, 0; je ->fff_res0
  |  mov FCARG1, TAB:RB
  |  mov RB, BASE            // Save BASE.
  |  mov FCARG2, RD            // Caveat: FCARG2 == BASE
  |  call extern lj_tab_getinth@8    // (GCtab *t, int32_t key)
  |  // cTValue * or NULL returned in eax (RD).
  |  mov BASE, RB
  |  test RD, RD
  |  jnz <1
  |->fff_res0:
  |  mov RD, 1+0
  |  jmp ->fff_res
  |
  |.ffunc_1 ipairs
  |  mov TAB:RB, [BASE]
  |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
#if LJ_52
  |  cmp dword TAB:RB->metatable, 0; jne ->fff_fallback
#endif
  |  mov CFUNC:RB, [BASE-8]
  |  mov CFUNC:RD, CFUNC:RB->upvalue[0]
  |  mov PC, [BASE-4]
  |  mov dword [BASE-4], LJ_TFUNC
  |  mov [BASE-8], CFUNC:RD
  |.if DUALNUM
  |  mov dword [BASE+12], LJ_TISNUM
  |  mov dword [BASE+8], 0
  |.elif SSE
  |  xorps xmm0, xmm0
  |  movsd qword [BASE+8], xmm0
  |.else
  |  fldz
  |  fstp qword [BASE+8]
  |.endif
  |  mov RD, 1+3
  |  jmp ->fff_res
  |
  |//-- Base library: catch errors ----------------------------------------
  |
  |.ffunc_1 pcall
  |  lea RA, [BASE+8]
  |  sub NARGS:RD, 1
  |  mov PC, 8+FRAME_PCALL
  |1:
  |  movzx RB, byte [DISPATCH+DISPATCH_GL(hookmask)]
  |  shr RB, HOOK_ACTIVE_SHIFT
  |  and RB, 1
  |  add PC, RB                // Remember active hook before pcall.
  |  jmp ->vm_call_dispatch
  |
  |.ffunc_2 xpcall
  |  cmp dword [BASE+12], LJ_TFUNC;  jne ->fff_fallback
  |  mov RB, [BASE+4]            // Swap function and traceback.
  |  mov [BASE+12], RB
  |  mov dword [BASE+4], LJ_TFUNC
  |  mov LFUNC:RB, [BASE]
  |  mov PC, [BASE+8]
  |  mov [BASE+8], LFUNC:RB
  |  mov [BASE], PC
  |  lea RA, [BASE+16]
  |  sub NARGS:RD, 2
  |  mov PC, 16+FRAME_PCALL
  |  jmp <1
  |
  |//-- Coroutine library --------------------------------------------------
  |
  |.macro coroutine_resume_wrap, resume
  |.if resume
  |.ffunc_1 coroutine_resume
  |  mov L:RB, [BASE]
  |.else
  |.ffunc coroutine_wrap_aux
  |  mov CFUNC:RB, [BASE-8]
  |  mov L:RB, CFUNC:RB->upvalue[0].gcr
  |.endif
  |  mov PC, [BASE-4]
  |  mov SAVE_PC, PC
  |.if X64
  |  mov TMP1, L:RB
  |.else
  |  mov ARG1, L:RB
  |.endif
  |.if resume
  |  cmp dword [BASE+4], LJ_TTHREAD;  jne ->fff_fallback
  |.endif
  |  cmp aword L:RB->cframe, 0; jne ->fff_fallback
  |  cmp byte L:RB->status, LUA_YIELD;  ja ->fff_fallback
  |  mov RA, L:RB->top
  |  je >1                // Status != LUA_YIELD (i.e. 0)?
  |  cmp RA, L:RB->base            // Check for presence of initial func.
  |  je ->fff_fallback
  |1:
  |.if resume
  |  lea PC, [RA+NARGS:RD*8-16]        // Check stack space (-1-thread).
  |.else
  |  lea PC, [RA+NARGS:RD*8-8]        // Check stack space (-1).
  |.endif
  |  cmp PC, L:RB->maxstack; ja ->fff_fallback
  |  mov L:RB->top, PC
  |
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, BASE
  |.if resume
  |  add BASE, 8            // Keep resumed thread in stack for GC.
  |.endif
  |  mov L:RB->top, BASE
  |.if resume
  |  lea RB, [BASE+NARGS:RD*8-24]    // RB = end of source for stack move.
  |.else
  |  lea RB, [BASE+NARGS:RD*8-16]    // RB = end of source for stack move.
  |.endif
  |  sub RBa, PCa            // Relative to PC.
  |
  |  cmp PC, RA
  |  je >3
  |2:  // Move args to coroutine.
  |.if X64
  |  mov RCa, [PC+RB]
  |  mov [PC-8], RCa
  |.else
  |  mov RC, [PC+RB+4]
  |  mov [PC-4], RC
  |  mov RC, [PC+RB]
  |  mov [PC-8], RC
  |.endif
  |  sub PC, 8
  |  cmp PC, RA
  |  jne <2
  |3:
  |.if X64
  |  mov CARG2d, RA
  |  mov CARG1d, TMP1
  |.else
  |  mov ARG2, RA
  |  xor RA, RA
  |  mov ARG4, RA
  |  mov ARG3, RA
  |.endif
  |  call ->vm_resume            // (lua_State *L, TValue *base, 0, 0)
  |  set_vmstate INTERP
  |
  |  mov L:RB, SAVE_L
  |.if X64
  |  mov L:PC, TMP1
  |.else
  |  mov L:PC, ARG1            // The callee doesn't modify SAVE_L.
  |.endif
  |  mov BASE, L:RB->base
  |  cmp eax, LUA_YIELD
  |  ja >8
  |4:
  |  mov RA, L:PC->base
  |  mov KBASE, L:PC->top
  |  mov L:PC->top, RA            // Clear coroutine stack.
  |  mov PC, KBASE
  |  sub PC, RA
  |  je >6                // No results?
  |  lea RD, [BASE+PC]
  |  shr PC, 3
  |  cmp RD, L:RB->maxstack
  |  ja >9                // Need to grow stack?
  |
  |  mov RB, BASE
  |  sub RBa, RAa
  |5:  // Move results from coroutine.
  |.if X64
  |  mov RDa, [RA]
  |  mov [RA+RB], RDa
  |.else
  |  mov RD, [RA]
  |  mov [RA+RB], RD
  |  mov RD, [RA+4]
  |  mov [RA+RB+4], RD
  |.endif
  |  add RA, 8
  |  cmp RA, KBASE
  |  jne <5
  |6:
  |.if resume
  |  lea RD, [PC+2]            // nresults+1 = 1 + true + results.
  |  mov dword [BASE-4], LJ_TTRUE    // Prepend true to results.
  |.else
  |  lea RD, [PC+1]            // nresults+1 = 1 + results.
  |.endif
  |7:
  |  mov PC, SAVE_PC
  |  mov MULTRES, RD
  |.if resume
  |  mov RAa, -8
  |.else
  |  xor RA, RA
  |.endif
  |  test PC, FRAME_TYPE
  |  jz ->BC_RET_Z
  |  jmp ->vm_return
  |
  |8:  // Coroutine returned with error (at co->top-1).
  |.if resume
  |  mov dword [BASE-4], LJ_TFALSE    // Prepend false to results.
  |  mov RA, L:PC->top
  |  sub RA, 8
  |  mov L:PC->top, RA            // Clear error from coroutine stack.
  |  // Copy error message.
  |.if X64
  |  mov RDa, [RA]
  |  mov [BASE], RDa
  |.else
  |  mov RD, [RA]
  |  mov [BASE], RD
  |  mov RD, [RA+4]
  |  mov [BASE+4], RD
  |.endif
  |  mov RD, 1+2            // nresults+1 = 1 + false + error.
  |  jmp <7
  |.else
  |  mov FCARG2, L:PC
  |  mov FCARG1, L:RB
  |  call extern lj_ffh_coroutine_wrap_err@8  // (lua_State *L, lua_State *co)
  |  // Error function does not return.
  |.endif
  |
  |9:  // Handle stack expansion on return from yield.
  |.if X64
  |  mov L:RA, TMP1
  |.else
  |  mov L:RA, ARG1            // The callee doesn't modify SAVE_L.
  |.endif
  |  mov L:RA->top, KBASE        // Undo coroutine stack clearing.
  |  mov FCARG2, PC
  |  mov FCARG1, L:RB
  |  call extern lj_state_growstack@8    // (lua_State *L, int n)
  |.if X64
  |  mov L:PC, TMP1
  |.else
  |  mov L:PC, ARG1
  |.endif
  |  mov BASE, L:RB->base
  |  jmp <4                // Retry the stack move.
  |.endmacro
  |
  |  coroutine_resume_wrap 1        // coroutine.resume
  |  coroutine_resume_wrap 0        // coroutine.wrap
  |
  |.ffunc coroutine_yield
  |  mov L:RB, SAVE_L
  |  test aword L:RB->cframe, CFRAME_RESUME
  |  jz ->fff_fallback
  |  mov L:RB->base, BASE
  |  lea RD, [BASE+NARGS:RD*8-8]
  |  mov L:RB->top, RD
  |  xor RD, RD
  |  mov aword L:RB->cframe, RDa
  |  mov al, LUA_YIELD
  |  mov byte L:RB->status, al
  |  jmp ->vm_leave_unw
  |
  |//-- Math library -------------------------------------------------------
  |
  |.if not DUALNUM
  |->fff_resi:  // Dummy.
  |.endif
  |
  |.if SSE
  |->fff_resn:
  |  mov PC, [BASE-4]
  |  fstp qword [BASE-8]
  |  jmp ->fff_res1
  |.endif
  |
  |  .ffunc_1 math_abs
  |.if DUALNUM
  |  cmp dword [BASE+4], LJ_TISNUM; jne >2
  |  mov RB, dword [BASE]
  |  cmp RB, 0; jns ->fff_resi
  |  neg RB; js >1
  |->fff_resbit:
  |->fff_resi:
  |  mov PC, [BASE-4]
  |  mov dword [BASE-4], LJ_TISNUM
  |  mov dword [BASE-8], RB
  |  jmp ->fff_res1
  |1:
  |  mov PC, [BASE-4]
  |  mov dword [BASE-4], 0x41e00000  // 2^31.
  |  mov dword [BASE-8], 0
  |  jmp ->fff_res1
  |2:
  |  ja ->fff_fallback
  |.else
  |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
  |.endif
  |
  |.if SSE
  |  movsd xmm0, qword [BASE]
  |  sseconst_abs xmm1, RDa
  |  andps xmm0, xmm1
  |->fff_resxmm0:
  |  mov PC, [BASE-4]
  |  movsd qword [BASE-8], xmm0
  |  // fallthrough
  |.else
  |  fld qword [BASE]
  |  fabs
  |  // fallthrough
  |->fff_resxmm0:  // Dummy.
  |->fff_resn:
  |  mov PC, [BASE-4]
  |  fstp qword [BASE-8]
  |.endif
  |
  |->fff_res1:
  |  mov RD, 1+1
  |->fff_res:
  |  mov MULTRES, RD
  |->fff_res_:
  |  test PC, FRAME_TYPE
  |  jnz >7
  |5:
  |  cmp PC_RB, RDL            // More results expected?
  |  ja >6
  |  // Adjust BASE. KBASE is assumed to be set for the calling frame.
  |  movzx RA, PC_RA
  |  not RAa                // Note: ~RA = -(RA+1)
  |  lea BASE, [BASE+RA*8]        // base = base - (RA+1)*8
  |  ins_next
  |
  |6:  // Fill up results with nil.
  |  mov dword [BASE+RD*8-12], LJ_TNIL
  |  add RD, 1
  |  jmp <5
  |
  |7:  // Non-standard return case.
  |  mov RAa, -8            // Results start at BASE+RA = BASE-8.
  |  jmp ->vm_return
  |
  |.macro math_round, func
  |  .ffunc math_ .. func
  |.if DUALNUM
  |  cmp dword [BASE+4], LJ_TISNUM; jne >1
  |  mov RB, dword [BASE]; jmp ->fff_resi
  |1:
  |  ja ->fff_fallback
  |.else
  |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
  |.endif
  |.if SSE
  |  movsd xmm0, qword [BASE]
  |  call ->vm_ .. func
  |  .if DUALNUM
  |    cvtsd2si RB, xmm0
  |    cmp RB, 0x80000000
  |    jne ->fff_resi
  |    cvtsi2sd xmm1, RB
  |    ucomisd xmm0, xmm1
  |    jp ->fff_resxmm0
  |    je ->fff_resi
  |  .endif
  |  jmp ->fff_resxmm0
  |.else
  |  fld qword [BASE]
  |  call ->vm_ .. func
  |  .if DUALNUM
  |    fist ARG1
  |    mov RB, ARG1
  |    cmp RB, 0x80000000; jne >2
  |    fdup
  |    fild ARG1
  |    fcomparepp
  |    jp ->fff_resn
  |    jne ->fff_resn
  |2:
  |    fpop
  |    jmp ->fff_resi
  | .else
  |    jmp ->fff_resn
  | .endif
  |.endif
  |.endmacro
  |
  |  math_round floor
  |  math_round ceil
  |
  |.if SSE
  |.ffunc_nsse math_sqrt, sqrtsd; jmp ->fff_resxmm0
  |.else
  |.ffunc_n math_sqrt; fsqrt; jmp ->fff_resn
  |.endif
  |
  |.ffunc math_log
  |  cmp NARGS:RD, 1+1; jne ->fff_fallback    // Exactly one argument.
  |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
  |  fldln2; fld qword [BASE]; fyl2x; jmp ->fff_resn
  |
  |.ffunc_n math_log10, fldlg2;    fyl2x;        jmp ->fff_resn
  |.ffunc_n math_exp;    call ->vm_exp_x87;    jmp ->fff_resn
  |
  |.ffunc_n math_sin;    fsin;            jmp ->fff_resn
  |.ffunc_n math_cos;    fcos;            jmp ->fff_resn
  |.ffunc_n math_tan;    fptan; fpop;        jmp ->fff_resn
  |
  |.ffunc_n math_asin
  |  fdup; fmul st0; fld1; fsubrp st1; fsqrt; fpatan
  |  jmp ->fff_resn
  |.ffunc_n math_acos
  |  fdup; fmul st0; fld1; fsubrp st1; fsqrt; fxch; fpatan
  |  jmp ->fff_resn
  |.ffunc_n math_atan;    fld1; fpatan;        jmp ->fff_resn
  |
  |.macro math_extern, func
  |.if SSE
  |  .ffunc_nsse math_ .. func
  |  .if not X64
  |    movsd FPARG1, xmm0
  |  .endif
  |.else
  |  .ffunc_n math_ .. func
  |  fstp FPARG1
  |.endif
  |  mov RB, BASE
  |  call extern lj_vm_ .. func
  |  mov BASE, RB
  |  .if X64
  |    jmp ->fff_resxmm0
  |  .else
  |    jmp ->fff_resn
  |  .endif
  |.endmacro
  |
  |  math_extern sinh
  |  math_extern cosh
  |  math_extern tanh
  |
  |->ff_math_deg:
  |.if SSE
  |.ffunc_nsse math_rad
  |  mov CFUNC:RB, [BASE-8]
  |  mulsd xmm0, qword CFUNC:RB->upvalue[0]
  |  jmp ->fff_resxmm0
  |.else
  |.ffunc_n math_rad
  |  mov CFUNC:RB, [BASE-8]
  |  fmul qword CFUNC:RB->upvalue[0]
  |  jmp ->fff_resn
  |.endif
  |
  |.ffunc_nn math_atan2;    fpatan;        jmp ->fff_resn
  |.ffunc_nnr math_ldexp;    fscale; fpop1;    jmp ->fff_resn
  |
  |.ffunc_1 math_frexp
  |  mov RB, [BASE+4]
  |  cmp RB, LJ_TISNUM;  jae ->fff_fallback
  |  mov PC, [BASE-4]
  |  mov RC, [BASE]
  |  mov [BASE-4], RB; mov [BASE-8], RC
  |  shl RB, 1; cmp RB, 0xffe00000; jae >3
  |  or RC, RB; jz >3
  |  mov RC, 1022
  |  cmp RB, 0x00200000; jb >4
  |1:
  |  shr RB, 21; sub RB, RC        // Extract and unbias exponent.
  |.if SSE
  |  cvtsi2sd xmm0, RB
  |.else
  |  mov TMP1, RB; fild TMP1
  |.endif
  |  mov RB, [BASE-4]
  |  and RB, 0x800fffff            // Mask off exponent.
  |  or RB, 0x3fe00000            // Put mantissa in range [0.5,1) or 0.
  |  mov [BASE-4], RB
  |2:
  |.if SSE
  |  movsd qword [BASE], xmm0
  |.else
  |  fstp qword [BASE]
  |.endif
  |  mov RD, 1+2
  |  jmp ->fff_res
  |3:  // Return +-0, +-Inf, NaN unmodified and an exponent of 0.
  |.if SSE
  |  xorps xmm0, xmm0; jmp <2
  |.else
  |  fldz; jmp <2
  |.endif
  |4:  // Handle denormals by multiplying with 2^54 and adjusting the bias.
  |.if SSE
  |  movsd xmm0, qword [BASE]
  |  sseconst_hi xmm1, RBa, 43500000  // 2^54.
  |  mulsd xmm0, xmm1
  |  movsd qword [BASE-8], xmm0
  |.else
  |  fld qword [BASE]
  |  mov TMP1, 0x5a800000; fmul TMP1    // x = x*2^54
  |  fstp qword [BASE-8]
  |.endif
  |  mov RB, [BASE-4]; mov RC, 1076; shl RB, 1; jmp <1
  |
  |.if SSE
  |.ffunc_nsse math_modf
  |.else
  |.ffunc_n math_modf
  |.endif
  |  mov RB, [BASE+4]
  |  mov PC, [BASE-4]
  |  shl RB, 1; cmp RB, 0xffe00000; je >4    // +-Inf?
  |.if SSE
  |  movaps xmm4, xmm0
  |  call ->vm_trunc
  |  subsd xmm4, xmm0
  |1:
  |  movsd qword [BASE-8], xmm0
  |  movsd qword [BASE], xmm4
  |.else
  |  fdup
  |  call ->vm_trunc
  |  fsub st1, st0
  |1:
  |  fstp qword [BASE-8]
  |  fstp qword [BASE]
  |.endif
  |  mov RC, [BASE-4]; mov RB, [BASE+4]
  |  xor RC, RB; js >3                // Need to adjust sign?
  |2:
  |  mov RD, 1+2
  |  jmp ->fff_res
  |3:
  |  xor RB, 0x80000000; mov [BASE+4], RB    // Flip sign of fraction.
  |  jmp <2
  |4:
  |.if SSE
  |  xorps xmm4, xmm4; jmp <1            // Return +-Inf and +-0.
  |.else
  |  fldz; fxch; jmp <1                // Return +-Inf and +-0.
  |.endif
  |
  |.ffunc_nnr math_fmod
  |1: ; fprem; fnstsw ax; sahf; jp <1
  |  fpop1
  |  jmp ->fff_resn
  |
  |.if SSE
  |.ffunc_nnsse math_pow;    call ->vm_pow;    jmp ->fff_resxmm0
  |.else
  |.ffunc_nn math_pow;        call ->vm_pow;    jmp ->fff_resn
  |.endif
  |
  |.macro math_minmax, name, cmovop, fcmovop, sseop
  |  .ffunc name
  |  mov RA, 2
  |  cmp dword [BASE+4], LJ_TISNUM
  |.if DUALNUM
  |  jne >4
  |  mov RB, dword [BASE]
  |1:  // Handle integers.
  |  cmp RA, RD; jae ->fff_resi
  |  cmp dword [BASE+RA*8-4], LJ_TISNUM; jne >3
  |  cmp RB, dword [BASE+RA*8-8]
  |  cmovop RB, dword [BASE+RA*8-8]
  |  add RA, 1
  |  jmp <1
  |3:
  |  ja ->fff_fallback
  |  // Convert intermediate result to number and continue below.
  |.if SSE
  |  cvtsi2sd xmm0, RB
  |.else
  |  mov TMP1, RB
  |  fild TMP1
  |.endif
  |  jmp >6
  |4:
  |  ja ->fff_fallback
  |.else
  |  jae ->fff_fallback
  |.endif
  |
  |.if SSE
  |  movsd xmm0, qword [BASE]
  |5:  // Handle numbers or integers.
  |  cmp RA, RD; jae ->fff_resxmm0
  |  cmp dword [BASE+RA*8-4], LJ_TISNUM
  |.if DUALNUM
  |  jb >6
  |  ja ->fff_fallback
  |  cvtsi2sd xmm1, dword [BASE+RA*8-8]
  |  jmp >7
  |.else
  |  jae ->fff_fallback
  |.endif
  |6:
  |  movsd xmm1, qword [BASE+RA*8-8]
  |7:
  |  sseop xmm0, xmm1
  |  add RA, 1
  |  jmp <5
  |.else
  |  fld qword [BASE]
  |5:  // Handle numbers or integers.
  |  cmp RA, RD; jae ->fff_resn
  |  cmp dword [BASE+RA*8-4], LJ_TISNUM
  |.if DUALNUM
  |  jb >6
  |  ja >9
  |  fild dword [BASE+RA*8-8]
  |  jmp >7
  |.else
  |  jae >9
  |.endif
  |6:
  |  fld qword [BASE+RA*8-8]
  |7:
  |  fucomi st1; fcmovop st1; fpop1
  |  add RA, 1
  |  jmp <5
  |.endif
  |.endmacro
  |
  |  math_minmax math_min, cmovg, fcmovnbe, minsd
  |  math_minmax math_max, cmovl, fcmovbe, maxsd
  |.if not SSE
  |9:
  |  fpop; jmp ->fff_fallback
  |.endif
  |
  |//-- String library -----------------------------------------------------
  |
  |.ffunc_1 string_len
  |  cmp dword [BASE+4], LJ_TSTR;  jne ->fff_fallback
  |  mov STR:RB, [BASE]
  |.if DUALNUM
  |  mov RB, dword STR:RB->len; jmp ->fff_resi
  |.elif SSE
  |  cvtsi2sd xmm0, dword STR:RB->len; jmp ->fff_resxmm0
  |.else
  |  fild dword STR:RB->len; jmp ->fff_resn
  |.endif
  |
  |.ffunc string_byte            // Only handle the 1-arg case here.
  |  cmp NARGS:RD, 1+1;  jne ->fff_fallback
  |  cmp dword [BASE+4], LJ_TSTR;  jne ->fff_fallback
  |  mov STR:RB, [BASE]
  |  mov PC, [BASE-4]
  |  cmp dword STR:RB->len, 1
  |  jb ->fff_res0            // Return no results for empty string.
  |  movzx RB, byte STR:RB[1]
  |.if DUALNUM
  |  jmp ->fff_resi
  |.elif SSE
  |  cvtsi2sd xmm0, RB; jmp ->fff_resxmm0
  |.else
  |  mov TMP1, RB; fild TMP1; jmp ->fff_resn
  |.endif
  |
  |.ffunc string_char            // Only handle the 1-arg case here.
  |  ffgccheck
  |  cmp NARGS:RD, 1+1;  jne ->fff_fallback    // *Exactly* 1 arg.
  |  cmp dword [BASE+4], LJ_TISNUM
  |.if DUALNUM
  |  jne ->fff_fallback
  |  mov RB, dword [BASE]
  |  cmp RB, 255;  ja ->fff_fallback
  |  mov TMP2, RB
  |.elif SSE
  |  jae ->fff_fallback
  |  cvttsd2si RB, qword [BASE]
  |  cmp RB, 255;  ja ->fff_fallback
  |  mov TMP2, RB
  |.else
  |  jae ->fff_fallback
  |  fld qword [BASE]
  |  fistp TMP2
  |  cmp TMP2, 255;  ja ->fff_fallback
  |.endif
  |.if X64
  |  mov TMP3, 1
  |.else
  |  mov ARG3, 1
  |.endif
  |  lea RDa, TMP2            // Points to stack. Little-endian.
  |->fff_newstr:
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, BASE
  |.if X64
  |  mov CARG3d, TMP3            // Zero-extended to size_t.
  |  mov CARG2, RDa            // May be 64 bit ptr to stack.
  |  mov CARG1d, L:RB
  |.else
  |  mov ARG2, RD
  |  mov ARG1, L:RB
  |.endif
  |  mov SAVE_PC, PC
  |  call extern lj_str_new        // (lua_State *L, char *str, size_t l)
  |  // GCstr * returned in eax (RD).
  |  mov BASE, L:RB->base
  |  mov PC, [BASE-4]
  |  mov dword [BASE-4], LJ_TSTR
  |  mov [BASE-8], STR:RD
  |  jmp ->fff_res1
  |
  |.ffunc string_sub
  |  ffgccheck
  |  mov TMP2, -1
  |  cmp NARGS:RD, 1+2;  jb ->fff_fallback
  |  jna >1
  |  cmp dword [BASE+20], LJ_TISNUM
  |.if DUALNUM
  |  jne ->fff_fallback
  |  mov RB, dword [BASE+16]
  |  mov TMP2, RB
  |.elif SSE
  |  jae ->fff_fallback
  |  cvttsd2si RB, qword [BASE+16]
  |  mov TMP2, RB
  |.else
  |  jae ->fff_fallback
  |  fld qword [BASE+16]
  |  fistp TMP2
  |.endif
  |1:
  |  cmp dword [BASE+4], LJ_TSTR;  jne ->fff_fallback
  |  cmp dword [BASE+12], LJ_TISNUM
  |.if DUALNUM
  |  jne ->fff_fallback
  |.else
  |  jae ->fff_fallback
  |.endif
  |  mov STR:RB, [BASE]
  |  mov TMP3, STR:RB
  |  mov RB, STR:RB->len
  |.if DUALNUM
  |  mov RA, dword [BASE+8]
  |.elif SSE
  |  cvttsd2si RA, qword [BASE+8]
  |.else
  |  fld qword [BASE+8]
  |  fistp ARG3
  |  mov RA, ARG3
  |.endif
  |  mov RC, TMP2
  |  cmp RB, RC                // len < end? (unsigned compare)
  |  jb >5
  |2:
  |  test RA, RA            // start <= 0?
  |  jle >7
  |3:
  |  mov STR:RB, TMP3
  |  sub RC, RA                // start > end?
  |  jl ->fff_emptystr
  |  lea RB, [STR:RB+RA+#STR-1]
  |  add RC, 1
  |4:
  |.if X64
  |  mov TMP3, RC
  |.else
  |  mov ARG3, RC
  |.endif
  |  mov RD, RB
  |  jmp ->fff_newstr
  |
  |5:  // Negative end or overflow.
  |  jl >6
  |  lea RC, [RC+RB+1]            // end = end+(len+1)
  |  jmp <2
  |6:  // Overflow.
  |  mov RC, RB                // end = len
  |  jmp <2
  |
  |7:  // Negative start or underflow.
  |  je >8
  |  add RA, RB                // start = start+(len+1)
  |  add RA, 1
  |  jg <3                // start > 0?
  |8:  // Underflow.
  |  mov RA, 1                // start = 1
  |  jmp <3
  |
  |->fff_emptystr:  // Range underflow.
  |  xor RC, RC                // Zero length. Any ptr in RB is ok.
  |  jmp <4
  |
  |.ffunc string_rep            // Only handle the 1-char case inline.
  |  ffgccheck
  |  cmp NARGS:RD, 2+1; jne ->fff_fallback    // Exactly 2 arguments.
  |  cmp dword [BASE+4], LJ_TSTR;  jne ->fff_fallback
  |  cmp dword [BASE+12], LJ_TISNUM
  |  mov STR:RB, [BASE]
  |.if DUALNUM
  |  jne ->fff_fallback
  |  mov RC, dword [BASE+8]
  |.elif SSE
  |  jae ->fff_fallback
  |  cvttsd2si RC, qword [BASE+8]
  |.else
  |  jae ->fff_fallback
  |  fld qword [BASE+8]
  |  fistp TMP2
  |  mov RC, TMP2
  |.endif
  |  test RC, RC
  |  jle ->fff_emptystr            // Count <= 0? (or non-int)
  |  cmp dword STR:RB->len, 1
  |  jb ->fff_emptystr            // Zero length string?
  |  jne ->fff_fallback_2        // Fallback for > 1-char strings.
  |  cmp [DISPATCH+DISPATCH_GL(tmpbuf.sz)], RC;  jb ->fff_fallback_2
  |  movzx RA, byte STR:RB[1]
  |  mov RB, [DISPATCH+DISPATCH_GL(tmpbuf.buf)]
  |.if X64
  |  mov TMP3, RC
  |.else
  |  mov ARG3, RC
  |.endif
  |1:  // Fill buffer with char. Yes, this is suboptimal code (do you care?).
  |  mov [RB], RAL
  |  add RB, 1
  |  sub RC, 1
  |  jnz <1
  |  mov RD, [DISPATCH+DISPATCH_GL(tmpbuf.buf)]
  |  jmp ->fff_newstr
  |
  |.ffunc_1 string_reverse
  |  ffgccheck
  |  cmp dword [BASE+4], LJ_TSTR;  jne ->fff_fallback
  |  mov STR:RB, [BASE]
  |  mov RC, STR:RB->len
  |  test RC, RC
  |  jz ->fff_emptystr            // Zero length string?
  |  cmp [DISPATCH+DISPATCH_GL(tmpbuf.sz)], RC;  jb ->fff_fallback_1
  |  add RB, #STR
  |  mov TMP2, PC            // Need another temp register.
  |.if X64
  |  mov TMP3, RC
  |.else
  |  mov ARG3, RC
  |.endif
  |  mov PC, [DISPATCH+DISPATCH_GL(tmpbuf.buf)]
  |1:
  |  movzx RA, byte [RB]
  |  add RB, 1
  |  sub RC, 1
  |  mov [PC+RC], RAL
  |  jnz <1
  |  mov RD, PC
  |  mov PC, TMP2
  |  jmp ->fff_newstr
  |
  |.macro ffstring_case, name, lo, hi
  |  .ffunc_1 name
  |  ffgccheck
  |  cmp dword [BASE+4], LJ_TSTR;  jne ->fff_fallback
  |  mov STR:RB, [BASE]
  |  mov RC, STR:RB->len
  |  cmp [DISPATCH+DISPATCH_GL(tmpbuf.sz)], RC;  jb ->fff_fallback_1
  |  add RB, #STR
  |  mov TMP2, PC            // Need another temp register.
  |.if X64
  |  mov TMP3, RC
  |.else
  |  mov ARG3, RC
  |.endif
  |  mov PC, [DISPATCH+DISPATCH_GL(tmpbuf.buf)]
  |  jmp >3
  |1:  // ASCII case conversion. Yes, this is suboptimal code (do you care?).
  |  movzx RA, byte [RB+RC]
  |  cmp RA, lo
  |  jb >2
  |  cmp RA, hi
  |  ja >2
  |  xor RA, 0x20
  |2:
  |  mov [PC+RC], RAL
  |3:
  |  sub RC, 1
  |  jns <1
  |  mov RD, PC
  |  mov PC, TMP2
  |  jmp ->fff_newstr
  |.endmacro
  |
  |ffstring_case string_lower, 0x41, 0x5a
  |ffstring_case string_upper, 0x61, 0x7a
  |
  |//-- Table library ------------------------------------------------------
  |
  |.ffunc_1 table_getn
  |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
  |  mov RB, BASE            // Save BASE.
  |  mov TAB:FCARG1, [BASE]
  |  call extern lj_tab_len@4        // LJ_FASTCALL (GCtab *t)
  |  // Length of table returned in eax (RD).
  |  mov BASE, RB            // Restore BASE.
  |.if DUALNUM
  |  mov RB, RD; jmp ->fff_resi
  |.elif SSE
  |  cvtsi2sd xmm0, RD; jmp ->fff_resxmm0
  |.else
  |  mov ARG1, RD; fild ARG1; jmp ->fff_resn
  |.endif
  |
  |//-- Bit library --------------------------------------------------------
  |
  |.define TOBIT_BIAS, 0x59c00000    // 2^52 + 2^51 (float, not double!).
  |
  |.macro .ffunc_bit, name, kind
  |  .ffunc_1 name
  |.if kind == 2
  |.if SSE
  |  sseconst_tobit xmm1, RBa
  |.else
  |  mov TMP1, TOBIT_BIAS
  |.endif
  |.endif
  |  cmp dword [BASE+4], LJ_TISNUM
  |.if DUALNUM
  |  jne >1
  |  mov RB, dword [BASE]
  |.if kind > 0
  |  jmp >2
  |.else
  |  jmp ->fff_resbit
  |.endif
  |1:
  |  ja ->fff_fallback
  |.else
  |  jae ->fff_fallback
  |.endif
  |.if SSE
  |  movsd xmm0, qword [BASE]
  |.if kind < 2
  |  sseconst_tobit xmm1, RBa
  |.endif
  |  addsd xmm0, xmm1
  |  movd RB, xmm0
  |.else
  |  fld qword [BASE]
  |.if kind < 2
  |  mov TMP1, TOBIT_BIAS
  |.endif
  |  fadd TMP1
  |  fstp FPARG1
  |.if kind > 0
  |  mov RB, ARG1
  |.endif
  |.endif
  |2:
  |.endmacro
  |
  |.ffunc_bit bit_tobit, 0
  |.if DUALNUM or SSE
  |.if not SSE
  |  mov RB, ARG1
  |.endif
  |  jmp ->fff_resbit
  |.else
  |  fild ARG1
  |  jmp ->fff_resn
  |.endif
  |
  |.macro .ffunc_bit_op, name, ins
  |  .ffunc_bit name, 2
  |  mov TMP2, NARGS:RD            // Save for fallback.
  |  lea RD, [BASE+NARGS:RD*8-16]
  |1:
  |  cmp RD, BASE
  |  jbe ->fff_resbit
  |  cmp dword [RD+4], LJ_TISNUM
  |.if DUALNUM
  |  jne >2
  |  ins RB, dword [RD]
  |  sub RD, 8
  |  jmp <1
  |2:
  |  ja ->fff_fallback_bit_op
  |.else
  |  jae ->fff_fallback_bit_op
  |.endif
  |.if SSE
  |  movsd xmm0, qword [RD]
  |  addsd xmm0, xmm1
  |  movd RA, xmm0
  |  ins RB, RA
  |.else
  |  fld qword [RD]
  |  fadd TMP1
  |  fstp FPARG1
  |  ins RB, ARG1
  |.endif
  |  sub RD, 8
  |  jmp <1
  |.endmacro
  |
  |.ffunc_bit_op bit_band, and
  |.ffunc_bit_op bit_bor, or
  |.ffunc_bit_op bit_bxor, xor
  |
  |.ffunc_bit bit_bswap, 1
  |  bswap RB
  |  jmp ->fff_resbit
  |
  |.ffunc_bit bit_bnot, 1
  |  not RB
  |.if DUALNUM
  |  jmp ->fff_resbit
  |.elif SSE
  |->fff_resbit:
  |  cvtsi2sd xmm0, RB
  |  jmp ->fff_resxmm0
  |.else
  |->fff_resbit:
  |  mov ARG1, RB
  |  fild ARG1
  |  jmp ->fff_resn
  |.endif
  |
  |->fff_fallback_bit_op:
  |  mov NARGS:RD, TMP2            // Restore for fallback
  |  jmp ->fff_fallback
  |
  |.macro .ffunc_bit_sh, name, ins
  |.if DUALNUM
  |  .ffunc_bit name, 1
  |  // Note: no inline conversion from number for 2nd argument!
  |  cmp dword [BASE+12], LJ_TISNUM; jne ->fff_fallback
  |  mov RA, dword [BASE+8]
  |.elif SSE
  |  .ffunc_nnsse name
  |  sseconst_tobit xmm2, RBa
  |  addsd xmm0, xmm2
  |  addsd xmm1, xmm2
  |  movd RB, xmm0
  |  movd RA, xmm1
  |.else
  |  .ffunc_nn name
  |  mov TMP1, TOBIT_BIAS
  |  fadd TMP1
  |  fstp FPARG3
  |  fadd TMP1
  |  fstp FPARG1
  |  mov RA, ARG3
  |  mov RB, ARG1
  |.endif
  |  ins RB, cl                // Assumes RA is ecx.
  |  jmp ->fff_resbit
  |.endmacro
  |
  |.ffunc_bit_sh bit_lshift, shl
  |.ffunc_bit_sh bit_rshift, shr
  |.ffunc_bit_sh bit_arshift, sar
  |.ffunc_bit_sh bit_rol, rol
  |.ffunc_bit_sh bit_ror, ror
  |
  |//-----------------------------------------------------------------------
  |
  |->fff_fallback_2:
  |  mov NARGS:RD, 1+2            // Other args are ignored, anyway.
  |  jmp ->fff_fallback
  |->fff_fallback_1:
  |  mov NARGS:RD, 1+1            // Other args are ignored, anyway.
  |->fff_fallback:            // Call fast function fallback handler.
  |  // BASE = new base, RD = nargs+1
  |  mov L:RB, SAVE_L
  |  mov PC, [BASE-4]            // Fallback may overwrite PC.
  |  mov SAVE_PC, PC            // Redundant (but a defined value).
  |  mov L:RB->base, BASE
  |  lea RD, [BASE+NARGS:RD*8-8]
  |  lea RA, [RD+8*LUA_MINSTACK]    // Ensure enough space for handler.
  |  mov L:RB->top, RD
  |  mov CFUNC:RD, [BASE-8]
  |  cmp RA, L:RB->maxstack
  |  ja >5                // Need to grow stack.
  |.if X64
  |  mov CARG1d, L:RB
  |.else
  |  mov ARG1, L:RB
  |.endif
  |  call aword CFUNC:RD->f        // (lua_State *L)
  |  mov BASE, L:RB->base
  |  // Either throws an error, or recovers and returns -1, 0 or nresults+1.
  |  test RD, RD;  jg ->fff_res        // Returned nresults+1?
  |1:
  |  mov RA, L:RB->top
  |  sub RA, BASE
  |  shr RA, 3
  |  test RD, RD
  |  lea NARGS:RD, [RA+1]
  |  mov LFUNC:RB, [BASE-8]
  |  jne ->vm_call_tail            // Returned -1?
  |  ins_callt                // Returned 0: retry fast path.
  |
  |// Reconstruct previous base for vmeta_call during tailcall.
  |->vm_call_tail:
  |  mov RA, BASE
  |  test PC, FRAME_TYPE
  |  jnz >3
  |  movzx RB, PC_RA
  |  not RBa                // Note: ~RB = -(RB+1)
  |  lea BASE, [BASE+RB*8]        // base = base - (RB+1)*8
  |  jmp ->vm_call_dispatch        // Resolve again for tailcall.
  |3:
  |  mov RB, PC
  |  and RB, -8
  |  sub BASE, RB
  |  jmp ->vm_call_dispatch        // Resolve again for tailcall.
  |
  |5:  // Grow stack for fallback handler.
  |  mov FCARG2, LUA_MINSTACK
  |  mov FCARG1, L:RB
  |  call extern lj_state_growstack@8    // (lua_State *L, int n)
  |  mov BASE, L:RB->base
  |  xor RD, RD                // Simulate a return 0.
  |  jmp <1                // Dumb retry (goes through ff first).
  |
  |->fff_gcstep:            // Call GC step function.
  |  // BASE = new base, RD = nargs+1
  |  pop RBa                // Must keep stack at same level.
  |  mov TMPa, RBa            // Save return address
  |  mov L:RB, SAVE_L
  |  mov SAVE_PC, PC            // Redundant (but a defined value).
  |  mov L:RB->base, BASE
  |  lea RD, [BASE+NARGS:RD*8-8]
  |  mov FCARG1, L:RB
  |  mov L:RB->top, RD
  |  call extern lj_gc_step@4        // (lua_State *L)
  |  mov BASE, L:RB->base
  |  mov RD, L:RB->top
  |  sub RD, BASE
  |  shr RD, 3
  |  add NARGS:RD, 1
  |  mov RBa, TMPa
  |  push RBa                // Restore return address.
  |  ret
  |
  |//-----------------------------------------------------------------------
  |//-- Special dispatch targets -------------------------------------------
  |//-----------------------------------------------------------------------
  |
  |->vm_record:                // Dispatch target for recording phase.
  |.if JIT
  |  movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]
  |  test RDL, HOOK_VMEVENT        // No recording while in vmevent.
  |  jnz >5
  |  // Decrement the hookcount for consistency, but always do the call.
  |  test RDL, HOOK_ACTIVE
  |  jnz >1
  |  test RDL, LUA_MASKLINE|LUA_MASKCOUNT
  |  jz >1
  |  dec dword [DISPATCH+DISPATCH_GL(hookcount)]
  |  jmp >1
  |.endif
  |
  |->vm_rethook:            // Dispatch target for return hooks.
  |  movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]
  |  test RDL, HOOK_ACTIVE        // Hook already active?
  |  jnz >5
  |  jmp >1
  |
  |->vm_inshook:            // Dispatch target for instr/line hooks.
  |  movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]
  |  test RDL, HOOK_ACTIVE        // Hook already active?
  |  jnz >5
  |
  |  test RDL, LUA_MASKLINE|LUA_MASKCOUNT
  |  jz >5
  |  dec dword [DISPATCH+DISPATCH_GL(hookcount)]
  |  jz >1
  |  test RDL, LUA_MASKLINE
  |  jz >5
  |1:
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, BASE
  |  mov FCARG2, PC            // Caveat: FCARG2 == BASE
  |  mov FCARG1, L:RB
  |  // SAVE_PC must hold the _previous_ PC. The callee updates it with PC.
  |  call extern lj_dispatch_ins@8    // (lua_State *L, BCIns *pc)
  |3:
  |  mov BASE, L:RB->base
  |4:
  |  movzx RA, PC_RA
  |5:
  |  movzx OP, PC_OP
  |  movzx RD, PC_RD
  |.if X64
  |  jmp aword [DISPATCH+OP*8+GG_DISP2STATIC]    // Re-dispatch to static ins.
  |.else
  |  jmp aword [DISPATCH+OP*4+GG_DISP2STATIC]    // Re-dispatch to static ins.
  |.endif
  |
  |->cont_hook:                // Continue from hook yield.
  |  add PC, 4
  |  mov RA, [RB-24]
  |  mov MULTRES, RA            // Restore MULTRES for *M ins.
  |  jmp <4
  |
  |->vm_hotloop:            // Hot loop counter underflow.
  |.if JIT
  |  mov LFUNC:RB, [BASE-8]        // Same as curr_topL(L).
  |  mov RB, LFUNC:RB->pc
  |  movzx RD, byte [RB+PC2PROTO(framesize)]
  |  lea RD, [BASE+RD*8]
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, BASE
  |  mov L:RB->top, RD
  |  mov FCARG2, PC
  |  lea FCARG1, [DISPATCH+GG_DISP2J]
  |  mov aword [DISPATCH+DISPATCH_J(L)], L:RBa
  |  mov SAVE_PC, PC
  |  call extern lj_trace_hot@8        // (jit_State *J, const BCIns *pc)
  |  jmp <3
  |.endif
  |
  |->vm_callhook:            // Dispatch target for call hooks.
  |  mov SAVE_PC, PC
  |.if JIT
  |  jmp >1
  |.endif
  |
  |->vm_hotcall:            // Hot call counter underflow.
  |.if JIT
  |  mov SAVE_PC, PC
  |  or PC, 1                // Marker for hot call.
  |1:
  |.endif
  |  lea RD, [BASE+NARGS:RD*8-8]
  |  mov L:RB, SAVE_L
  |  mov L:RB->base, BASE
  |  mov L:RB->top, RD
  |  mov FCARG2, PC
  |  mov FCARG1, L:RB
  |  call extern lj_dispatch_call@8    // (lua_State *L, const BCIns *pc)
  |  // ASMFunction returned in eax/rax (RDa).
  |  mov SAVE_PC, 0            // Invalidate for subsequent line hook.
  |.if JIT
  |  and PC, -2
  |.endif
  |  mov BASE, L:RB->base
  |  mov RAa, RDa
  |  mov RD, L:RB->top
  |  sub RD, BASE
  |  mov RBa, RAa
  |  movzx RA, PC_RA
  |  shr RD, 3
  |  add NARGS:RD, 1
  |  jmp RBa
  |
  |//-----------------------------------------------------------------------
  |//-- Trace exit handler -------------------------------------------------
  |//-----------------------------------------------------------------------
  |
  |// Called from an exit stub with the exit number on the stack.
  |// The 16 bit exit number is stored with two (sign-extended) push imm8.
  |->vm_exit_handler:
  |.if JIT
  |.if X64
  |  push r13; push r12
  |  push r11; push r10; push r9; push r8
  |  push rdi; push rsi; push rbp; lea rbp, [rsp+88]; push rbp
  |  push rbx; push rdx; push rcx; push rax
  |  movzx RC, byte [rbp-8]        // Reconstruct exit number.
  |  mov RCH, byte [rbp-16]
  |  mov [rbp-8], r15; mov [rbp-16], r14
  |.else
  |  push ebp; lea ebp, [esp+12]; push ebp
  |  push ebx; push edx; push ecx; push eax
  |  movzx RC, byte [ebp-4]        // Reconstruct exit number.
  |  mov RCH, byte [ebp-8]
  |  mov [ebp-4], edi; mov [ebp-8], esi
  |.endif
  |  // Caveat: DISPATCH is ebx.
  |  mov DISPATCH, [ebp]
  |  mov RA, [DISPATCH+DISPATCH_GL(vmstate)]    // Get trace number.
  |  set_vmstate EXIT
  |  mov [DISPATCH+DISPATCH_J(exitno)], RC
  |  mov [DISPATCH+DISPATCH_J(parent)], RA
  |.if X64
  |.if X64WIN
  |  sub rsp, 16*8+4*8            // Room for SSE regs + save area.
  |.else
  |  sub rsp, 16*8            // Room for SSE regs.
  |.endif
  |  add rbp, -128
  |  movsd qword [rbp-8],   xmm15; movsd qword [rbp-16],  xmm14
  |  movsd qword [rbp-24],  xmm13; movsd qword [rbp-32],  xmm12
  |  movsd qword [rbp-40],  xmm11; movsd qword [rbp-48],  xmm10
  |  movsd qword [rbp-56],  xmm9;  movsd qword [rbp-64],  xmm8
  |  movsd qword [rbp-72],  xmm7;  movsd qword [rbp-80],  xmm6
  |  movsd qword [rbp-88],  xmm5;  movsd qword [rbp-96],  xmm4
  |  movsd qword [rbp-104], xmm3;  movsd qword [rbp-112], xmm2
  |  movsd qword [rbp-120], xmm1;  movsd qword [rbp-128], xmm0
  |.else
  |  sub esp, 8*8+16            // Room for SSE regs + args.
  |  movsd qword [ebp-40], xmm7; movsd qword [ebp-48], xmm6
  |  movsd qword [ebp-56], xmm5; movsd qword [ebp-64], xmm4
  |  movsd qword [ebp-72], xmm3; movsd qword [ebp-80], xmm2
  |  movsd qword [ebp-88], xmm1; movsd qword [ebp-96], xmm0
  |.endif
  |  // Caveat: RB is ebp.
  |  mov L:RB, [DISPATCH+DISPATCH_GL(jit_L)]
  |  mov BASE, [DISPATCH+DISPATCH_GL(jit_base)]
  |  mov aword [DISPATCH+DISPATCH_J(L)], L:RBa
  |  mov dword [DISPATCH+DISPATCH_GL(jit_L)], 0
  |  mov L:RB->base, BASE
  |.if X64WIN
  |  lea CARG2, [rsp+4*8]
  |.elif X64
  |  mov CARG2, rsp
  |.else
  |  lea FCARG2, [esp+16]
  |.endif
  |  lea FCARG1, [DISPATCH+GG_DISP2J]
  |  call extern lj_trace_exit@8    // (jit_State *J, ExitState *ex)
  |  // MULTRES or negated error code returned in eax (RD).
  |  mov RAa, L:RB->cframe
  |  and RAa, CFRAME_RAWMASK
  |.if X64WIN
  |  // Reposition stack later.
  |.elif X64
  |  mov rsp, RAa            // Reposition stack to C frame.
  |.else
  |  mov esp, RAa            // Reposition stack to C frame.
  |.endif
  |  mov [RAa+CFRAME_OFS_L], L:RB    // Set SAVE_L (on-trace resume/yield).
  |  mov BASE, L:RB->base
  |  mov PC, [RAa+CFRAME_OFS_PC]    // Get SAVE_PC.
  |.if X64
  |  jmp >1
  |.endif
  |.endif
  |->vm_exit_interp:
  |  // RD = MULTRES or negated error code, BASE, PC and DISPATCH set.
  |.if JIT
  |.if X64
  |  // Restore additional callee-save registers only used in compiled code.
  |.if X64WIN
  |  lea RAa, [rsp+9*16+4*8]
  |1:
  |  movdqa xmm15, [RAa-9*16]
  |  movdqa xmm14, [RAa-8*16]
  |  movdqa xmm13, [RAa-7*16]
  |  movdqa xmm12, [RAa-6*16]
  |  movdqa xmm11, [RAa-5*16]
  |  movdqa xmm10, [RAa-4*16]
  |  movdqa xmm9, [RAa-3*16]
  |  movdqa xmm8, [RAa-2*16]
  |  movdqa xmm7, [RAa-1*16]
  |  mov rsp, RAa            // Reposition stack to C frame.
  |  movdqa xmm6, [RAa]
  |  mov r15, CSAVE_3
  |  mov r14, CSAVE_4
  |.else
  |  add rsp, 16            // Reposition stack to C frame.
  |1:
  |.endif
  |  mov r13, TMPa
  |  mov r12, TMPQ
  |.endif
  |  test RD, RD; js >3            // Check for error from exit.
  |  mov MULTRES, RD
  |  mov LFUNC:KBASE, [BASE-8]
  |  mov KBASE, LFUNC:KBASE->pc
  |  mov KBASE, [KBASE+PC2PROTO(k)]
  |  mov dword [DISPATCH+DISPATCH_GL(jit_L)], 0
  |  set_vmstate INTERP
  |  // Modified copy of ins_next which handles function header dispatch, too.
  |  mov RC, [PC]
  |  movzx RA, RCH
  |  movzx OP, RCL
  |  add PC, 4
  |  shr RC, 16
  |  cmp OP, BC_FUNCF            // Function header?
  |  jb >2
  |  mov RC, MULTRES            // RC/RD holds nres+1.
  |2:
  |.if X64
  |  jmp aword [DISPATCH+OP*8]
  |.else
  |  jmp aword [DISPATCH+OP*4]
  |.endif
  |
  |3:  // Rethrow error from the right C frame.
  |  neg RD
  |  mov FCARG1, L:RB
  |  mov FCARG2, RD
  |  call extern lj_err_throw@8        // (lua_State *L, int errcode)
  |.endif
  |
  |//-----------------------------------------------------------------------
  |//-- Math helper functions ----------------------------------------------
  |//-----------------------------------------------------------------------
  |
  |// FP value rounding. Called by math.floor/math.ceil fast functions
  |// and from JIT code.
  |
  |// x87 variant: Arg/ret on x87 stack. No int/xmm registers modified.
  |.macro vm_round_x87, mode1, mode2
  |  fnstcw word [esp+4]        // Caveat: overwrites ARG1 and ARG2.
  |  mov [esp+8], eax
  |  mov ax, mode1
  |  or ax, [esp+4]
  |.if mode2 ~= 0xffff
  |  and ax, mode2
  |.endif
  |  mov [esp+6], ax
  |  fldcw word [esp+6]
  |  frndint
  |  fldcw word [esp+4]
  |  mov eax, [esp+8]
  |  ret
  |.endmacro
  |
  |// SSE variant: arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
  |.macro vm_round_sse, mode
  |  sseconst_abs xmm2, RDa
  |  sseconst_2p52 xmm3, RDa
  |  movaps xmm1, xmm0
  |  andpd xmm1, xmm2            // |x|
  |  ucomisd xmm3, xmm1            // No truncation if 2^52 <= |x|.
  |  jbe >1
  |  andnpd xmm2, xmm0            // Isolate sign bit.
  |.if mode == 2        // trunc(x)?
  |  movaps xmm0, xmm1
  |  addsd xmm1, xmm3            // (|x| + 2^52) - 2^52
  |  subsd xmm1, xmm3
  |  sseconst_1 xmm3, RDa
  |  cmpsd xmm0, xmm1, 1        // |x| < result?
  |  andpd xmm0, xmm3
  |  subsd xmm1, xmm0            // If yes, subtract -1.
  |  orpd xmm1, xmm2            // Merge sign bit back in.
  |.else
  |  addsd xmm1, xmm3            // (|x| + 2^52) - 2^52
  |  subsd xmm1, xmm3
  |  orpd xmm1, xmm2            // Merge sign bit back in.
  |  .if mode == 1        // ceil(x)?
  |    sseconst_m1 xmm2, RDa        // Must subtract -1 to preserve -0.
  |    cmpsd xmm0, xmm1, 6        // x > result?
  |  .else            // floor(x)?
  |    sseconst_1 xmm2, RDa
  |    cmpsd xmm0, xmm1, 1        // x < result?
  |  .endif
  |  andpd xmm0, xmm2
  |  subsd xmm1, xmm0            // If yes, subtract +-1.
  |.endif
  |  movaps xmm0, xmm1
  |1:
  |  ret
  |.endmacro
  |
  |.macro vm_round, name, ssemode, mode1, mode2
  |->name:
  |.if not SSE
  |  vm_round_x87 mode1, mode2
  |.endif
  |->name .. _sse:
  |  vm_round_sse ssemode
  |.endmacro
  |
  |  vm_round vm_floor, 0, 0x0400, 0xf7ff
  |  vm_round vm_ceil,  1, 0x0800, 0xfbff
  |  vm_round vm_trunc, 2, 0x0c00, 0xffff
  |
  |// FP modulo x%y. Called by BC_MOD* and vm_arith.
  |->vm_mod:
  |.if SSE
  |// Args in xmm0/xmm1, return value in xmm0.
  |// Caveat: xmm0-xmm5 and RC (eax) modified!
  |  movaps xmm5, xmm0
  |  divsd xmm0, xmm1
  |  sseconst_abs xmm2, RDa
  |  sseconst_2p52 xmm3, RDa
  |  movaps xmm4, xmm0
  |  andpd xmm4, xmm2            // |x/y|
  |  ucomisd xmm3, xmm4            // No truncation if 2^52 <= |x/y|.
  |  jbe >1
  |  andnpd xmm2, xmm0            // Isolate sign bit.
  |  addsd xmm4, xmm3            // (|x/y| + 2^52) - 2^52
  |  subsd xmm4, xmm3
  |  orpd xmm4, xmm2            // Merge sign bit back in.
  |  sseconst_1 xmm2, RDa
  |  cmpsd xmm0, xmm4, 1        // x/y < result?
  |  andpd xmm0, xmm2
  |  subsd xmm4, xmm0            // If yes, subtract 1.0.
  |  movaps xmm0, xmm5
  |  mulsd xmm1, xmm4
  |  subsd xmm0, xmm1
  |  ret
  |1:
  |  mulsd xmm1, xmm0
  |  movaps xmm0, xmm5
  |  subsd xmm0, xmm1
  |  ret
  |.else
  |// Args/ret on x87 stack (y on top). No xmm registers modified.
  |// Caveat: needs 3 slots on x87 stack! RC (eax) modified!
  |  fld st1
  |  fdiv st1
  |  fnstcw word [esp+4]
  |  mov ax, 0x0400
  |  or ax, [esp+4]
  |  and ax, 0xf7ff
  |  mov [esp+6], ax
  |  fldcw word [esp+6]
  |  frndint
  |  fldcw word [esp+4]
  |  fmulp st1
  |  fsubp st1
  |  ret
  |.endif
  |
  |// FP log2(x). Called by math.log(x, base).
  |->vm_log2:
  |.if X64WIN
  |  movsd qword [rsp+8], xmm0        // Use scratch area.
  |  fld1
  |  fld qword [rsp+8]
  |  fyl2x
  |  fstp qword [rsp+8]
  |  movsd xmm0, qword [rsp+8]
  |.elif X64
  |  movsd qword [rsp-8], xmm0        // Use red zone.
  |  fld1
  |  fld qword [rsp-8]
  |  fyl2x
  |  fstp qword [rsp-8]
  |  movsd xmm0, qword [rsp-8]
  |.else
  |  fld1
  |  fld qword [esp+4]
  |  fyl2x
  |.endif
  |  ret
  |
  |// FP exponentiation e^x and 2^x. Called by math.exp fast function and
  |// from JIT code. Arg/ret on x87 stack. No int/xmm regs modified.
  |// Caveat: needs 3 slots on x87 stack!
  |->vm_exp_x87:
  |  fldl2e; fmulp st1                // e^x ==> 2^(x*log2(e))
  |->vm_exp2_x87:
  |  .if X64WIN
  |    .define expscratch, dword [rsp+8]    // Use scratch area.
  |  .elif X64
  |    .define expscratch, dword [rsp-8]    // Use red zone.
  |  .else
  |    .define expscratch, dword [esp+4]    // Needs 4 byte scratch area.
  |  .endif
  |  fst expscratch                // Caveat: overwrites ARG1.
  |  cmp expscratch, 0x7f800000; je >1        // Special case: e^+Inf = +Inf
  |  cmp expscratch, 0xff800000; je >2        // Special case: e^-Inf = 0
  |->vm_exp2raw:  // Entry point for vm_pow. Without +-Inf check.
  |  fdup; frndint; fsub st1, st0; fxch        // Split into frac/int part.
  |  f2xm1; fld1; faddp st1; fscale; fpop1    // ==> (2^frac-1 +1) << int
  |1:
  |  ret
  |2:
  |  fpop; fldz; ret
  |
  |// Generic power function x^y. Called by BC_POW, math.pow fast function,
  |// and vm_arith.
  |// Args/ret on x87 stack (y on top). RC (eax) modified.
  |// Caveat: needs 3 slots on x87 stack!
  |->vm_pow:
  |.if not SSE
  |  fist dword [esp+4]            // Store/reload int before comparison.
  |  fild dword [esp+4]            // Integral exponent used in vm_powi.
  |  fucomip st1
  |  jnz >8                // Branch for FP exponents.
  |  jp >9                // Branch for NaN exponent.
  |  fpop                // Pop y and fallthrough to vm_powi.
  |
  |// FP/int power function x^i. Arg1/ret on x87 stack.
  |// Arg2 (int) on C stack. RC (eax) modified.
  |// Caveat: needs 2 slots on x87 stack!
  |  mov eax, [esp+4]
  |  cmp eax, 1; jle >6            // i<=1?
  |  // Now 1 < (unsigned)i <= 0x80000000.
  |1:  // Handle leading zeros.
  |  test eax, 1; jnz >2
  |  fmul st0
  |  shr eax, 1
  |  jmp <1
  |2:
  |  shr eax, 1; jz >5
  |  fdup
  |3:  // Handle trailing bits.
  |  fmul st0
  |  shr eax, 1; jz >4
  |  jnc <3
  |  fmul st1, st0
  |  jmp <3
  |4:
  |  fmulp st1
  |5:
  |  ret
  |6:
  |  je <5                // x^1 ==> x
  |  jb >7
  |  fld1; fdivrp st1
  |  neg eax
  |  cmp eax, 1; je <5            // x^-1 ==> 1/x
  |  jmp <1                // x^-i ==> (1/x)^i
  |7:
  |  fpop; fld1                // x^0 ==> 1
  |  ret
  |
  |8:  // FP/FP power function x^y.
  |  fst dword [esp+4]
  |  fxch
  |  fst dword [esp+8]
  |  mov eax, [esp+4]; shl eax, 1
  |  cmp eax, 0xff000000; je >2            // x^+-Inf?
  |  mov eax, [esp+8]; shl eax, 1; je >4    // +-0^y?
  |  cmp eax, 0xff000000; je >4            // +-Inf^y?
  |  fyl2x
  |  jmp ->vm_exp2raw
  |
  |9:  // Handle x^NaN.
  |  fld1
  |  fucomip st2
  |  je >1                // 1^NaN ==> 1
  |  fxch                // x^NaN ==> NaN
  |1:
  |  fpop
  |  ret
  |
  |2:  // Handle x^+-Inf.
  |  fabs
  |  fld1
  |  fucomip st1
  |  je >3                    // +-1^+-Inf ==> 1
  |  fpop; fabs; fldz; mov eax, 0; setc al
  |  ror eax, 1; xor eax, [esp+4]; jns >3    // |x|<>1, x^+-Inf ==> +Inf/0
  |  fxch
  |3:
  |  fpop1; fabs
  |  ret
  |
  |4:  // Handle +-0^y or +-Inf^y.
  |  cmp dword [esp+4], 0; jge <3        // y >= 0, x^y ==> |x|
  |  fpop; fpop
  |  test eax, eax; jz >5            // y < 0, +-0^y ==> +Inf
  |  fldz                    // y < 0, +-Inf^y ==> 0
  |  ret
  |5:
  |  mov dword [esp+4], 0x7f800000        // Return +Inf.
  |  fld dword [esp+4]
  |  ret
  |.endif
  |
  |// Args in xmm0/xmm1. Ret in xmm0. xmm0-xmm2 and RC (eax) modified.
  |// Needs 16 byte scratch area for x86. Also called from JIT code.
  |->vm_pow_sse:
  |  cvtsd2si eax, xmm1
  |  cvtsi2sd xmm2, eax
  |  ucomisd xmm1, xmm2
  |  jnz >8                // Branch for FP exponents.
  |  jp >9                // Branch for NaN exponent.
  |  // Fallthrough to vm_powi_sse.
  |
  |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
  |->vm_powi_sse:
  |  cmp eax, 1; jle >6            // i<=1?
  |  // Now 1 < (unsigned)i <= 0x80000000.
  |1:  // Handle leading zeros.
  |  test eax, 1; jnz >2
  |  mulsd xmm0, xmm0
  |  shr eax, 1
  |  jmp <1
  |2:
  |  shr eax, 1; jz >5
  |  movaps xmm1, xmm0
  |3:  // Handle trailing bits.
  |  mulsd xmm0, xmm0
  |  shr eax, 1; jz >4
  |  jnc <3
  |  mulsd xmm1, xmm0
  |  jmp <3
  |4:
  |  mulsd xmm0, xmm1
  |5:
  |  ret
  |6:
  |  je <5                // x^1 ==> x
  |  jb >7                // x^0 ==> 1
  |  neg eax
  |  call <1
  |  sseconst_1 xmm1, RDa
  |  divsd xmm1, xmm0
  |  movaps xmm0, xmm1
  |  ret
  |7:
  |  sseconst_1 xmm0, RDa
  |  ret
  |
  |8:  // FP/FP power function x^y.
  |.if X64
  |  movd rax, xmm1; shl rax, 1
  |  rol rax, 12; cmp rax, 0xffe; je >2        // x^+-Inf?
  |  movd rax, xmm0; shl rax, 1; je >4        // +-0^y?
  |  rol rax, 12; cmp rax, 0xffe; je >5        // +-Inf^y?
  |  .if X64WIN
  |    movsd qword [rsp+16], xmm1        // Use scratch area.
  |    movsd qword [rsp+8], xmm0
  |    fld qword [rsp+16]
  |    fld qword [rsp+8]
  |  .else
  |    movsd qword [rsp-16], xmm1        // Use red zone.
  |    movsd qword [rsp-8], xmm0
  |    fld qword [rsp-16]
  |    fld qword [rsp-8]
  |  .endif
  |.else
  |  movsd qword [esp+12], xmm1            // Needs 16 byte scratch area.
  |  movsd qword [esp+4], xmm0
  |  cmp dword [esp+12], 0; jne >1
  |  mov eax, [esp+16]; shl eax, 1
  |  cmp eax, 0xffe00000; je >2            // x^+-Inf?
  |1:
  |  cmp dword [esp+4], 0; jne >1
  |  mov eax, [esp+8]; shl eax, 1; je >4    // +-0^y?
  |  cmp eax, 0xffe00000; je >5            // +-Inf^y?
  |1:
  |  fld qword [esp+12]
  |  fld qword [esp+4]
  |.endif
  |  fyl2x                    // y*log2(x)
  |  fdup; frndint; fsub st1, st0; fxch        // Split into frac/int part.
  |  f2xm1; fld1; faddp st1; fscale; fpop1    // ==> (2^frac-1 +1) << int
  |.if X64WIN
  |  fstp qword [rsp+8]                // Use scratch area.
  |  movsd xmm0, qword [rsp+8]
  |.elif X64
  |  fstp qword [rsp-8]                // Use red zone.
  |  movsd xmm0, qword [rsp-8]
  |.else
  |  fstp qword [esp+4]                // Needs 8 byte scratch area.
  |  movsd xmm0, qword [esp+4]
  |.endif
  |  ret
  |
  |9:  // Handle x^NaN.
  |  sseconst_1 xmm2, RDa
  |  ucomisd xmm0, xmm2; je >1            // 1^NaN ==> 1
  |  movaps xmm0, xmm1                // x^NaN ==> NaN
  |1:
  |  ret
  |
  |2:  // Handle x^+-Inf.
  |  sseconst_abs xmm2, RDa
  |  andpd xmm0, xmm2                // |x|
  |  sseconst_1 xmm2, RDa
  |  ucomisd xmm0, xmm2; je <1            // +-1^+-Inf ==> 1
  |  movmskpd eax, xmm1
  |  xorps xmm0, xmm0
  |  mov ah, al; setc al; xor al, ah; jne <1    // |x|<>1, x^+-Inf ==> +Inf/0
  |3:
  |  sseconst_hi xmm0, RDa, 7ff00000  // +Inf
  |  ret
  |
  |4:  // Handle +-0^y.
  |  movmskpd eax, xmm1; test eax, eax; jnz <3    // y < 0, +-0^y ==> +Inf
  |  xorps xmm0, xmm0                // y >= 0, +-0^y ==> 0
  |  ret
  |
  |5:  // Handle +-Inf^y.
  |  movmskpd eax, xmm1; test eax, eax; jz <3    // y >= 0, +-Inf^y ==> +Inf
  |  xorps xmm0, xmm0                // y < 0, +-Inf^y ==> 0
  |  ret
  |
  |// Callable from C: double lj_vm_foldfpm(double x, int fpm)
  |// Computes fpm(x) for extended math functions. ORDER FPM.
  |->vm_foldfpm:
  |.if JIT
  |.if X64
  |  .if X64WIN
  |    .define fpmop, CARG2d
  |  .else
  |    .define fpmop, CARG1d
  |  .endif
  |  cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil
  |  cmp fpmop, 3; jb ->vm_trunc; ja >2
  |  sqrtsd xmm0, xmm0; ret
  |2:
  |  .if X64WIN
  |    movsd qword [rsp+8], xmm0    // Use scratch area.
  |    fld qword [rsp+8]
  |  .else
  |    movsd qword [rsp-8], xmm0    // Use red zone.
  |    fld qword [rsp-8]
  |  .endif
  |  cmp fpmop, 5; ja >2
  |  .if X64WIN; pop rax; .endif
  |  je >1
  |  call ->vm_exp_x87
  |  .if X64WIN; push rax; .endif
  |  jmp >7
  |1:
  |  call ->vm_exp2_x87
  |  .if X64WIN; push rax; .endif
  |  jmp >7
  |2: ; cmp fpmop, 7; je >1; ja >2
  |  fldln2; fxch; fyl2x; jmp >7
  |1: ; fld1; fxch; fyl2x; jmp >7
  |2: ; cmp fpmop, 9; je >1; ja >2
  |  fldlg2; fxch; fyl2x; jmp >7
  |1: ; fsin; jmp >7
  |2: ; cmp fpmop, 11; je >1; ja >9
  |   fcos; jmp >7
  |1: ; fptan; fpop
  |7:
  |  .if X64WIN
  |    fstp qword [rsp+8]        // Use scratch area.
  |    movsd xmm0, qword [rsp+8]
  |  .else
  |    fstp qword [rsp-8]        // Use red zone.
  |    movsd xmm0, qword [rsp-8]
  |  .endif
  |  ret
  |.else  // x86 calling convention.
  |  .define fpmop, eax
  |.if SSE
  |  mov fpmop, [esp+12]
  |  movsd xmm0, qword [esp+4]
  |  cmp fpmop, 1; je >1; ja >2
  |  call ->vm_floor; jmp >7
  |1: ; call ->vm_ceil; jmp >7
  |2: ; cmp fpmop, 3; je >1; ja >2
  |  call ->vm_trunc; jmp >7
  |1:
  |  sqrtsd xmm0, xmm0
  |7:
  |  movsd qword [esp+4], xmm0    // Overwrite callee-owned args.
  |  fld qword [esp+4]
  |  ret
  |2: ; fld qword [esp+4]
  |  cmp fpmop, 5; jb ->vm_exp_x87; je ->vm_exp2_x87
  |2: ; cmp fpmop, 7; je >1; ja >2
  |  fldln2; fxch; fyl2x; ret
  |1: ; fld1; fxch; fyl2x; ret
  |2: ; cmp fpmop, 9; je >1; ja >2
  |  fldlg2; fxch; fyl2x; ret
  |1: ; fsin; ret
  |2: ; cmp fpmop, 11; je >1; ja >9
  |   fcos; ret
  |1: ; fptan; fpop; ret
  |.else
  |  mov fpmop, [esp+12]
  |  fld qword [esp+4]
  |  cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil
  |  cmp fpmop, 3; jb ->vm_trunc; ja >2
  |  fsqrt; ret
  |2: ; cmp fpmop, 5; jb ->vm_exp_x87; je ->vm_exp2_x87
  |  cmp fpmop, 7; je >1; ja >2
  |  fldln2; fxch; fyl2x; ret
  |1: ; fld1; fxch; fyl2x; ret
  |2: ; cmp fpmop, 9; je >1; ja >2
  |  fldlg2; fxch; fyl2x; ret
  |1: ; fsin; ret
  |2: ; cmp fpmop, 11; je >1; ja >9
  |   fcos; ret
  |1: ; fptan; fpop; ret
  |.endif
  |.endif
  |9: ; int3                    // Bad fpm.
  |.endif
  |
  |// Callable from C: double lj_vm_foldarith(double x, double y, int op)
  |// Compute x op y for basic arithmetic operators (+ - * / % ^ and unary -)
  |// and basic math functions. ORDER ARITH
  |->vm_foldarith:
  |.if X64
  |
  |  .if X64WIN
  |    .define foldop, CARG3d
  |  .else
  |    .define foldop, CARG1d
  |  .endif
  |  cmp foldop, 1; je >1; ja >2
  |  addsd xmm0, xmm1; ret
  |1: ; subsd xmm0, xmm1; ret
  |2: ; cmp foldop, 3; je >1; ja >2
  |  mulsd xmm0, xmm1; ret
  |1: ; divsd xmm0, xmm1; ret
  |2: ; cmp foldop, 5; jb ->vm_mod; je ->vm_pow
  |  cmp foldop, 7; je >1; ja >2
  |  sseconst_sign xmm1, RDa; xorps xmm0, xmm1; ret
  |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; ret
  |2: ; cmp foldop, 9; ja >2
  |.if X64WIN
  |  movsd qword [rsp+8], xmm0    // Use scratch area.
  |  movsd qword [rsp+16], xmm1
  |  fld qword [rsp+8]
  |  fld qword [rsp+16]
  |.else
  |  movsd qword [rsp-8], xmm0    // Use red zone.
  |  movsd qword [rsp-16], xmm1
  |  fld qword [rsp-8]
  |  fld qword [rsp-16]
  |.endif
  |  je >1
  |  fpatan
  |7:
  |.if X64WIN
  |  fstp qword [rsp+8]        // Use scratch area.
  |  movsd xmm0, qword [rsp+8]
  |.else
  |  fstp qword [rsp-8]        // Use red zone.
  |  movsd xmm0, qword [rsp-8]
  |.endif
  |  ret
  |1: ; fxch; fscale; fpop1; jmp <7
  |2: ; cmp foldop, 11; je >1; ja >9
  |  minsd xmm0, xmm1; ret
  |1: ; maxsd xmm0, xmm1; ret
  |9: ; int3                // Bad op.
  |
  |.elif SSE  // x86 calling convention with SSE ops.
  |
  |  .define foldop, eax
  |  mov foldop, [esp+20]
  |  movsd xmm0, qword [esp+4]
  |  movsd xmm1, qword [esp+12]
  |  cmp foldop, 1; je >1; ja >2
  |  addsd xmm0, xmm1
  |7:
  |  movsd qword [esp+4], xmm0    // Overwrite callee-owned args.
  |  fld qword [esp+4]
  |  ret
  |1: ; subsd xmm0, xmm1; jmp <7
  |2: ; cmp foldop, 3; je >1; ja >2
  |  mulsd xmm0, xmm1; jmp <7
  |1: ; divsd xmm0, xmm1; jmp <7
  |2: ; cmp foldop, 5
  |  je >1; ja >2
  |  call ->vm_mod; jmp <7
  |1: ; pop edx; call ->vm_pow; push edx; jmp <7  // Writes to scratch area.
  |2: ; cmp foldop, 7; je >1; ja >2
  |  sseconst_sign xmm1, RDa; xorps xmm0, xmm1; jmp <7
  |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; jmp <7
  |2: ; cmp foldop, 9; ja >2
  |  fld qword [esp+4]        // Reload from stack
  |  fld qword [esp+12]
  |  je >1
  |  fpatan; ret
  |1: ; fxch; fscale; fpop1; ret
  |2: ; cmp foldop, 11; je >1; ja >9
  |  minsd xmm0, xmm1; jmp <7
  |1: ; maxsd xmm0, xmm1; jmp <7
  |9: ; int3                // Bad op.
  |
  |.else  // x86 calling convention with x87 ops.
  |
  |  mov eax, [esp+20]
  |  fld qword [esp+4]
  |  fld qword [esp+12]
  |  cmp eax, 1; je >1; ja >2
  |  faddp st1; ret
  |1: ; fsubp st1; ret
  |2: ; cmp eax, 3; je >1; ja >2
  |  fmulp st1; ret
  |1: ; fdivp st1; ret
  |2: ; cmp eax, 5; jb ->vm_mod; je ->vm_pow
  |  cmp eax, 7; je >1; ja >2
  |  fpop; fchs; ret
  |1: ; fpop; fabs; ret
  |2: ; cmp eax, 9; je >1; ja >2
  |  fpatan; ret
  |1: ; fxch; fscale; fpop1; ret
  |2: ; cmp eax, 11; je >1; ja >9
  |  fucomi st1; fcmovnbe st1; fpop1; ret
  |1: ; fucomi st1; fcmovbe st1; fpop1; ret
  |9: ; int3                // Bad op.
  |
  |.endif
  |
  |//-----------------------------------------------------------------------
  |//-- Miscellaneous functions --------------------------------------------
  |//-----------------------------------------------------------------------
  |
  |// int lj_vm_cpuid(uint32_t f, uint32_t res[4])
  |->vm_cpuid:
  |.if X64
  |  mov eax, CARG1d
  |  .if X64WIN; push rsi; mov rsi, CARG2; .endif
  |  push rbx
  |  cpuid
  |  mov [rsi], eax
  |  mov [rsi+4], ebx
  |  mov [rsi+8], ecx
  |  mov [rsi+12], edx
  |  pop rbx
  |  .if X64WIN; pop rsi; .endif
  |  ret
  |.else
  |  pushfd
  |  pop edx
  |  mov ecx, edx
  |  xor edx, 0x00200000        // Toggle ID bit in flags.
  |  push edx
  |  popfd
  |  pushfd
  |  pop edx
  |  xor eax, eax            // Zero means no features supported.
  |  cmp ecx, edx
  |  jz >1                // No ID toggle means no CPUID support.
  |  mov eax, [esp+4]            // Argument 1 is function number.
  |  push edi
  |  push ebx
  |  cpuid
  |  mov edi, [esp+16]            // Argument 2 is result area.
  |  mov [edi], eax
  |  mov [edi+4], ebx
  |  mov [edi+8], ecx
  |  mov [edi+12], edx
  |  pop ebx
  |  pop edi
  |1:
  |  ret
  |.endif
  |
  |//-----------------------------------------------------------------------
  |//-- Assertions ---------------------------------------------------------
  |//-----------------------------------------------------------------------
  |
  |->assert_bad_for_arg_type:
#ifdef LUA_USE_ASSERT
  |  int3
#endif
  |  int3
  |
  |//-----------------------------------------------------------------------
  |//-- FFI helper functions -----------------------------------------------
  |//-----------------------------------------------------------------------
  |
  |// Handler for callback functions. Callback slot number in ah/al.
  |->vm_ffi_callback:
  |.if FFI
  |.type CTSTATE, CTState, PC
  |.if not X64
  |  sub esp, 16            // Leave room for SAVE_ERRF etc.
  |.endif
  |  saveregs_    // ebp/rbp already saved. ebp now holds global_State *.
  |  lea DISPATCH, [ebp+GG_G2DISP]
  |  mov CTSTATE, GL:ebp->ctype_state
  |  movzx eax, ax
  |  mov CTSTATE->cb.slot, eax
  |.if X64
  |  mov CTSTATE->cb.gpr[0], CARG1
  |  mov CTSTATE->cb.gpr[1], CARG2
  |  mov CTSTATE->cb.gpr[2], CARG3
  |  mov CTSTATE->cb.gpr[3], CARG4
  |  movsd qword CTSTATE->cb.fpr[0], xmm0
  |  movsd qword CTSTATE->cb.fpr[1], xmm1
  |  movsd qword CTSTATE->cb.fpr[2], xmm2
  |  movsd qword CTSTATE->cb.fpr[3], xmm3
  |.if X64WIN
  |  lea rax, [rsp+CFRAME_SIZE+4*8]
  |.else
  |  lea rax, [rsp+CFRAME_SIZE]
  |  mov CTSTATE->cb.gpr[4], CARG5
  |  mov CTSTATE->cb.gpr[5], CARG6
  |  movsd qword CTSTATE->cb.fpr[4], xmm4
  |  movsd qword CTSTATE->cb.fpr[5], xmm5
  |  movsd qword CTSTATE->cb.fpr[6], xmm6
  |  movsd qword CTSTATE->cb.fpr[7], xmm7
  |.endif
  |  mov CTSTATE->cb.stack, rax
  |  mov CARG2, rsp
  |.else
  |  lea eax, [esp+CFRAME_SIZE+16]
  |  mov CTSTATE->cb.gpr[0], FCARG1
  |  mov CTSTATE->cb.gpr[1], FCARG2
  |  mov CTSTATE->cb.stack, eax
  |  mov FCARG1, [esp+CFRAME_SIZE+12]    // Move around misplaced retaddr/ebp.
  |  mov FCARG2, [esp+CFRAME_SIZE+8]
  |  mov SAVE_RET, FCARG1
  |  mov SAVE_R4, FCARG2
  |  mov FCARG2, esp
  |.endif
  |  mov SAVE_PC, CTSTATE        // Any value outside of bytecode is ok.
  |  mov FCARG1, CTSTATE
  |  call extern lj_ccallback_enter@8    // (CTState *cts, void *cf)
  |  // lua_State * returned in eax (RD).
  |  set_vmstate INTERP
  |  mov BASE, L:RD->base
  |  mov RD, L:RD->top
  |  sub RD, BASE
  |  mov LFUNC:RB, [BASE-8]
  |  shr RD, 3
  |  add RD, 1
  |  ins_callt
  |.endif
  |
  |->cont_ffi_callback:            // Return from FFI callback.
  |.if FFI
  |  mov L:RA, SAVE_L
  |  mov CTSTATE, [DISPATCH+DISPATCH_GL(ctype_state)]
  |  mov aword CTSTATE->L, L:RAa
  |  mov L:RA->base, BASE
  |  mov L:RA->top, RB
  |  mov FCARG1, CTSTATE
  |  mov FCARG2, RC
  |  call extern lj_ccallback_leave@8    // (CTState *cts, TValue *o)
  |.if X64
  |  mov rax, CTSTATE->cb.gpr[0]
  |  movsd xmm0, qword CTSTATE->cb.fpr[0]
  |  jmp ->vm_leave_unw
  |.else
  |  mov L:RB, SAVE_L
  |  mov eax, CTSTATE->cb.gpr[0]
  |  mov edx, CTSTATE->cb.gpr[1]
  |  cmp dword CTSTATE->cb.gpr[2], 1
  |  jb >7
  |  je >6
  |  fld qword CTSTATE->cb.fpr[0].d
  |  jmp >7
  |6:
  |  fld dword CTSTATE->cb.fpr[0].f
  |7:
  |  mov ecx, L:RB->top
  |  movzx ecx, word [ecx+6]        // Get stack adjustment and copy up.
  |  mov SAVE_L, ecx            // Must be one slot above SAVE_RET
  |  restoreregs
  |  pop ecx                // Move return addr from SAVE_RET.
  |  add esp, [esp]            // Adjust stack.
  |  add esp, 16
  |  push ecx
  |  ret
  |.endif
  |.endif
  |
  |->vm_ffi_call@4:            // Call C function via FFI.
  |  // Caveat: needs special frame unwinding, see below.
  |.if FFI
  |.if X64
  |  .type CCSTATE, CCallState, rbx
  |  push rbp; mov rbp, rsp; push rbx; mov CCSTATE, CARG1
  |.else
  |  .type CCSTATE, CCallState, ebx
  |  push ebp; mov ebp, esp; push ebx; mov CCSTATE, FCARG1
  |.endif
  |
  |  // Readjust stack.
  |.if X64
  |  mov eax, CCSTATE->spadj
  |  sub rsp, rax
  |.else
  |  sub esp, CCSTATE->spadj
  |.if WIN
  |  mov CCSTATE->spadj, esp
  |.endif
  |.endif
  |
  |  // Copy stack slots.
  |  movzx ecx, byte CCSTATE->nsp
  |  sub ecx, 1
  |  js >2
  |1:
  |.if X64
  |  mov rax, [CCSTATE+rcx*8+offsetof(CCallState, stack)]
  |  mov [rsp+rcx*8+CCALL_SPS_EXTRA*8], rax
  |.else
  |  mov eax, [CCSTATE+ecx*4+offsetof(CCallState, stack)]
  |  mov [esp+ecx*4], eax
  |.endif
  |  sub ecx, 1
  |  jns <1
  |2:
  |
  |.if X64
  |  movzx eax, byte CCSTATE->nfpr
  |  mov CARG1, CCSTATE->gpr[0]
  |  mov CARG2, CCSTATE->gpr[1]
  |  mov CARG3, CCSTATE->gpr[2]
  |  mov CARG4, CCSTATE->gpr[3]
  |.if not X64WIN
  |  mov CARG5, CCSTATE->gpr[4]
  |  mov CARG6, CCSTATE->gpr[5]
  |.endif
  |  test eax, eax; jz >5
  |  movaps xmm0, CCSTATE->fpr[0]
  |  movaps xmm1, CCSTATE->fpr[1]
  |  movaps xmm2, CCSTATE->fpr[2]
  |  movaps xmm3, CCSTATE->fpr[3]
  |.if not X64WIN
  |  cmp eax, 4; jbe >5
  |  movaps xmm4, CCSTATE->fpr[4]
  |  movaps xmm5, CCSTATE->fpr[5]
  |  movaps xmm6, CCSTATE->fpr[6]
  |  movaps xmm7, CCSTATE->fpr[7]
  |.endif
  |5:
  |.else
  |  mov FCARG1, CCSTATE->gpr[0]
  |  mov FCARG2, CCSTATE->gpr[1]
  |.endif
  |
  |  call aword CCSTATE->func
  |
  |.if X64
  |  mov CCSTATE->gpr[0], rax
  |  movaps CCSTATE->fpr[0], xmm0
  |.if not X64WIN
  |  mov CCSTATE->gpr[1], rdx
  |  movaps CCSTATE->fpr[1], xmm1
  |.endif
  |.else
  |  mov CCSTATE->gpr[0], eax
  |  mov CCSTATE->gpr[1], edx
  |  cmp byte CCSTATE->resx87, 1
  |  jb >7
  |  je >6
  |  fstp qword CCSTATE->fpr[0].d[0]
  |  jmp >7
  |6:
  |  fstp dword CCSTATE->fpr[0].f[0]
  |7:
  |.if WIN
  |  sub CCSTATE->spadj, esp
  |.endif
  |.endif
  |
  |.if X64
  |  mov rbx, [rbp-8]; leave; ret
  |.else
  |  mov ebx, [ebp-4]; leave; ret
  |.endif
  |.endif
  |// Note: vm_ffi_call must be the last function in this object file!
  |
  |//-----------------------------------------------------------------------
}

/* Generate the code for a single instruction. */
static void build_ins(BuildCtx *ctx, BCOp op, int defop)
{
  int vk = 0;
  |// Note: aligning all instructions does not pay off.
  |=>defop:

  switch (op) {

  /* -- Comparison ops ---------------------------------------------------- */

  /* Remember: all ops branch for a true comparison, fall through otherwise. */

  |.macro jmp_comp, lt, ge, le, gt, target
  ||switch (op) {
  ||case BC_ISLT:
  |   lt target
  ||break;
  ||case BC_ISGE:
  |   ge target
  ||break;
  ||case BC_ISLE:
  |   le target
  ||break;
  ||case BC_ISGT:
  |   gt target
  ||break;
  ||default: break;  /* Shut up GCC. */
  ||}
  |.endmacro

  case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT:
    |  // RA = src1, RD = src2, JMP with RD = target
    |  ins_AD
    |.if DUALNUM
    |  checkint RA, >7
    |  checkint RD, >8
    |  mov RB, dword [BASE+RA*8]
    |  add PC, 4
    |  cmp RB, dword [BASE+RD*8]
    |  jmp_comp jge, jl, jg, jle, >9
    |6:
    |  movzx RD, PC_RD
    |  branchPC RD
    |9:
    |  ins_next
    |
    |7:  // RA is not an integer.
    |  ja ->vmeta_comp
    |  // RA is a number.
    |  cmp dword [BASE+RD*8+4], LJ_TISNUM; jb >1; jne ->vmeta_comp
    |  // RA is a number, RD is an integer.
    |.if SSE
    |  cvtsi2sd xmm0, dword [BASE+RD*8]
    |  jmp >2
    |.else
    |  fld qword [BASE+RA*8]
    |  fild dword [BASE+RD*8]
    |  jmp >3
    |.endif
    |
    |8:  // RA is an integer, RD is not an integer.
    |  ja ->vmeta_comp
    |  // RA is an integer, RD is a number.
    |.if SSE
    |  cvtsi2sd xmm1, dword [BASE+RA*8]
    |  movsd xmm0, qword [BASE+RD*8]
    |  add PC, 4
    |  ucomisd xmm0, xmm1
    |  jmp_comp jbe, ja, jb, jae, <9
    |  jmp <6
    |.else
    |  fild dword [BASE+RA*8]
    |  jmp >2
    |.endif
    |.else
    |  checknum RA, ->vmeta_comp
    |  checknum RD, ->vmeta_comp
    |.endif
    |.if SSE
    |1:
    |  movsd xmm0, qword [BASE+RD*8]
    |2:
    |  add PC, 4
    |  ucomisd xmm0, qword [BASE+RA*8]
    |3:
    |.else
    |1:
    |  fld qword [BASE+RA*8]        // Reverse order, i.e like cmp D, A.
    |2:
    |  fld qword [BASE+RD*8]
    |3:
    |  add PC, 4
    |  fcomparepp
    |.endif
    |  // Unordered: all of ZF CF PF set, ordered: PF clear.
    |  // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't.
    |.if DUALNUM
    |  jmp_comp jbe, ja, jb, jae, <9
    |  jmp <6
    |.else
    |  jmp_comp jbe, ja, jb, jae, >1
    |  movzx RD, PC_RD
    |  branchPC RD
    |1:
    |  ins_next
    |.endif
    break;

  case BC_ISEQV: case BC_ISNEV:
    vk = op == BC_ISEQV;
    |  ins_AD    // RA = src1, RD = src2, JMP with RD = target
    |  mov RB, [BASE+RD*8+4]
    |  add PC, 4
    |.if DUALNUM
    |  cmp RB, LJ_TISNUM; jne >7
    |  checkint RA, >8
    |  mov RB, dword [BASE+RD*8]
    |  cmp RB, dword [BASE+RA*8]
    if (vk) {
      |  jne >9
    } else {
      |  je >9
    }
    |  movzx RD, PC_RD
    |  branchPC RD
    |9:
    |  ins_next
    |
    |7:  // RD is not an integer.
    |  ja >5
    |  // RD is a number.
    |  cmp dword [BASE+RA*8+4], LJ_TISNUM; jb >1; jne >5
    |  // RD is a number, RA is an integer.
    |.if SSE
    |  cvtsi2sd xmm0, dword [BASE+RA*8]
    |.else
    |  fild dword [BASE+RA*8]
    |.endif
    |  jmp >2
    |
    |8:  // RD is an integer, RA is not an integer.
    |  ja >5
    |  // RD is an integer, RA is a number.
    |.if SSE
    |  cvtsi2sd xmm0, dword [BASE+RD*8]
    |  ucomisd xmm0, qword [BASE+RA*8]
    |.else
    |  fild dword [BASE+RD*8]
    |  fld qword [BASE+RA*8]
    |.endif
    |  jmp >4
    |
    |.else
    |  cmp RB, LJ_TISNUM; jae >5
    |  checknum RA, >5
    |.endif
    |.if SSE
    |1:
    |  movsd xmm0, qword [BASE+RA*8]
    |2:
    |  ucomisd xmm0, qword [BASE+RD*8]
    |4:
    |.else
    |1:
    |  fld qword [BASE+RA*8]
    |2:
    |  fld qword [BASE+RD*8]
    |4:
    |  fcomparepp
    |.endif
  iseqne_fp:
    if (vk) {
      |  jp >2                // Unordered means not equal.
      |  jne >2
    } else {
      |  jp >2                // Unordered means not equal.
      |  je >1
    }
  iseqne_end:
    if (vk) {
      |1:                // EQ: Branch to the target.
      |  movzx RD, PC_RD
      |  branchPC RD
      |2:                // NE: Fallthrough to next instruction.
      |.if not FFI
      |3:
      |.endif
    } else {
      |.if not FFI
      |3:
      |.endif
      |2:                // NE: Branch to the target.
      |  movzx RD, PC_RD
      |  branchPC RD
      |1:                // EQ: Fallthrough to next instruction.
    }
    if (LJ_DUALNUM && (op == BC_ISEQV || op == BC_ISNEV ||
               op == BC_ISEQN || op == BC_ISNEN)) {
      |  jmp <9
    } else {
      |  ins_next
    }
    |
    if (op == BC_ISEQV || op == BC_ISNEV) {
      |5:  // Either or both types are not numbers.
      |.if FFI
      |  cmp RB, LJ_TCDATA; je ->vmeta_equal_cd
      |  checktp RA, LJ_TCDATA; je ->vmeta_equal_cd
      |.endif
      |  checktp RA, RB            // Compare types.
      |  jne <2                // Not the same type?
      |  cmp RB, LJ_TISPRI
      |  jae <1                // Same type and primitive type?
      |
      |  // Same types and not a primitive type. Compare GCobj or pvalue.
      |  mov RA, [BASE+RA*8]
      |  mov RD, [BASE+RD*8]
      |  cmp RA, RD
      |  je <1                // Same GCobjs or pvalues?
      |  cmp RB, LJ_TISTABUD
      |  ja <2                // Different objects and not table/ud?
      |.if X64
      |  cmp RB, LJ_TUDATA        // And not 64 bit lightuserdata.
      |  jb <2
      |.endif
      |
      |  // Different tables or userdatas. Need to check __eq metamethod.
      |  // Field metatable must be at same offset for GCtab and GCudata!
      |  mov TAB:RB, TAB:RA->metatable
      |  test TAB:RB, TAB:RB
      |  jz <2                // No metatable?
      |  test byte TAB:RB->nomm, 1<<MM_eq
      |  jnz <2                // Or 'no __eq' flag set?
      if (vk) {
    |  xor RB, RB            // ne = 0
      } else {
    |  mov RB, 1            // ne = 1
      }
      |  jmp ->vmeta_equal        // Handle __eq metamethod.
    } else {
      |.if FFI
      |3:
      |  cmp RB, LJ_TCDATA
      if (LJ_DUALNUM && vk) {
    |  jne <9
      } else {
    |  jne <2
      }
      |  jmp ->vmeta_equal_cd
      |.endif
    }
    break;
  case BC_ISEQS: case BC_ISNES:
    vk = op == BC_ISEQS;
    |  ins_AND    // RA = src, RD = str const, JMP with RD = target
    |  mov RB, [BASE+RA*8+4]
    |  add PC, 4
    |  cmp RB, LJ_TSTR; jne >3
    |  mov RA, [BASE+RA*8]
    |  cmp RA, [KBASE+RD*4]
  iseqne_test:
    if (vk) {
      |  jne >2
    } else {
      |  je >1
    }
    goto iseqne_end;
  case BC_ISEQN: case BC_ISNEN:
    vk = op == BC_ISEQN;
    |  ins_AD    // RA = src, RD = num const, JMP with RD = target
    |  mov RB, [BASE+RA*8+4]
    |  add PC, 4
    |.if DUALNUM
    |  cmp RB, LJ_TISNUM; jne >7
    |  cmp dword [KBASE+RD*8+4], LJ_TISNUM; jne >8
    |  mov RB, dword [KBASE+RD*8]
    |  cmp RB, dword [BASE+RA*8]
    if (vk) {
      |  jne >9
    } else {
      |  je >9
    }
    |  movzx RD, PC_RD
    |  branchPC RD
    |9:
    |  ins_next
    |
    |7:  // RA is not an integer.
    |  ja >3
    |  // RA is a number.
    |  cmp dword [KBASE+RD*8+4], LJ_TISNUM; jb >1
    |  // RA is a number, RD is an integer.
    |.if SSE
    |  cvtsi2sd xmm0, dword [KBASE+RD*8]
    |.else
    |  fild dword [KBASE+RD*8]
    |.endif
    |  jmp >2
    |
    |8:  // RA is an integer, RD is a number.
    |.if SSE
    |  cvtsi2sd xmm0, dword [BASE+RA*8]
    |  ucomisd xmm0, qword [KBASE+RD*8]
    |.else
    |  fild dword [BASE+RA*8]
    |  fld qword [KBASE+RD*8]
    |.endif
    |  jmp >4
    |.else
    |  cmp RB, LJ_TISNUM; jae >3
    |.endif
    |.if SSE
    |1:
    |  movsd xmm0, qword [KBASE+RD*8]
    |2:
    |  ucomisd xmm0, qword [BASE+RA*8]
    |4:
    |.else
    |1:
    |  fld qword [KBASE+RD*8]
    |2:
    |  fld qword [BASE+RA*8]
    |4:
    |  fcomparepp
    |.endif
    goto iseqne_fp;
  case BC_ISEQP: case BC_ISNEP:
    vk = op == BC_ISEQP;
    |  ins_AND    // RA = src, RD = primitive type (~), JMP with RD = target
    |  mov RB, [BASE+RA*8+4]
    |  add PC, 4
    |  cmp RB, RD
    if (!LJ_HASFFI) goto iseqne_test;
    if (vk) {
      |  jne >3
      |  movzx RD, PC_RD
      |  branchPC RD
      |2:
      |  ins_next
      |3:
      |  cmp RB, LJ_TCDATA; jne <2
      |  jmp ->vmeta_equal_cd
    } else {
      |  je >2
      |  cmp RB, LJ_TCDATA; je ->vmeta_equal_cd
      |  movzx RD, PC_RD
      |  branchPC RD
      |2:
      |  ins_next
    }
    break;

  /* -- Unary test and copy ops ------------------------------------------- */

  case BC_ISTC: case BC_ISFC: case BC_IST: case BC_ISF:
    |  ins_AD    // RA = dst or unused, RD = src, JMP with RD = target
    |  mov RB, [BASE+RD*8+4]
    |  add PC, 4
    |  cmp RB, LJ_TISTRUECOND
    if (op == BC_IST || op == BC_ISTC) {
      |  jae >1
    } else {
      |  jb >1
    }
    if (op == BC_ISTC || op == BC_ISFC) {
      |  mov [BASE+RA*8+4], RB
      |  mov RB, [BASE+RD*8]
      |  mov [BASE+RA*8], RB
    }
    |  movzx RD, PC_RD
    |  branchPC RD
    |1:                    // Fallthrough to the next instruction.
    |  ins_next
    break;

  /* -- Unary ops --------------------------------------------------------- */

  case BC_MOV:
    |  ins_AD    // RA = dst, RD = src
    |.if X64
    |  mov RBa, [BASE+RD*8]
    |  mov [BASE+RA*8], RBa
    |.else
    |  mov RB, [BASE+RD*8+4]
    |  mov RD, [BASE+RD*8]
    |  mov [BASE+RA*8+4], RB
    |  mov [BASE+RA*8], RD
    |.endif
    |  ins_next_
    break;
  case BC_NOT:
    |  ins_AD    // RA = dst, RD = src
    |  xor RB, RB
    |  checktp RD, LJ_TISTRUECOND
    |  adc RB, LJ_TTRUE
    |  mov [BASE+RA*8+4], RB
    |  ins_next
    break;
  case BC_UNM:
    |  ins_AD    // RA = dst, RD = src
    |.if DUALNUM
    |  checkint RD, >5
    |  mov RB, [BASE+RD*8]
    |  neg RB
    |  jo >4
    |  mov dword [BASE+RA*8+4], LJ_TISNUM
    |  mov dword [BASE+RA*8], RB
    |9:
    |  ins_next
    |4:
    |  mov dword [BASE+RA*8+4], 0x41e00000  // 2^31.
    |  mov dword [BASE+RA*8], 0
    |  jmp <9
    |5:
    |  ja ->vmeta_unm
    |.else
    |  checknum RD, ->vmeta_unm
    |.endif
    |.if SSE
    |  movsd xmm0, qword [BASE+RD*8]
    |  sseconst_sign xmm1, RDa
    |  xorps xmm0, xmm1
    |  movsd qword [BASE+RA*8], xmm0
    |.else
    |  fld qword [BASE+RD*8]
    |  fchs
    |  fstp qword [BASE+RA*8]
    |.endif
    |.if DUALNUM
    |  jmp <9
    |.else
    |  ins_next
    |.endif
    break;
  case BC_LEN:
    |  ins_AD    // RA = dst, RD = src
    |  checkstr RD, >2
    |  mov STR:RD, [BASE+RD*8]
    |.if DUALNUM
    |  mov RD, dword STR:RD->len
    |1:
    |  mov dword [BASE+RA*8+4], LJ_TISNUM
    |  mov dword [BASE+RA*8], RD
    |.elif SSE
    |  xorps xmm0, xmm0
    |  cvtsi2sd xmm0, dword STR:RD->len
    |1:
    |  movsd qword [BASE+RA*8], xmm0
    |.else
    |  fild dword STR:RD->len
    |1:
    |  fstp qword [BASE+RA*8]
    |.endif
    |  ins_next
    |2:
    |  checktab RD, ->vmeta_len
    |  mov TAB:FCARG1, [BASE+RD*8]
#if LJ_52
    |  mov TAB:RB, TAB:FCARG1->metatable
    |  cmp TAB:RB, 0
    |  jnz >9
    |3:
#endif
    |->BC_LEN_Z:
    |  mov RB, BASE            // Save BASE.
    |  call extern lj_tab_len@4        // (GCtab *t)
    |  // Length of table returned in eax (RD).
    |.if DUALNUM
    |  // Nothing to do.
    |.elif SSE
    |  cvtsi2sd xmm0, RD
    |.else
    |  mov ARG1, RD
    |  fild ARG1
    |.endif
    |  mov BASE, RB            // Restore BASE.
    |  movzx RA, PC_RA
    |  jmp <1
#if LJ_52
    |9:  // Check for __len.
    |  test byte TAB:RB->nomm, 1<<MM_len
    |  jnz <3
    |  jmp ->vmeta_len            // 'no __len' flag NOT set: check.
#endif
    break;

  /* -- Binary ops -------------------------------------------------------- */

    |.macro ins_arithpre, x87ins, sseins, ssereg
    |  ins_ABC
    ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
    ||switch (vk) {
    ||case 0:
    |   checknum RB, ->vmeta_arith_vn
    |   .if DUALNUM
    |     cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_vn
    |   .endif
    |   .if SSE
    |     movsd xmm0, qword [BASE+RB*8]
    |     sseins ssereg, qword [KBASE+RC*8]
    |   .else
    |     fld qword [BASE+RB*8]
    |     x87ins qword [KBASE+RC*8]
    |   .endif
    ||  break;
    ||case 1:
    |   checknum RB, ->vmeta_arith_nv
    |   .if DUALNUM
    |     cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_nv
    |   .endif
    |   .if SSE
    |     movsd xmm0, qword [KBASE+RC*8]
    |     sseins ssereg, qword [BASE+RB*8]
    |   .else
    |     fld qword [KBASE+RC*8]
    |     x87ins qword [BASE+RB*8]
    |   .endif
    ||  break;
    ||default:
    |   checknum RB, ->vmeta_arith_vv
    |   checknum RC, ->vmeta_arith_vv
    |   .if SSE
    |     movsd xmm0, qword [BASE+RB*8]
    |     sseins ssereg, qword [BASE+RC*8]
    |   .else
    |     fld qword [BASE+RB*8]
    |     x87ins qword [BASE+RC*8]
    |   .endif
    ||  break;
    ||}
    |.endmacro
    |
    |.macro ins_arithdn, intins
    |  ins_ABC
    ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
    ||switch (vk) {
    ||case 0:
    |   checkint RB, ->vmeta_arith_vn
    |   cmp dword [KBASE+RC*8+4], LJ_TISNUM; jne ->vmeta_arith_vn
    |   mov RB, [BASE+RB*8]
    |   intins RB, [KBASE+RC*8]; jo ->vmeta_arith_vno
    ||  break;
    ||case 1:
    |   checkint RB, ->vmeta_arith_nv
    |   cmp dword [KBASE+RC*8+4], LJ_TISNUM; jne ->vmeta_arith_nv
    |   mov RC, [KBASE+RC*8]
    |   intins RC, [BASE+RB*8]; jo ->vmeta_arith_nvo
    ||  break;
    ||default:
    |   checkint RB, ->vmeta_arith_vv
    |   checkint RC, ->vmeta_arith_vv
    |   mov RB, [BASE+RB*8]
    |   intins RB, [BASE+RC*8]; jo ->vmeta_arith_vvo
    ||  break;
    ||}
    |  mov dword [BASE+RA*8+4], LJ_TISNUM
    ||if (vk == 1) {
    |   mov dword [BASE+RA*8], RC
    ||} else {
    |   mov dword [BASE+RA*8], RB
    ||}
    |  ins_next
    |.endmacro
    |
    |.macro ins_arithpost
    |.if SSE
    |  movsd qword [BASE+RA*8], xmm0
    |.else
    |  fstp qword [BASE+RA*8]
    |.endif
    |.endmacro
    |
    |.macro ins_arith, x87ins, sseins
    |  ins_arithpre x87ins, sseins, xmm0
    |  ins_arithpost
    |  ins_next
    |.endmacro
    |
    |.macro ins_arith, intins, x87ins, sseins
    |.if DUALNUM
    |  ins_arithdn intins
    |.else
    |  ins_arith, x87ins, sseins
    |.endif
    |.endmacro

    |  // RA = dst, RB = src1 or num const, RC = src2 or num const
  case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
    |  ins_arith add, fadd, addsd
    break;
  case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
    |  ins_arith sub, fsub, subsd
    break;
  case BC_MULVN: case BC_MULNV: case BC_MULVV:
    |  ins_arith imul, fmul, mulsd
    break;
  case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
    |  ins_arith fdiv, divsd
    break;
  case BC_MODVN:
    |  ins_arithpre fld, movsd, xmm1
    |->BC_MODVN_Z:
    |  call ->vm_mod
    |  ins_arithpost
    |  ins_next
    break;
  case BC_MODNV: case BC_MODVV:
    |  ins_arithpre fld, movsd, xmm1
    |  jmp ->BC_MODVN_Z            // Avoid 3 copies. It's slow anyway.
    break;
  case BC_POW:
    |  ins_arithpre fld, movsd, xmm1
    |  call ->vm_pow
    |  ins_arithpost
    |  ins_next
    break;

  case BC_CAT:
    |  ins_ABC    // RA = dst, RB = src_start, RC = src_end
    |.if X64
    |  mov L:CARG1d, SAVE_L
    |  mov L:CARG1d->base, BASE
    |  lea CARG2d, [BASE+RC*8]
    |  mov CARG3d, RC
    |  sub CARG3d, RB
    |->BC_CAT_Z:
    |  mov L:RB, L:CARG1d
    |.else
    |  lea RA, [BASE+RC*8]
    |  sub RC, RB
    |  mov ARG2, RA
    |  mov ARG3, RC
    |->BC_CAT_Z:
    |  mov L:RB, SAVE_L
    |  mov ARG1, L:RB
    |  mov L:RB->base, BASE
    |.endif
    |  mov SAVE_PC, PC
    |  call extern lj_meta_cat        // (lua_State *L, TValue *top, int left)
    |  // NULL (finished) or TValue * (metamethod) returned in eax (RC).
    |  mov BASE, L:RB->base
    |  test RC, RC
    |  jnz ->vmeta_binop
    |  movzx RB, PC_RB            // Copy result to Stk[RA] from Stk[RB].
    |  movzx RA, PC_RA
    |.if X64
    |  mov RCa, [BASE+RB*8]
    |  mov [BASE+RA*8], RCa
    |.else
    |  mov RC, [BASE+RB*8+4]
    |  mov RB, [BASE+RB*8]
    |  mov [BASE+RA*8+4], RC
    |  mov [BASE+RA*8], RB
    |.endif
    |  ins_next
    break;

  /* -- Constant ops ------------------------------------------------------ */

  case BC_KSTR:
    |  ins_AND    // RA = dst, RD = str const (~)
    |  mov RD, [KBASE+RD*4]
    |  mov dword [BASE+RA*8+4], LJ_TSTR
    |  mov [BASE+RA*8], RD
    |  ins_next
    break;
  case BC_KCDATA:
    |.if FFI
    |  ins_AND    // RA = dst, RD = cdata const (~)
    |  mov RD, [KBASE+RD*4]
    |  mov dword [BASE+RA*8+4], LJ_TCDATA
    |  mov [BASE+RA*8], RD
    |  ins_next
    |.endif
    break;
  case BC_KSHORT:
    |  ins_AD    // RA = dst, RD = signed int16 literal
    |.if DUALNUM
    |  movsx RD, RDW
    |  mov dword [BASE+RA*8+4], LJ_TISNUM
    |  mov dword [BASE+RA*8], RD
    |.elif SSE
    |  movsx RD, RDW            // Sign-extend literal.
    |  cvtsi2sd xmm0, RD
    |  movsd qword [BASE+RA*8], xmm0
    |.else
    |  fild PC_RD            // Refetch signed RD from instruction.
    |  fstp qword [BASE+RA*8]
    |.endif
    |  ins_next
    break;
  case BC_KNUM:
    |  ins_AD    // RA = dst, RD = num const
    |.if SSE
    |  movsd xmm0, qword [KBASE+RD*8]
    |  movsd qword [BASE+RA*8], xmm0
    |.else
    |  fld qword [KBASE+RD*8]
    |  fstp qword [BASE+RA*8]
    |.endif
    |  ins_next
    break;
  case BC_KPRI:
    |  ins_AND    // RA = dst, RD = primitive type (~)
    |  mov [BASE+RA*8+4], RD
    |  ins_next
    break;
  case BC_KNIL:
    |  ins_AD    // RA = dst_start, RD = dst_end
    |  lea RA, [BASE+RA*8+12]
    |  lea RD, [BASE+RD*8+4]
    |  mov RB, LJ_TNIL
    |  mov [RA-8], RB            // Sets minimum 2 slots.
    |1:
    |  mov [RA], RB
    |  add RA, 8
    |  cmp RA, RD
    |  jbe <1
    |  ins_next
    break;

  /* -- Upvalue and function ops ------------------------------------------ */

  case BC_UGET:
    |  ins_AD    // RA = dst, RD = upvalue #
    |  mov LFUNC:RB, [BASE-8]
    |  mov UPVAL:RB, [LFUNC:RB+RD*4+offsetof(GCfuncL, uvptr)]
    |  mov RB, UPVAL:RB->v
    |.if X64
    |  mov RDa, [RB]
    |  mov [BASE+RA*8], RDa
    |.else
    |  mov RD, [RB+4]
    |  mov RB, [RB]
    |  mov [BASE+RA*8+4], RD
    |  mov [BASE+RA*8], RB
    |.endif
    |  ins_next
    break;
  case BC_USETV:
#define TV2MARKOFS \
 ((int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv))
    |  ins_AD    // RA = upvalue #, RD = src
    |  mov LFUNC:RB, [BASE-8]
    |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
    |  cmp byte UPVAL:RB->closed, 0
    |  mov RB, UPVAL:RB->v
    |  mov RA, [BASE+RD*8]
    |  mov RD, [BASE+RD*8+4]
    |  mov [RB], RA
    |  mov [RB+4], RD
    |  jz >1
    |  // Check barrier for closed upvalue.
    |  test byte [RB+TV2MARKOFS], LJ_GC_BLACK        // isblack(uv)
    |  jnz >2
    |1:
    |  ins_next
    |
    |2:  // Upvalue is black. Check if new value is collectable and white.
    |  sub RD, LJ_TISGCV
    |  cmp RD, LJ_TISNUM - LJ_TISGCV            // tvisgcv(v)
    |  jbe <1
    |  test byte GCOBJ:RA->gch.marked, LJ_GC_WHITES    // iswhite(v)
    |  jz <1
    |  // Crossed a write barrier. Move the barrier forward.
    |.if X64 and not X64WIN
    |  mov FCARG2, RB
    |  mov RB, BASE            // Save BASE.
    |.else
    |  xchg FCARG2, RB            // Save BASE (FCARG2 == BASE).
    |.endif
    |  lea GL:FCARG1, [DISPATCH+GG_DISP2G]
    |  call extern lj_gc_barrieruv@8    // (global_State *g, TValue *tv)
    |  mov BASE, RB            // Restore BASE.
    |  jmp <1
    break;
#undef TV2MARKOFS
  case BC_USETS:
    |  ins_AND    // RA = upvalue #, RD = str const (~)
    |  mov LFUNC:RB, [BASE-8]
    |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
    |  mov GCOBJ:RA, [KBASE+RD*4]
    |  mov RD, UPVAL:RB->v
    |  mov [RD], GCOBJ:RA
    |  mov dword [RD+4], LJ_TSTR
    |  test byte UPVAL:RB->marked, LJ_GC_BLACK        // isblack(uv)
    |  jnz >2
    |1:
    |  ins_next
    |
    |2:  // Check if string is white and ensure upvalue is closed.
    |  test byte GCOBJ:RA->gch.marked, LJ_GC_WHITES    // iswhite(str)
    |  jz <1
    |  cmp byte UPVAL:RB->closed, 0
    |  jz <1
    |  // Crossed a write barrier. Move the barrier forward.
    |  mov RB, BASE            // Save BASE (FCARG2 == BASE).
    |  mov FCARG2, RD
    |  lea GL:FCARG1, [DISPATCH+GG_DISP2G]
    |  call extern lj_gc_barrieruv@8    // (global_State *g, TValue *tv)
    |  mov BASE, RB            // Restore BASE.
    |  jmp <1
    break;
  case BC_USETN:
    |  ins_AD    // RA = upvalue #, RD = num const
    |  mov LFUNC:RB, [BASE-8]
    |.if SSE
    |  movsd xmm0, qword [KBASE+RD*8]
    |.else
    |  fld qword [KBASE+RD*8]
    |.endif
    |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
    |  mov RA, UPVAL:RB->v
    |.if SSE
    |  movsd qword [RA], xmm0
    |.else
    |  fstp qword [RA]
    |.endif
    |  ins_next
    break;
  case BC_USETP:
    |  ins_AND    // RA = upvalue #, RD = primitive type (~)
    |  mov LFUNC:RB, [BASE-8]
    |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
    |  mov RA, UPVAL:RB->v
    |  mov [RA+4], RD
    |  ins_next
    break;
  case BC_UCLO:
    |  ins_AD    // RA = level, RD = target
    |  branchPC RD            // Do this first to free RD.
    |  mov L:RB, SAVE_L
    |  cmp dword L:RB->openupval, 0
    |  je >1
    |  mov L:RB->base, BASE
    |  lea FCARG2, [BASE+RA*8]        // Caveat: FCARG2 == BASE
    |  mov L:FCARG1, L:RB        // Caveat: FCARG1 == RA
    |  call extern lj_func_closeuv@8    // (lua_State *L, TValue *level)
    |  mov BASE, L:RB->base
    |1:
    |  ins_next
    break;

  case BC_FNEW:
    |  ins_AND    // RA = dst, RD = proto const (~) (holding function prototype)
    |.if X64
    |  mov L:RB, SAVE_L
    |  mov L:RB->base, BASE        // Caveat: CARG2d/CARG3d may be BASE.
    |  mov CARG3d, [BASE-8]
    |  mov CARG2d, [KBASE+RD*4]        // Fetch GCproto *.
    |  mov CARG1d, L:RB
    |.else
    |  mov LFUNC:RA, [BASE-8]
    |  mov PROTO:RD, [KBASE+RD*4]    // Fetch GCproto *.
    |  mov L:RB, SAVE_L
    |  mov ARG3, LFUNC:RA
    |  mov ARG2, PROTO:RD
    |  mov ARG1, L:RB
    |  mov L:RB->base, BASE
    |.endif
    |  mov SAVE_PC, PC
    |  // (lua_State *L, GCproto *pt, GCfuncL *parent)
    |  call extern lj_func_newL_gc
    |  // GCfuncL * returned in eax (RC).
    |  mov BASE, L:RB->base
    |  movzx RA, PC_RA
    |  mov [BASE+RA*8], LFUNC:RC
    |  mov dword [BASE+RA*8+4], LJ_TFUNC
    |  ins_next
    break;

  /* -- Table ops --------------------------------------------------------- */

  case BC_TNEW:
    |  ins_AD    // RA = dst, RD = hbits|asize
    |  mov L:RB, SAVE_L
    |  mov L:RB->base, BASE
    |  mov RA, [DISPATCH+DISPATCH_GL(gc.total)]
    |  cmp RA, [DISPATCH+DISPATCH_GL(gc.threshold)]
    |  mov SAVE_PC, PC
    |  jae >5
    |1:
    |.if X64
    |  mov CARG3d, RD
    |  and RD, 0x7ff
    |  shr CARG3d, 11
    |.else
    |  mov RA, RD
    |  and RD, 0x7ff
    |  shr RA, 11
    |  mov ARG3, RA
    |.endif
    |  cmp RD, 0x7ff
    |  je >3
    |2:
    |.if X64
    |  mov L:CARG1d, L:RB
    |  mov CARG2d, RD
    |.else
    |  mov ARG1, L:RB
    |  mov ARG2, RD
    |.endif
    |  call extern lj_tab_new  // (lua_State *L, int32_t asize, uint32_t hbits)
    |  // Table * returned in eax (RC).
    |  mov BASE, L:RB->base
    |  movzx RA, PC_RA
    |  mov [BASE+RA*8], TAB:RC
    |  mov dword [BASE+RA*8+4], LJ_TTAB
    |  ins_next
    |3:  // Turn 0x7ff into 0x801.
    |  mov RD, 0x801
    |  jmp <2
    |5:
    |  mov L:FCARG1, L:RB
    |  call extern lj_gc_step_fixtop@4    // (lua_State *L)
    |  movzx RD, PC_RD
    |  jmp <1
    break;
  case BC_TDUP:
    |  ins_AND    // RA = dst, RD = table const (~) (holding template table)
    |  mov L:RB, SAVE_L
    |  mov RA, [DISPATCH+DISPATCH_GL(gc.total)]
    |  mov SAVE_PC, PC
    |  cmp RA, [DISPATCH+DISPATCH_GL(gc.threshold)]
    |  mov L:RB->base, BASE
    |  jae >3
    |2:
    |  mov TAB:FCARG2, [KBASE+RD*4]    // Caveat: FCARG2 == BASE
    |  mov L:FCARG1, L:RB        // Caveat: FCARG1 == RA
    |  call extern lj_tab_dup@8        // (lua_State *L, Table *kt)
    |  // Table * returned in eax (RC).
    |  mov BASE, L:RB->base
    |  movzx RA, PC_RA
    |  mov [BASE+RA*8], TAB:RC
    |  mov dword [BASE+RA*8+4], LJ_TTAB
    |  ins_next
    |3:
    |  mov L:FCARG1, L:RB
    |  call extern lj_gc_step_fixtop@4    // (lua_State *L)
    |  movzx RD, PC_RD            // Need to reload RD.
    |  not RDa
    |  jmp <2
    break;

  case BC_GGET:
    |  ins_AND    // RA = dst, RD = str const (~)
    |  mov LFUNC:RB, [BASE-8]
    |  mov TAB:RB, LFUNC:RB->env
    |  mov STR:RC, [KBASE+RD*4]
    |  jmp ->BC_TGETS_Z
    break;
  case BC_GSET:
    |  ins_AND    // RA = src, RD = str const (~)
    |  mov LFUNC:RB, [BASE-8]
    |  mov TAB:RB, LFUNC:RB->env
    |  mov STR:RC, [KBASE+RD*4]
    |  jmp ->BC_TSETS_Z
    break;

  case BC_TGETV:
    |  ins_ABC    // RA = dst, RB = table, RC = key
    |  checktab RB, ->vmeta_tgetv
    |  mov TAB:RB, [BASE+RB*8]
    |
    |  // Integer key?
    |.if DUALNUM
    |  checkint RC, >5
    |  mov RC, dword [BASE+RC*8]
    |.else
    |  // Convert number to int and back and compare.
    |  checknum RC, >5
    |.if SSE
    |  movsd xmm0, qword [BASE+RC*8]
    |  cvtsd2si RC, xmm0
    |  cvtsi2sd xmm1, RC
    |  ucomisd xmm0, xmm1
    |.else
    |  fld qword [BASE+RC*8]
    |  fist ARG1
    |  fild ARG1
    |  fcomparepp
    |  mov RC, ARG1
    |.endif
    |  jne ->vmeta_tgetv        // Generic numeric key? Use fallback.
    |.endif
    |  cmp RC, TAB:RB->asize    // Takes care of unordered, too.
    |  jae ->vmeta_tgetv        // Not in array part? Use fallback.
    |  shl RC, 3
    |  add RC, TAB:RB->array
    |  cmp dword [RC+4], LJ_TNIL    // Avoid overwriting RB in fastpath.
    |  je >2
    |  // Get array slot.
    |.if X64
    |  mov RBa, [RC]
    |  mov [BASE+RA*8], RBa
    |.else
    |  mov RB, [RC]
    |  mov RC, [RC+4]
    |  mov [BASE+RA*8], RB
    |  mov [BASE+RA*8+4], RC
    |.endif
    |1:
    |  ins_next
    |
    |2:  // Check for __index if table value is nil.
    |  cmp dword TAB:RB->metatable, 0    // Shouldn't overwrite RA for fastpath.
    |  jz >3
    |  mov TAB:RA, TAB:RB->metatable
    |  test byte TAB:RA->nomm, 1<<MM_index
    |  jz ->vmeta_tgetv            // 'no __index' flag NOT set: check.
    |  movzx RA, PC_RA            // Restore RA.
    |3:
    |  mov dword [BASE+RA*8+4], LJ_TNIL
    |  jmp <1
    |
    |5:  // String key?
    |  checkstr RC, ->vmeta_tgetv
    |  mov STR:RC, [BASE+RC*8]
    |  jmp ->BC_TGETS_Z
    break;
  case BC_TGETS:
    |  ins_ABC    // RA = dst, RB = table, RC = str const (~)
    |  not RCa
    |  mov STR:RC, [KBASE+RC*4]
    |  checktab RB, ->vmeta_tgets
    |  mov TAB:RB, [BASE+RB*8]
    |->BC_TGETS_Z:    // RB = GCtab *, RC = GCstr *, refetches PC_RA.
    |  mov RA, TAB:RB->hmask
    |  and RA, STR:RC->hash
    |  imul RA, #NODE
    |  add NODE:RA, TAB:RB->node
    |1:
    |  cmp dword NODE:RA->key.it, LJ_TSTR
    |  jne >4
    |  cmp dword NODE:RA->key.gcr, STR:RC
    |  jne >4
    |  // Ok, key found. Assumes: offsetof(Node, val) == 0
    |  cmp dword [RA+4], LJ_TNIL    // Avoid overwriting RB in fastpath.
    |  je >5                // Key found, but nil value?
    |  movzx RC, PC_RA
    |  // Get node value.
    |.if X64
    |  mov RBa, [RA]
    |  mov [BASE+RC*8], RBa
    |.else
    |  mov RB, [RA]
    |  mov RA, [RA+4]
    |  mov [BASE+RC*8], RB
    |  mov [BASE+RC*8+4], RA
    |.endif
    |2:
    |  ins_next
    |
    |3:
    |  movzx RC, PC_RA
    |  mov dword [BASE+RC*8+4], LJ_TNIL
    |  jmp <2
    |
    |4:  // Follow hash chain.
    |  mov NODE:RA, NODE:RA->next
    |  test NODE:RA, NODE:RA
    |  jnz <1
    |  // End of hash chain: key not found, nil result.
    |
    |5:  // Check for __index if table value is nil.
    |  mov TAB:RA, TAB:RB->metatable
    |  test TAB:RA, TAB:RA
    |  jz <3                // No metatable: done.
    |  test byte TAB:RA->nomm, 1<<MM_index
    |  jnz <3                // 'no __index' flag set: done.
    |  jmp ->vmeta_tgets        // Caveat: preserve STR:RC.
    break;
  case BC_TGETB:
    |  ins_ABC    // RA = dst, RB = table, RC = byte literal
    |  checktab RB, ->vmeta_tgetb
    |  mov TAB:RB, [BASE+RB*8]
    |  cmp RC, TAB:RB->asize
    |  jae ->vmeta_tgetb
    |  shl RC, 3
    |  add RC, TAB:RB->array
    |  cmp dword [RC+4], LJ_TNIL    // Avoid overwriting RB in fastpath.
    |  je >2
    |  // Get array slot.
    |.if X64
    |  mov RBa, [RC]
    |  mov [BASE+RA*8], RBa
    |.else
    |  mov RB, [RC]
    |  mov RC, [RC+4]
    |  mov [BASE+RA*8], RB
    |  mov [BASE+RA*8+4], RC
    |.endif
    |1:
    |  ins_next
    |
    |2:  // Check for __index if table value is nil.
    |  cmp dword TAB:RB->metatable, 0    // Shouldn't overwrite RA for fastpath.
    |  jz >3
    |  mov TAB:RA, TAB:RB->metatable
    |  test byte TAB:RA->nomm, 1<<MM_index
    |  jz ->vmeta_tgetb            // 'no __index' flag NOT set: check.
    |  movzx RA, PC_RA            // Restore RA.
    |3:
    |  mov dword [BASE+RA*8+4], LJ_TNIL
    |  jmp <1
    break;

  case BC_TSETV:
    |  ins_ABC    // RA = src, RB = table, RC = key
    |  checktab RB, ->vmeta_tsetv
    |  mov TAB:RB, [BASE+RB*8]
    |
    |  // Integer key?
    |.if DUALNUM
    |  checkint RC, >5
    |  mov RC, dword [BASE+RC*8]
    |.else
    |  // Convert number to int and back and compare.
    |  checknum RC, >5
    |.if SSE
    |  movsd xmm0, qword [BASE+RC*8]
    |  cvtsd2si RC, xmm0
    |  cvtsi2sd xmm1, RC
    |  ucomisd xmm0, xmm1
    |.else
    |  fld qword [BASE+RC*8]
    |  fist ARG1
    |  fild ARG1
    |  fcomparepp
    |  mov RC, ARG1
    |.endif
    |  jne ->vmeta_tsetv        // Generic numeric key? Use fallback.
    |.endif
    |  cmp RC, TAB:RB->asize        // Takes care of unordered, too.
    |  jae ->vmeta_tsetv
    |  shl RC, 3
    |  add RC, TAB:RB->array
    |  cmp dword [RC+4], LJ_TNIL
    |  je >3                // Previous value is nil?
    |1:
    |  test byte TAB:RB->marked, LJ_GC_BLACK    // isblack(table)
    |  jnz >7
    |2:  // Set array slot.
    |.if X64
    |  mov RBa, [BASE+RA*8]
    |  mov [RC], RBa
    |.else
    |  mov RB, [BASE+RA*8+4]
    |  mov RA, [BASE+RA*8]
    |  mov [RC+4], RB
    |  mov [RC], RA
    |.endif
    |  ins_next
    |
    |3:  // Check for __newindex if previous value is nil.
    |  cmp dword TAB:RB->metatable, 0    // Shouldn't overwrite RA for fastpath.
    |  jz <1
    |  mov TAB:RA, TAB:RB->metatable
    |  test byte TAB:RA->nomm, 1<<MM_newindex
    |  jz ->vmeta_tsetv            // 'no __newindex' flag NOT set: check.
    |  movzx RA, PC_RA            // Restore RA.
    |  jmp <1
    |
    |5:  // String key?
    |  checkstr RC, ->vmeta_tsetv
    |  mov STR:RC, [BASE+RC*8]
    |  jmp ->BC_TSETS_Z
    |
    |7:  // Possible table write barrier for the value. Skip valiswhite check.
    |  barrierback TAB:RB, RA
    |  movzx RA, PC_RA            // Restore RA.
    |  jmp <2
    break;
  case BC_TSETS:
    |  ins_ABC    // RA = src, RB = table, RC = str const (~)
    |  not RCa
    |  mov STR:RC, [KBASE+RC*4]
    |  checktab RB, ->vmeta_tsets
    |  mov TAB:RB, [BASE+RB*8]
    |->BC_TSETS_Z:    // RB = GCtab *, RC = GCstr *, refetches PC_RA.
    |  mov RA, TAB:RB->hmask
    |  and RA, STR:RC->hash
    |  imul RA, #NODE
    |  mov byte TAB:RB->nomm, 0        // Clear metamethod cache.
    |  add NODE:RA, TAB:RB->node
    |1:
    |  cmp dword NODE:RA->key.it, LJ_TSTR
    |  jne >5
    |  cmp dword NODE:RA->key.gcr, STR:RC
    |  jne >5
    |  // Ok, key found. Assumes: offsetof(Node, val) == 0
    |  cmp dword [RA+4], LJ_TNIL
    |  je >4                // Previous value is nil?
    |2:
    |  test byte TAB:RB->marked, LJ_GC_BLACK    // isblack(table)
    |  jnz >7
    |3:  // Set node value.
    |  movzx RC, PC_RA
    |.if X64
    |  mov RBa, [BASE+RC*8]
    |  mov [RA], RBa
    |.else
    |  mov RB, [BASE+RC*8+4]
    |  mov RC, [BASE+RC*8]
    |  mov [RA+4], RB
    |  mov [RA], RC
    |.endif
    |  ins_next
    |
    |4:  // Check for __newindex if previous value is nil.
    |  cmp dword TAB:RB->metatable, 0    // Shouldn't overwrite RA for fastpath.
    |  jz <2
    |  mov TMP1, RA            // Save RA.
    |  mov TAB:RA, TAB:RB->metatable
    |  test byte TAB:RA->nomm, 1<<MM_newindex
    |  jz ->vmeta_tsets            // 'no __newindex' flag NOT set: check.
    |  mov RA, TMP1            // Restore RA.
    |  jmp <2
    |
    |5:  // Follow hash chain.
    |  mov NODE:RA, NODE:RA->next
    |  test NODE:RA, NODE:RA
    |  jnz <1
    |  // End of hash chain: key not found, add a new one.
    |
    |  // But check for __newindex first.
    |  mov TAB:RA, TAB:RB->metatable
    |  test TAB:RA, TAB:RA
    |  jz >6                // No metatable: continue.
    |  test byte TAB:RA->nomm, 1<<MM_newindex
    |  jz ->vmeta_tsets            // 'no __newindex' flag NOT set: check.
    |6:
    |  mov TMP1, STR:RC
    |  mov TMP2, LJ_TSTR
    |  mov TMP3, TAB:RB            // Save TAB:RB for us.
    |.if X64
    |  mov L:CARG1d, SAVE_L
    |  mov L:CARG1d->base, BASE
    |  lea CARG3, TMP1
    |  mov CARG2d, TAB:RB
    |  mov L:RB, L:CARG1d
    |.else
    |  lea RC, TMP1            // Store temp. TValue in TMP1/TMP2.
    |  mov ARG2, TAB:RB
    |  mov L:RB, SAVE_L
    |  mov ARG3, RC
    |  mov ARG1, L:RB
    |  mov L:RB->base, BASE
    |.endif
    |  mov SAVE_PC, PC
    |  call extern lj_tab_newkey    // (lua_State *L, GCtab *t, TValue *k)
    |  // Handles write barrier for the new key. TValue * returned in eax (RC).
    |  mov BASE, L:RB->base
    |  mov TAB:RB, TMP3            // Need TAB:RB for barrier.
    |  mov RA, eax
    |  jmp <2                // Must check write barrier for value.
    |
    |7:  // Possible table write barrier for the value. Skip valiswhite check.
    |  barrierback TAB:RB, RC        // Destroys STR:RC.
    |  jmp <3
    break;
  case BC_TSETB:
    |  ins_ABC    // RA = src, RB = table, RC = byte literal
    |  checktab RB, ->vmeta_tsetb
    |  mov TAB:RB, [BASE+RB*8]
    |  cmp RC, TAB:RB->asize
    |  jae ->vmeta_tsetb
    |  shl RC, 3
    |  add RC, TAB:RB->array
    |  cmp dword [RC+4], LJ_TNIL
    |  je >3                // Previous value is nil?
    |1:
    |  test byte TAB:RB->marked, LJ_GC_BLACK    // isblack(table)
    |  jnz >7
    |2:     // Set array slot.
    |.if X64
    |  mov RAa, [BASE+RA*8]
    |  mov [RC], RAa
    |.else
    |  mov RB, [BASE+RA*8+4]
    |  mov RA, [BASE+RA*8]
    |  mov [RC+4], RB
    |  mov [RC], RA
    |.endif
    |  ins_next
    |
    |3:  // Check for __newindex if previous value is nil.
    |  cmp dword TAB:RB->metatable, 0    // Shouldn't overwrite RA for fastpath.
    |  jz <1
    |  mov TAB:RA, TAB:RB->metatable
    |  test byte TAB:RA->nomm, 1<<MM_newindex
    |  jz ->vmeta_tsetb            // 'no __newindex' flag NOT set: check.
    |  movzx RA, PC_RA            // Restore RA.
    |  jmp <1
    |
    |7:  // Possible table write barrier for the value. Skip valiswhite check.
    |  barrierback TAB:RB, RA
    |  movzx RA, PC_RA            // Restore RA.
    |  jmp <2
    break;

  case BC_TSETM:
    |  ins_AD    // RA = base (table at base-1), RD = num const (start index)
    |  mov TMP1, KBASE            // Need one more free register.
    |  mov KBASE, dword [KBASE+RD*8]    // Integer constant is in lo-word.
    |1:
    |  lea RA, [BASE+RA*8]
    |  mov TAB:RB, [RA-8]        // Guaranteed to be a table.
    |  test byte TAB:RB->marked, LJ_GC_BLACK    // isblack(table)
    |  jnz >7
    |2:
    |  mov RD, MULTRES
    |  sub RD, 1
    |  jz >4                // Nothing to copy?
    |  add RD, KBASE            // Compute needed size.
    |  cmp RD, TAB:RB->asize
    |  ja >5                // Doesn't fit into array part?
    |  sub RD, KBASE
    |  shl KBASE, 3
    |  add KBASE, TAB:RB->array
    |3:  // Copy result slots to table.
    |.if X64
    |  mov RBa, [RA]
    |  add RA, 8
    |  mov [KBASE], RBa
    |.else
    |  mov RB, [RA]
    |  mov [KBASE], RB
    |  mov RB, [RA+4]
    |  add RA, 8
    |  mov [KBASE+4], RB
    |.endif
    |  add KBASE, 8
    |  sub RD, 1
    |  jnz <3
    |4:
    |  mov KBASE, TMP1
    |  ins_next
    |
    |5:  // Need to resize array part.
    |.if X64
    |  mov L:CARG1d, SAVE_L
    |  mov L:CARG1d->base, BASE        // Caveat: CARG2d/CARG3d may be BASE.
    |  mov CARG2d, TAB:RB
    |  mov CARG3d, RD
    |  mov L:RB, L:CARG1d
    |.else
    |  mov ARG2, TAB:RB
    |  mov L:RB, SAVE_L
    |  mov L:RB->base, BASE
    |  mov ARG3, RD
    |  mov ARG1, L:RB
    |.endif
    |  mov SAVE_PC, PC
    |  call extern lj_tab_reasize    // (lua_State *L, GCtab *t, int nasize)
    |  mov BASE, L:RB->base
    |  movzx RA, PC_RA            // Restore RA.
    |  jmp <1                // Retry.
    |
    |7:  // Possible table write barrier for any value. Skip valiswhite check.
    |  barrierback TAB:RB, RD
    |  jmp <2
    break;

  /* -- Calls and vararg handling ----------------------------------------- */

  case BC_CALL: case BC_CALLM:
    |  ins_A_C    // RA = base, (RB = nresults+1,) RC = nargs+1 | extra_nargs
    if (op == BC_CALLM) {
      |  add NARGS:RD, MULTRES
    }
    |  cmp dword [BASE+RA*8+4], LJ_TFUNC
    |  mov LFUNC:RB, [BASE+RA*8]
    |  jne ->vmeta_call_ra
    |  lea BASE, [BASE+RA*8+8]
    |  ins_call
    break;

  case BC_CALLMT:
    |  ins_AD    // RA = base, RD = extra_nargs
    |  add NARGS:RD, MULTRES
    |  // Fall through. Assumes BC_CALLT follows and ins_AD is a no-op.
    break;
  case BC_CALLT:
    |  ins_AD    // RA = base, RD = nargs+1
    |  lea RA, [BASE+RA*8+8]
    |  mov KBASE, BASE            // Use KBASE for move + vmeta_call hint.
    |  mov LFUNC:RB, [RA-8]
    |  cmp dword [RA-4], LJ_TFUNC
    |  jne ->vmeta_call
    |->BC_CALLT_Z:
    |  mov PC, [BASE-4]
    |  test PC, FRAME_TYPE
    |  jnz >7
    |1:
    |  mov [BASE-8], LFUNC:RB        // Copy function down, reloaded below.
    |  mov MULTRES, NARGS:RD
    |  sub NARGS:RD, 1
    |  jz >3
    |2:  // Move args down.
    |.if X64
    |  mov RBa, [RA]
    |  add RA, 8
    |  mov [KBASE], RBa
    |.else
    |  mov RB, [RA]
    |  mov [KBASE], RB
    |  mov RB, [RA+4]
    |  add RA, 8
    |  mov [KBASE+4], RB
    |.endif
    |  add KBASE, 8
    |  sub NARGS:RD, 1
    |  jnz <2
    |
    |  mov LFUNC:RB, [BASE-8]
    |3:
    |  mov NARGS:RD, MULTRES
    |  cmp byte LFUNC:RB->ffid, 1    // (> FF_C) Calling a fast function?
    |  ja >5
    |4:
    |  ins_callt
    |
    |5:  // Tailcall to a fast function.
    |  test PC, FRAME_TYPE        // Lua frame below?
    |  jnz <4
    |  movzx RA, PC_RA
    |  not RAa
    |  lea RA, [BASE+RA*8]
    |  mov LFUNC:KBASE, [RA-8]        // Need to prepare KBASE.
    |  mov KBASE, LFUNC:KBASE->pc
    |  mov KBASE, [KBASE+PC2PROTO(k)]
    |  jmp <4
    |
    |7:  // Tailcall from a vararg function.
    |  sub PC, FRAME_VARG
    |  test PC, FRAME_TYPEP
    |  jnz >8                // Vararg frame below?
    |  sub BASE, PC            // Need to relocate BASE/KBASE down.
    |  mov KBASE, BASE
    |  mov PC, [BASE-4]
    |  jmp <1
    |8:
    |  add PC, FRAME_VARG
    |  jmp <1
    break;

  case BC_ITERC:
    |  ins_A    // RA = base, (RB = nresults+1,) RC = nargs+1 (2+1)
    |  lea RA, [BASE+RA*8+8]        // fb = base+1
    |.if X64
    |  mov RBa, [RA-24]            // Copy state. fb[0] = fb[-3].
    |  mov RCa, [RA-16]            // Copy control var. fb[1] = fb[-2].
    |  mov [RA], RBa
    |  mov [RA+8], RCa
    |.else
    |  mov RB, [RA-24]            // Copy state. fb[0] = fb[-3].
    |  mov RC, [RA-20]
    |  mov [RA], RB
    |  mov [RA+4], RC
    |  mov RB, [RA-16]            // Copy control var. fb[1] = fb[-2].
    |  mov RC, [RA-12]
    |  mov [RA+8], RB
    |  mov [RA+12], RC
    |.endif
    |  mov LFUNC:RB, [RA-32]        // Copy callable. fb[-1] = fb[-4]
    |  mov RC, [RA-28]
    |  mov [RA-8], LFUNC:RB
    |  mov [RA-4], RC
    |  cmp RC, LJ_TFUNC            // Handle like a regular 2-arg call.
    |  mov NARGS:RD, 2+1
    |  jne ->vmeta_call
    |  mov BASE, RA
    |  ins_call
    break;

  case BC_ITERN:
    |  ins_A    // RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
    |.if JIT
    |  // NYI: add hotloop, record BC_ITERN.
    |.endif
    |  mov TMP1, KBASE            // Need two more free registers.
    |  mov TMP2, DISPATCH
    |  mov TAB:RB, [BASE+RA*8-16]
    |  mov RC, [BASE+RA*8-8]        // Get index from control var.
    |  mov DISPATCH, TAB:RB->asize
    |  add PC, 4
    |  mov KBASE, TAB:RB->array
    |1:  // Traverse array part.
    |  cmp RC, DISPATCH; jae >5        // Index points after array part?
    |  cmp dword [KBASE+RC*8+4], LJ_TNIL; je >4
    |.if DUALNUM
    |  mov dword [BASE+RA*8+4], LJ_TISNUM
    |  mov dword [BASE+RA*8], RC
    |.elif SSE
    |  cvtsi2sd xmm0, RC
    |.else
    |  fild dword [BASE+RA*8-8]
    |.endif
    |  // Copy array slot to returned value.
    |.if X64
    |  mov RBa, [KBASE+RC*8]
    |  mov [BASE+RA*8+8], RBa
    |.else
    |  mov RB, [KBASE+RC*8+4]
    |  mov [BASE+RA*8+12], RB
    |  mov RB, [KBASE+RC*8]
    |  mov [BASE+RA*8+8], RB
    |.endif
    |  add RC, 1
    |  // Return array index as a numeric key.
    |.if DUALNUM
    |  // See above.
    |.elif SSE
    |  movsd qword [BASE+RA*8], xmm0
    |.else
    |  fstp qword [BASE+RA*8]
    |.endif
    |  mov [BASE+RA*8-8], RC        // Update control var.
    |2:
    |  movzx RD, PC_RD            // Get target from ITERL.
    |  branchPC RD
    |3:
    |  mov DISPATCH, TMP2
    |  mov KBASE, TMP1
    |  ins_next
    |
    |4:  // Skip holes in array part.
    |  add RC, 1
    |.if not (DUALNUM or SSE)
    |  mov [BASE+RA*8-8], RC
    |.endif
    |  jmp <1
    |
    |5:  // Traverse hash part.
    |  sub RC, DISPATCH
    |6:
    |  cmp RC, TAB:RB->hmask; ja <3    // End of iteration? Branch to ITERL+1.
    |  imul KBASE, RC, #NODE
    |  add NODE:KBASE, TAB:RB->node
    |  cmp dword NODE:KBASE->val.it, LJ_TNIL; je >7
    |  lea DISPATCH, [RC+DISPATCH+1]
    |  // Copy key and value from hash slot.
    |.if X64
    |  mov RBa, NODE:KBASE->key
    |  mov RCa, NODE:KBASE->val
    |  mov [BASE+RA*8], RBa
    |  mov [BASE+RA*8+8], RCa
    |.else
    |  mov RB, NODE:KBASE->key.gcr
    |  mov RC, NODE:KBASE->key.it
    |  mov [BASE+RA*8], RB
    |  mov [BASE+RA*8+4], RC
    |  mov RB, NODE:KBASE->val.gcr
    |  mov RC, NODE:KBASE->val.it
    |  mov [BASE+RA*8+8], RB
    |  mov [BASE+RA*8+12], RC
    |.endif
    |  mov [BASE+RA*8-8], DISPATCH
    |  jmp <2
    |
    |7:  // Skip holes in hash part.
    |  add RC, 1
    |  jmp <6
    break;

  case BC_ISNEXT:
    |  ins_AD    // RA = base, RD = target (points to ITERN)
    |  cmp dword [BASE+RA*8-20], LJ_TFUNC; jne >5
    |  mov CFUNC:RB, [BASE+RA*8-24]
    |  cmp dword [BASE+RA*8-12], LJ_TTAB; jne >5
    |  cmp dword [BASE+RA*8-4], LJ_TNIL; jne >5
    |  cmp byte CFUNC:RB->ffid, FF_next_N; jne >5
    |  branchPC RD
    |  mov dword [BASE+RA*8-8], 0    // Initialize control var.
    |  mov dword [BASE+RA*8-4], 0xfffe7fff
    |1:
    |  ins_next
    |5:  // Despecialize bytecode if any of the checks fail.
    |  mov PC_OP, BC_JMP
    |  branchPC RD
    |  mov byte [PC], BC_ITERC
    |  jmp <1
    break;

  case BC_VARG:
    |  ins_ABC    // RA = base, RB = nresults+1, RC = numparams
    |  mov TMP1, KBASE            // Need one more free register.
    |  lea KBASE, [BASE+RC*8+(8+FRAME_VARG)]
    |  lea RA, [BASE+RA*8]
    |  sub KBASE, [BASE-4]
    |  // Note: KBASE may now be even _above_ BASE if nargs was < numparams.
    |  test RB, RB
    |  jz >5                // Copy all varargs?
    |  lea RB, [RA+RB*8-8]
    |  cmp KBASE, BASE            // No vararg slots?
    |  jnb >2
    |1:  // Copy vararg slots to destination slots.
    |.if X64
    |  mov RCa, [KBASE-8]
    |  add KBASE, 8
    |  mov [RA], RCa
    |.else
    |  mov RC, [KBASE-8]
    |  mov [RA], RC
    |  mov RC, [KBASE-4]
    |  add KBASE, 8
    |  mov [RA+4], RC
    |.endif
    |  add RA, 8
    |  cmp RA, RB            // All destination slots filled?
    |  jnb >3
    |  cmp KBASE, BASE            // No more vararg slots?
    |  jb <1
    |2:  // Fill up remainder with nil.
    |  mov dword [RA+4], LJ_TNIL
    |  add RA, 8
    |  cmp RA, RB
    |  jb <2
    |3:
    |  mov KBASE, TMP1
    |  ins_next
    |
    |5:  // Copy all varargs.
    |  mov MULTRES, 1            // MULTRES = 0+1
    |  mov RC, BASE
    |  sub RC, KBASE
    |  jbe <3                // No vararg slots?
    |  mov RB, RC
    |  shr RB, 3
    |  add RB, 1
    |  mov MULTRES, RB            // MULTRES = #varargs+1
    |  mov L:RB, SAVE_L
    |  add RC, RA
    |  cmp RC, L:RB->maxstack
    |  ja >7                // Need to grow stack?
    |6:  // Copy all vararg slots.
    |.if X64
    |  mov RCa, [KBASE-8]
    |  add KBASE, 8
    |  mov [RA], RCa
    |.else
    |  mov RC, [KBASE-8]
    |  mov [RA], RC
    |  mov RC, [KBASE-4]
    |  add KBASE, 8
    |  mov [RA+4], RC
    |.endif
    |  add RA, 8
    |  cmp KBASE, BASE            // No more vararg slots?
    |  jb <6
    |  jmp <3
    |
    |7:  // Grow stack for varargs.
    |  mov L:RB->base, BASE
    |  mov L:RB->top, RA
    |  mov SAVE_PC, PC
    |  sub KBASE, BASE            // Need delta, because BASE may change.
    |  mov FCARG2, MULTRES
    |  sub FCARG2, 1
    |  mov FCARG1, L:RB
    |  call extern lj_state_growstack@8    // (lua_State *L, int n)
    |  mov BASE, L:RB->base
    |  mov RA, L:RB->top
    |  add KBASE, BASE
    |  jmp <6
    break;

  /* -- Returns ----------------------------------------------------------- */

  case BC_RETM:
    |  ins_AD    // RA = results, RD = extra_nresults
    |  add RD, MULTRES            // MULTRES >=1, so RD >=1.
    |  // Fall through. Assumes BC_RET follows and ins_AD is a no-op.
    break;

  case BC_RET: case BC_RET0: case BC_RET1:
    |  ins_AD    // RA = results, RD = nresults+1
    if (op != BC_RET0) {
      |  shl RA, 3
    }
    |1:
    |  mov PC, [BASE-4]
    |  mov MULTRES, RD            // Save nresults+1.
    |  test PC, FRAME_TYPE        // Check frame type marker.
    |  jnz >7                // Not returning to a fixarg Lua func?
    switch (op) {
    case BC_RET:
      |->BC_RET_Z:
      |  mov KBASE, BASE        // Use KBASE for result move.
      |  sub RD, 1
      |  jz >3
      |2:  // Move results down.
      |.if X64
      |  mov RBa, [KBASE+RA]
      |  mov [KBASE-8], RBa
      |.else
      |  mov RB, [KBASE+RA]
      |  mov [KBASE-8], RB
      |  mov RB, [KBASE+RA+4]
      |  mov [KBASE-4], RB
      |.endif
      |  add KBASE, 8
      |  sub RD, 1
      |  jnz <2
      |3:
      |  mov RD, MULTRES        // Note: MULTRES may be >255.
      |  movzx RB, PC_RB        // So cannot compare with RDL!
      |5:
      |  cmp RB, RD            // More results expected?
      |  ja >6
      break;
    case BC_RET1:
      |.if X64
      |  mov RBa, [BASE+RA]
      |  mov [BASE-8], RBa
      |.else
      |  mov RB, [BASE+RA+4]
      |  mov [BASE-4], RB
      |  mov RB, [BASE+RA]
      |  mov [BASE-8], RB
      |.endif
      /* fallthrough */
    case BC_RET0:
      |5:
      |  cmp PC_RB, RDL            // More results expected?
      |  ja >6
    default:
      break;
    }
    |  movzx RA, PC_RA
    |  not RAa                // Note: ~RA = -(RA+1)
    |  lea BASE, [BASE+RA*8]        // base = base - (RA+1)*8
    |  mov LFUNC:KBASE, [BASE-8]
    |  mov KBASE, LFUNC:KBASE->pc
    |  mov KBASE, [KBASE+PC2PROTO(k)]
    |  ins_next
    |
    |6:  // Fill up results with nil.
    if (op == BC_RET) {
      |  mov dword [KBASE-4], LJ_TNIL    // Note: relies on shifted base.
      |  add KBASE, 8
    } else {
      |  mov dword [BASE+RD*8-12], LJ_TNIL
    }
    |  add RD, 1
    |  jmp <5
    |
    |7:  // Non-standard return case.
    |  lea RB, [PC-FRAME_VARG]
    |  test RB, FRAME_TYPEP
    |  jnz ->vm_return
    |  // Return from vararg function: relocate BASE down and RA up.
    |  sub BASE, RB
    if (op != BC_RET0) {
      |  add RA, RB
    }
    |  jmp <1
    break;

  /* -- Loops and branches ------------------------------------------------ */

  |.define FOR_IDX,  [RA];    .define FOR_TIDX,  dword [RA+4]
  |.define FOR_STOP, [RA+8];  .define FOR_TSTOP, dword [RA+12]
  |.define FOR_STEP, [RA+16]; .define FOR_TSTEP, dword [RA+20]
  |.define FOR_EXT,  [RA+24]; .define FOR_TEXT,  dword [RA+28]

  case BC_FORL:
    |.if JIT
    |  hotloop RB
    |.endif
    | // Fall through. Assumes BC_IFORL follows and ins_AJ is a no-op.
    break;

  case BC_JFORI:
  case BC_JFORL:
#if !LJ_HASJIT
    break;
#endif
  case BC_FORI:
  case BC_IFORL:
    vk = (op == BC_IFORL || op == BC_JFORL);
    |  ins_AJ    // RA = base, RD = target (after end of loop or start of loop)
    |  lea RA, [BASE+RA*8]
    if (LJ_DUALNUM) {
      |  cmp FOR_TIDX, LJ_TISNUM; jne >9
      if (!vk) {
    |  cmp FOR_TSTOP, LJ_TISNUM; jne ->vmeta_for
    |  cmp FOR_TSTEP, LJ_TISNUM; jne ->vmeta_for
    |  mov RB, dword FOR_IDX
    |  cmp dword FOR_STEP, 0; jl >5
      } else {
#ifdef LUA_USE_ASSERT
    |  cmp FOR_TSTOP, LJ_TISNUM; jne ->assert_bad_for_arg_type
    |  cmp FOR_TSTEP, LJ_TISNUM; jne ->assert_bad_for_arg_type
#endif
    |  mov RB, dword FOR_STEP
    |  test RB, RB; js >5
    |  add RB, dword FOR_IDX; jo >1
    |  mov dword FOR_IDX, RB
      }
      |  cmp RB, dword FOR_STOP
      |  mov FOR_TEXT, LJ_TISNUM
      |  mov dword FOR_EXT, RB
      if (op == BC_FORI) {
    |  jle >7
    |1:
    |6:
    |  branchPC RD
      } else if (op == BC_JFORI) {
    |  branchPC RD
    |  movzx RD, PC_RD
    |  jle =>BC_JLOOP
    |1:
    |6:
      } else if (op == BC_IFORL) {
    |  jg >7
    |6:
    |  branchPC RD
    |1:
      } else {
    |  jle =>BC_JLOOP
    |1:
    |6:
      }
      |7:
      |  ins_next
      |
      |5:  // Invert check for negative step.
      if (vk) {
    |  add RB, dword FOR_IDX; jo <1
    |  mov dword FOR_IDX, RB
      }
      |  cmp RB, dword FOR_STOP
      |  mov FOR_TEXT, LJ_TISNUM
      |  mov dword FOR_EXT, RB
      if (op == BC_FORI) {
    |  jge <7
      } else if (op == BC_JFORI) {
    |  branchPC RD
    |  movzx RD, PC_RD
    |  jge =>BC_JLOOP
      } else if (op == BC_IFORL) {
    |  jl <7
      } else {
    |  jge =>BC_JLOOP
      }
      |  jmp <6
      |9:  // Fallback to FP variant.
    } else if (!vk) {
      |  cmp FOR_TIDX, LJ_TISNUM
    }
    if (!vk) {
      |  jae ->vmeta_for
      |  cmp FOR_TSTOP, LJ_TISNUM; jae ->vmeta_for
    } else {
#ifdef LUA_USE_ASSERT
      |  cmp FOR_TSTOP, LJ_TISNUM; jae ->assert_bad_for_arg_type
      |  cmp FOR_TSTEP, LJ_TISNUM; jae ->assert_bad_for_arg_type
#endif
    }
    |  mov RB, FOR_TSTEP        // Load type/hiword of for step.
    if (!vk) {
      |  cmp RB, LJ_TISNUM; jae ->vmeta_for
    }
    |.if SSE
    |  movsd xmm0, qword FOR_IDX
    |  movsd xmm1, qword FOR_STOP
    if (vk) {
      |  addsd xmm0, qword FOR_STEP
      |  movsd qword FOR_IDX, xmm0
      |  test RB, RB; js >3
    } else {
      |  jl >3
    }
    |  ucomisd xmm1, xmm0
    |1:
    |  movsd qword FOR_EXT, xmm0
    |.else
    |  fld qword FOR_STOP
    |  fld qword FOR_IDX
    if (vk) {
      |  fadd qword FOR_STEP        // nidx = idx + step
      |  fst qword FOR_IDX
      |  fst qword FOR_EXT
      |  test RB, RB; js >1
    } else {
      |  fst qword FOR_EXT
      |  jl >1
    }
    |  fxch                // Swap lim/(n)idx if step non-negative.
    |1:
    |  fcomparepp
    |.endif
    if (op == BC_FORI) {
      |.if DUALNUM
      |  jnb <7
      |.else
      |  jnb >2
      |  branchPC RD
      |.endif
    } else if (op == BC_JFORI) {
      |  branchPC RD
      |  movzx RD, PC_RD
      |  jnb =>BC_JLOOP
    } else if (op == BC_IFORL) {
      |.if DUALNUM
      |  jb <7
      |.else
      |  jb >2
      |  branchPC RD
      |.endif
    } else {
      |  jnb =>BC_JLOOP
    }
    |.if DUALNUM
    |  jmp <6
    |.else
    |2:
    |  ins_next
    |.endif
    |.if SSE
    |3:  // Invert comparison if step is negative.
    |  ucomisd xmm0, xmm1
    |  jmp <1
    |.endif
    break;

  case BC_ITERL:
    |.if JIT
    |  hotloop RB
    |.endif
    | // Fall through. Assumes BC_IITERL follows and ins_AJ is a no-op.
    break;

  case BC_JITERL:
#if !LJ_HASJIT
    break;
#endif
  case BC_IITERL:
    |  ins_AJ    // RA = base, RD = target
    |  lea RA, [BASE+RA*8]
    |  mov RB, [RA+4]
    |  cmp RB, LJ_TNIL; je >1        // Stop if iterator returned nil.
    if (op == BC_JITERL) {
      |  mov [RA-4], RB
      |  mov RB, [RA]
      |  mov [RA-8], RB
      |  jmp =>BC_JLOOP
    } else {
      |  branchPC RD            // Otherwise save control var + branch.
      |  mov RD, [RA]
      |  mov [RA-4], RB
      |  mov [RA-8], RD
    }
    |1:
    |  ins_next
    break;

  case BC_LOOP:
    |  ins_A    // RA = base, RD = target (loop extent)
    |  // Note: RA/RD is only used by trace recorder to determine scope/extent
    |  // This opcode does NOT jump, it's only purpose is to detect a hot loop.
  |.if JIT
    |  hotloop RB
    |.endif
    | // Fall through. Assumes BC_ILOOP follows and ins_A is a no-op.
    break;

  case BC_ILOOP:
    |  ins_A    // RA = base, RD = target (loop extent)
    |  ins_next
    break;

  case BC_JLOOP:
    |.if JIT
    |  ins_AD    // RA = base (ignored), RD = traceno
    |  mov RA, [DISPATCH+DISPATCH_J(trace)]
    |  mov TRACE:RD, [RA+RD*4]
    |  mov RDa, TRACE:RD->mcode
    |  mov L:RB, SAVE_L
    |  mov [DISPATCH+DISPATCH_GL(jit_base)], BASE
    |  mov [DISPATCH+DISPATCH_GL(jit_L)], L:RB
    |  // Save additional callee-save registers only used in compiled code.
    |.if X64WIN
    |  mov TMPQ, r12
    |  mov TMPa, r13
    |  mov CSAVE_4, r14
    |  mov CSAVE_3, r15
    |  mov RAa, rsp
    |  sub rsp, 9*16+4*8
    |  movdqa [RAa], xmm6
    |  movdqa [RAa-1*16], xmm7
    |  movdqa [RAa-2*16], xmm8
    |  movdqa [RAa-3*16], xmm9
    |  movdqa [RAa-4*16], xmm10
    |  movdqa [RAa-5*16], xmm11
    |  movdqa [RAa-6*16], xmm12
    |  movdqa [RAa-7*16], xmm13
    |  movdqa [RAa-8*16], xmm14
    |  movdqa [RAa-9*16], xmm15
    |.elif X64
    |  mov TMPQ, r12
    |  mov TMPa, r13
    |  sub rsp, 16
    |.endif
    |  jmp RDa
    |.endif
    break;

  case BC_JMP:
    |  ins_AJ    // RA = unused, RD = target
    |  branchPC RD
    |  ins_next
    break;

  /* -- Function headers -------------------------------------------------- */

   /*
   ** Reminder: A function may be called with func/args above L->maxstack,
   ** i.e. occupying EXTRA_STACK slots. And vmeta_call may add one extra slot,
   ** too. This means all FUNC* ops (including fast functions) must check
   ** for stack overflow _before_ adding more slots!
   */

  case BC_FUNCF:
    |.if JIT
    |  hotcall RB
    |.endif
  case BC_FUNCV:  /* NYI: compiled vararg functions. */
    | // Fall through. Assumes BC_IFUNCF/BC_IFUNCV follow and ins_AD is a no-op.
    break;

  case BC_JFUNCF:
#if !LJ_HASJIT
    break;
#endif
  case BC_IFUNCF:
    |  ins_AD  // BASE = new base, RA = framesize, RD = nargs+1
    |  mov KBASE, [PC-4+PC2PROTO(k)]
    |  mov L:RB, SAVE_L
    |  lea RA, [BASE+RA*8]        // Top of frame.
    |  cmp RA, L:RB->maxstack
    |  ja ->vm_growstack_f
    |  movzx RA, byte [PC-4+PC2PROTO(numparams)]
    |  cmp NARGS:RD, RA            // Check for missing parameters.
    |  jbe >3
    |2:
    if (op == BC_JFUNCF) {
      |  movzx RD, PC_RD
      |  jmp =>BC_JLOOP
    } else {
      |  ins_next
    }
    |
    |3:  // Clear missing parameters.
    |  mov dword [BASE+NARGS:RD*8-4], LJ_TNIL
    |  add NARGS:RD, 1
    |  cmp NARGS:RD, RA
    |  jbe <3
    |  jmp <2
    break;

  case BC_JFUNCV:
#if !LJ_HASJIT
    break;
#endif
    | int3  // NYI: compiled vararg functions
    break;  /* NYI: compiled vararg functions. */

  case BC_IFUNCV:
    |  ins_AD  // BASE = new base, RA = framesize, RD = nargs+1
    |  lea RB, [NARGS:RD*8+FRAME_VARG]
    |  lea RD, [BASE+NARGS:RD*8]
    |  mov LFUNC:KBASE, [BASE-8]
    |  mov [RD-4], RB            // Store delta + FRAME_VARG.
    |  mov [RD-8], LFUNC:KBASE        // Store copy of LFUNC.
    |  mov L:RB, SAVE_L
    |  lea RA, [RD+RA*8]
    |  cmp RA, L:RB->maxstack
    |  ja ->vm_growstack_v        // Need to grow stack.
    |  mov RA, BASE
    |  mov BASE, RD
    |  movzx RB, byte [PC-4+PC2PROTO(numparams)]
    |  test RB, RB
    |  jz >2
    |1:  // Copy fixarg slots up to new frame.
    |  add RA, 8
    |  cmp RA, BASE
    |  jnb >3                // Less args than parameters?
    |  mov KBASE, [RA-8]
    |  mov [RD], KBASE
    |  mov KBASE, [RA-4]
    |  mov [RD+4], KBASE
    |  add RD, 8
    |  mov dword [RA-4], LJ_TNIL    // Clear old fixarg slot (help the GC).
    |  sub RB, 1
    |  jnz <1
    |2:
    if (op == BC_JFUNCV) {
      |  movzx RD, PC_RD
      |  jmp =>BC_JLOOP
    } else {
      |  mov KBASE, [PC-4+PC2PROTO(k)]
      |  ins_next
    }
    |
    |3:  // Clear missing parameters.
    |  mov dword [RD+4], LJ_TNIL
    |  add RD, 8
    |  sub RB, 1
    |  jnz <3
    |  jmp <2
    break;

  case BC_FUNCC:
  case BC_FUNCCW:
    |  ins_AD  // BASE = new base, RA = ins RA|RD (unused), RD = nargs+1
    |  mov CFUNC:RB, [BASE-8]
    |  mov KBASEa, CFUNC:RB->f
    |  mov L:RB, SAVE_L
    |  lea RD, [BASE+NARGS:RD*8-8]
    |  mov L:RB->base, BASE
    |  lea RA, [RD+8*LUA_MINSTACK]
    |  cmp RA, L:RB->maxstack
    |  mov L:RB->top, RD
    if (op == BC_FUNCC) {
      |.if X64
      |  mov CARG1d, L:RB            // Caveat: CARG1d may be RA.
      |.else
      |  mov ARG1, L:RB
      |.endif
    } else {
      |.if X64
      |  mov CARG2, KBASEa
      |  mov CARG1d, L:RB            // Caveat: CARG1d may be RA.
      |.else
      |  mov ARG2, KBASEa
      |  mov ARG1, L:RB
      |.endif
    }
    |  ja ->vm_growstack_c        // Need to grow stack.
    |  set_vmstate C
    if (op == BC_FUNCC) {
      |  call KBASEa            // (lua_State *L)
    } else {
      |  // (lua_State *L, lua_CFunction f)
      |  call aword [DISPATCH+DISPATCH_GL(wrapf)]
    }
    |  set_vmstate INTERP
    |  // nresults returned in eax (RD).
    |  mov BASE, L:RB->base
    |  lea RA, [BASE+RD*8]
    |  neg RA
    |  add RA, L:RB->top        // RA = (L->top-(L->base+nresults))*8
    |  mov PC, [BASE-4]            // Fetch PC of caller.
    |  jmp ->vm_returnc
    break;

  /* ---------------------------------------------------------------------- */

  default:
    fprintf(stderr, "Error: undefined opcode BC_%s\n", bc_names[op]);
    exit(2);
    break;
  }
}

static int build_backend(BuildCtx *ctx)
{
  int op;
  dasm_growpc(Dst, BC__MAX);
  build_subroutines(ctx);
  |.code_op
  for (op = 0; op < BC__MAX; op++)
    build_ins(ctx, (BCOp)op, op);
  return BC__MAX;
}

/* Emit pseudo frame-info for all assembler functions. */
static void emit_asm_debug(BuildCtx *ctx)
{
  int fcofs = (int)((uint8_t *)ctx->glob[GLOB_vm_ffi_call] - ctx->code);
#if LJ_64
#define SZPTR    "8"
#define BSZPTR    "3"
#define REG_SP    "0x7"
#define REG_RA    "0x10"
#else
#define SZPTR    "4"
#define BSZPTR    "2"
#define REG_SP    "0x4"
#define REG_RA    "0x8"
#endif
  switch (ctx->mode) {
  case BUILD_elfasm:
    fprintf(ctx->fp, "\t.section .debug_frame,\"\",@progbits\n");
    fprintf(ctx->fp,
    ".Lframe0:\n"
    "\t.long .LECIE0-.LSCIE0\n"
    ".LSCIE0:\n"
    "\t.long 0xffffffff\n"
    "\t.byte 0x1\n"
    "\t.string \"\"\n"
    "\t.uleb128 0x1\n"
    "\t.sleb128 -" SZPTR "\n"
    "\t.byte " REG_RA "\n"
    "\t.byte 0xc\n\t.uleb128 " REG_SP "\n\t.uleb128 " SZPTR "\n"
    "\t.byte 0x80+" REG_RA "\n\t.uleb128 0x1\n"
    "\t.align " SZPTR "\n"
    ".LECIE0:\n\n");
    fprintf(ctx->fp,
    ".LSFDE0:\n"
    "\t.long .LEFDE0-.LASFDE0\n"
    ".LASFDE0:\n"
    "\t.long .Lframe0\n"
#if LJ_64
    "\t.quad .Lbegin\n"
    "\t.quad %d\n"
    "\t.byte 0xe\n\t.uleb128 %d\n"        /* def_cfa_offset */
    "\t.byte 0x86\n\t.uleb128 0x2\n"    /* offset rbp */
    "\t.byte 0x83\n\t.uleb128 0x3\n"    /* offset rbx */
    "\t.byte 0x8f\n\t.uleb128 0x4\n"    /* offset r15 */
    "\t.byte 0x8e\n\t.uleb128 0x5\n"    /* offset r14 */
#else
    "\t.long .Lbegin\n"
    "\t.long %d\n"
    "\t.byte 0xe\n\t.uleb128 %d\n"        /* def_cfa_offset */
    "\t.byte 0x85\n\t.uleb128 0x2\n"    /* offset ebp */
    "\t.byte 0x87\n\t.uleb128 0x3\n"    /* offset edi */
    "\t.byte 0x86\n\t.uleb128 0x4\n"    /* offset esi */
    "\t.byte 0x83\n\t.uleb128 0x5\n"    /* offset ebx */
#endif
    "\t.align " SZPTR "\n"
    ".LEFDE0:\n\n", fcofs, CFRAME_SIZE);
#if LJ_HASFFI
    fprintf(ctx->fp,
    ".LSFDE1:\n"
    "\t.long .LEFDE1-.LASFDE1\n"
    ".LASFDE1:\n"
    "\t.long .Lframe0\n"
#if LJ_64
    "\t.quad lj_vm_ffi_call\n"
    "\t.quad %d\n"
    "\t.byte 0xe\n\t.uleb128 16\n"        /* def_cfa_offset */
    "\t.byte 0x86\n\t.uleb128 0x2\n"    /* offset rbp */
    "\t.byte 0xd\n\t.uleb128 0x6\n"        /* def_cfa_register rbp */
    "\t.byte 0x83\n\t.uleb128 0x3\n"    /* offset rbx */
#else
    "\t.long lj_vm_ffi_call\n"
    "\t.long %d\n"
    "\t.byte 0xe\n\t.uleb128 8\n"        /* def_cfa_offset */
    "\t.byte 0x85\n\t.uleb128 0x2\n"    /* offset ebp */
    "\t.byte 0xd\n\t.uleb128 0x5\n"        /* def_cfa_register ebp */
    "\t.byte 0x83\n\t.uleb128 0x3\n"    /* offset ebx */
#endif
    "\t.align " SZPTR "\n"
    ".LEFDE1:\n\n", (int)ctx->codesz - fcofs);
#endif
#if (defined(__sun__) && defined(__svr4__))
#if LJ_64
    fprintf(ctx->fp, "\t.section .eh_frame,\"a\",@unwind\n");
#else
    fprintf(ctx->fp, "\t.section .eh_frame,\"aw\",@progbits\n");
#endif
#else
    fprintf(ctx->fp, "\t.section .eh_frame,\"a\",@progbits\n");
#endif
    fprintf(ctx->fp,
    ".Lframe1:\n"
    "\t.long .LECIE1-.LSCIE1\n"
    ".LSCIE1:\n"
    "\t.long 0\n"
    "\t.byte 0x1\n"
    "\t.string \"zPR\"\n"
    "\t.uleb128 0x1\n"
    "\t.sleb128 -" SZPTR "\n"
    "\t.byte " REG_RA "\n"
    "\t.uleb128 6\n"            /* augmentation length */
    "\t.byte 0x1b\n"            /* pcrel|sdata4 */
    "\t.long lj_err_unwind_dwarf-.\n"
    "\t.byte 0x1b\n"            /* pcrel|sdata4 */
    "\t.byte 0xc\n\t.uleb128 " REG_SP "\n\t.uleb128 " SZPTR "\n"
    "\t.byte 0x80+" REG_RA "\n\t.uleb128 0x1\n"
    "\t.align " SZPTR "\n"
    ".LECIE1:\n\n");
    fprintf(ctx->fp,
    ".LSFDE2:\n"
    "\t.long .LEFDE2-.LASFDE2\n"
    ".LASFDE2:\n"
    "\t.long .LASFDE2-.Lframe1\n"
    "\t.long .Lbegin-.\n"
    "\t.long %d\n"
    "\t.uleb128 0\n"            /* augmentation length */
    "\t.byte 0xe\n\t.uleb128 %d\n"        /* def_cfa_offset */
#if LJ_64
    "\t.byte 0x86\n\t.uleb128 0x2\n"    /* offset rbp */
    "\t.byte 0x83\n\t.uleb128 0x3\n"    /* offset rbx */
    "\t.byte 0x8f\n\t.uleb128 0x4\n"    /* offset r15 */
    "\t.byte 0x8e\n\t.uleb128 0x5\n"    /* offset r14 */
#else
    "\t.byte 0x85\n\t.uleb128 0x2\n"    /* offset ebp */
    "\t.byte 0x87\n\t.uleb128 0x3\n"    /* offset edi */
    "\t.byte 0x86\n\t.uleb128 0x4\n"    /* offset esi */
    "\t.byte 0x83\n\t.uleb128 0x5\n"    /* offset ebx */
#endif
    "\t.align " SZPTR "\n"
    ".LEFDE2:\n\n", fcofs, CFRAME_SIZE);
#if LJ_HASFFI
    fprintf(ctx->fp,
    ".Lframe2:\n"
    "\t.long .LECIE2-.LSCIE2\n"
    ".LSCIE2:\n"
    "\t.long 0\n"
    "\t.byte 0x1\n"
    "\t.string \"zR\"\n"
    "\t.uleb128 0x1\n"
    "\t.sleb128 -" SZPTR "\n"
    "\t.byte " REG_RA "\n"
    "\t.uleb128 1\n"            /* augmentation length */
    "\t.byte 0x1b\n"            /* pcrel|sdata4 */
    "\t.byte 0xc\n\t.uleb128 " REG_SP "\n\t.uleb128 " SZPTR "\n"
    "\t.byte 0x80+" REG_RA "\n\t.uleb128 0x1\n"
    "\t.align " SZPTR "\n"
    ".LECIE2:\n\n");
    fprintf(ctx->fp,
    ".LSFDE3:\n"
    "\t.long .LEFDE3-.LASFDE3\n"
    ".LASFDE3:\n"
    "\t.long .LASFDE3-.Lframe2\n"
    "\t.long lj_vm_ffi_call-.\n"
    "\t.long %d\n"
    "\t.uleb128 0\n"            /* augmentation length */
#if LJ_64
    "\t.byte 0xe\n\t.uleb128 16\n"        /* def_cfa_offset */
    "\t.byte 0x86\n\t.uleb128 0x2\n"    /* offset rbp */
    "\t.byte 0xd\n\t.uleb128 0x6\n"        /* def_cfa_register rbp */
    "\t.byte 0x83\n\t.uleb128 0x3\n"    /* offset rbx */
#else
    "\t.byte 0xe\n\t.uleb128 8\n"        /* def_cfa_offset */
    "\t.byte 0x85\n\t.uleb128 0x2\n"    /* offset ebp */
    "\t.byte 0xd\n\t.uleb128 0x5\n"        /* def_cfa_register ebp */
    "\t.byte 0x83\n\t.uleb128 0x3\n"    /* offset ebx */
#endif
    "\t.align " SZPTR "\n"
    ".LEFDE3:\n\n", (int)ctx->codesz - fcofs);
#endif
    break;
  /* Mental note: never let Apple design an assembler.
  ** Or a linker. Or a plastic case. But I digress.
  */
  case BUILD_machasm: {
#if LJ_HASFFI
    int fcsize = 0;
#endif
    int i;
    fprintf(ctx->fp, "\t.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support\n");
    fprintf(ctx->fp,
    "EH_frame1:\n"
    "\t.set L$set$x,LECIEX-LSCIEX\n"
    "\t.long L$set$x\n"
    "LSCIEX:\n"
    "\t.long 0\n"
    "\t.byte 0x1\n"
    "\t.ascii \"zPR\\0\"\n"
    "\t.byte 0x1\n"
    "\t.byte 128-" SZPTR "\n"
    "\t.byte " REG_RA "\n"
    "\t.byte 6\n"                /* augmentation length */
    "\t.byte 0x9b\n"            /* indirect|pcrel|sdata4 */
#if LJ_64
    "\t.long _lj_err_unwind_dwarf+4@GOTPCREL\n"
    "\t.byte 0x1b\n"            /* pcrel|sdata4 */
    "\t.byte 0xc\n\t.byte " REG_SP "\n\t.byte " SZPTR "\n"
#else
    "\t.long L_lj_err_unwind_dwarf$non_lazy_ptr-.\n"
    "\t.byte 0x1b\n"            /* pcrel|sdata4 */
    "\t.byte 0xc\n\t.byte 0x5\n\t.byte 0x4\n"  /* esp=5 on 32 bit MACH-O. */
#endif
    "\t.byte 0x80+" REG_RA "\n\t.byte 0x1\n"
    "\t.align " BSZPTR "\n"
    "LECIEX:\n\n");
    for (i = 0; i < ctx->nsym; i++) {
      const char *name = ctx->sym[i].name;
      int32_t size = ctx->sym[i+1].ofs - ctx->sym[i].ofs;
      if (size == 0) continue;
#if LJ_HASFFI
      if (!strcmp(name, "_lj_vm_ffi_call")) { fcsize = size; continue; }
#endif
      fprintf(ctx->fp,
      "%s.eh:\n"
      "LSFDE%d:\n"
      "\t.set L$set$%d,LEFDE%d-LASFDE%d\n"
      "\t.long L$set$%d\n"
      "LASFDE%d:\n"
      "\t.long LASFDE%d-EH_frame1\n"
      "\t.long %s-.\n"
      "\t.long %d\n"
      "\t.byte 0\n"                /* augmentation length */
      "\t.byte 0xe\n\t.byte %d\n"        /* def_cfa_offset */
#if LJ_64
      "\t.byte 0x86\n\t.byte 0x2\n"        /* offset rbp */
      "\t.byte 0x83\n\t.byte 0x3\n"        /* offset rbx */
      "\t.byte 0x8f\n\t.byte 0x4\n"        /* offset r15 */
      "\t.byte 0x8e\n\t.byte 0x5\n"        /* offset r14 */
#else
      "\t.byte 0x84\n\t.byte 0x2\n"        /* offset ebp (4 for MACH-O)*/
      "\t.byte 0x87\n\t.byte 0x3\n"        /* offset edi */
      "\t.byte 0x86\n\t.byte 0x4\n"        /* offset esi */
      "\t.byte 0x83\n\t.byte 0x5\n"        /* offset ebx */
#endif
      "\t.align " BSZPTR "\n"
      "LEFDE%d:\n\n",
      name, i, i, i, i, i, i, i, name, size, CFRAME_SIZE, i);
    }
#if LJ_HASFFI
    if (fcsize) {
      fprintf(ctx->fp,
      "EH_frame2:\n"
      "\t.set L$set$y,LECIEY-LSCIEY\n"
      "\t.long L$set$y\n"
      "LSCIEY:\n"
      "\t.long 0\n"
      "\t.byte 0x1\n"
      "\t.ascii \"zR\\0\"\n"
      "\t.byte 0x1\n"
      "\t.byte 128-" SZPTR "\n"
      "\t.byte " REG_RA "\n"
      "\t.byte 1\n"                /* augmentation length */
#if LJ_64
      "\t.byte 0x1b\n"            /* pcrel|sdata4 */
      "\t.byte 0xc\n\t.byte " REG_SP "\n\t.byte " SZPTR "\n"
#else
      "\t.byte 0x1b\n"            /* pcrel|sdata4 */
      "\t.byte 0xc\n\t.byte 0x5\n\t.byte 0x4\n"  /* esp=5 on 32 bit MACH. */
#endif
      "\t.byte 0x80+" REG_RA "\n\t.byte 0x1\n"
      "\t.align " BSZPTR "\n"
      "LECIEY:\n\n");
      fprintf(ctx->fp,
      "_lj_vm_ffi_call.eh:\n"
      "LSFDEY:\n"
      "\t.set L$set$yy,LEFDEY-LASFDEY\n"
      "\t.long L$set$yy\n"
      "LASFDEY:\n"
      "\t.long LASFDEY-EH_frame2\n"
      "\t.long _lj_vm_ffi_call-.\n"
      "\t.long %d\n"
      "\t.byte 0\n"                /* augmentation length */
#if LJ_64
      "\t.byte 0xe\n\t.byte 16\n"        /* def_cfa_offset */
      "\t.byte 0x86\n\t.byte 0x2\n"        /* offset rbp */
      "\t.byte 0xd\n\t.uleb128 0x6\n"    /* def_cfa_register rbp */
      "\t.byte 0x83\n\t.byte 0x3\n"        /* offset rbx */
#else
      "\t.byte 0xe\n\t.byte 8\n"        /* def_cfa_offset */
      "\t.byte 0x84\n\t.byte 0x2\n"        /* offset ebp (4 for MACH-O)*/
      "\t.byte 0xd\n\t.uleb128 0x4\n"    /* def_cfa_register ebp */
      "\t.byte 0x83\n\t.byte 0x3\n"        /* offset ebx */
#endif
      "\t.align " BSZPTR "\n"
      "LEFDEY:\n\n", fcsize);
    }
#endif
#if LJ_64
    fprintf(ctx->fp, "\t.subsections_via_symbols\n");
#else
    fprintf(ctx->fp,
      "\t.non_lazy_symbol_pointer\n"
      "L_lj_err_unwind_dwarf$non_lazy_ptr:\n"
      ".indirect_symbol _lj_err_unwind_dwarf\n"
      ".long 0\n");
#endif
    }
    break;
  default:  /* Difficult for other modes. */
    break;
  }
}