Jit in luajit
Luajit是一款高性能的lua解释器,与官方的lua解释器相比,luajit的高速除了将解释器直接以汇编代码实现外,还支持jit模式(Just in time)。Jit模式即将luajit的字节码编译成处理器能够直接执行的机器码,从而比解释执行速度更快。
Luajit存在97个字节码指令,例如 FORL指令对应一个数字类型的for循环语句,同时还有IFORL指令(强制解释模式执行)和JFORL指令(Jit模式执行),同时解释器实现了对各个字节码指令的翻译,这里以X86的翻译器为例。
Luajit以一段线性的指令序列(trace)为单位进行优化:当某条指令的地址被识别为热点(hot)后,开始跟踪记录其后执行的线性指令序列,并在跟踪结束时将该指令序列编译成机器码。但是luajit只在FUNCF、FORL、ITERL、LOOP这四个指令处进行热点检测并触发跟踪,即循环和函数的入口。例如,在解释执行FORL指令时:
case BC_FORL:|.if JIT| hotloop RB|.endif| // Fall through. Assumes BC_IFORL follows and ins_AJ is a no-op.
break;
它首先判断是否是JIT模式,如果是jit模式,则调用hotloop块进行热点判断,同样的,如果是FUNCF指令,则调用hotcall块:
case BC_FUNCF:|.if JIT| hotcall RB|.endif
case BC_FUNCV: /* NYI: compiled vararg functions. */| // Fall through. Assumes BC_IFUNCF/BC_IFUNCV follow and ins_AD is a no-op.
break;
hotloop块的定义如下:
|// Decrement hashed hotcount and trigger trace recorder if zero.
|// reg is used as scratch: it is overwritten with the counter offset
|// computed from PC.
|.macro hotloop, reg
| mov reg, PC
| shr reg, 1
| and reg, HOTCOUNT_PCMASK // reg = (PC >> 1) & HOTCOUNT_PCMASK
| sub word [DISPATCH+reg+GG_DISP2HOT], HOTCOUNT_LOOP // 16-bit counter at DISPATCH+reg+GG_DISP2HOT.
| jb ->vm_hotloop // Taken when the subtraction borrows (counter underflow).
|.endmacro
它将当前指令的地址右移一位,并与HOTCOUNT_PCMASK做与操作,得到一个索引(哈希运算),根据这个索引在数组中找到计数值,减去HOTCOUNT_LOOP,当减法发生借位(即计数值减到0以下)时,跳转到vm_hotloop继续执行。
|->vm_hotloop: // Hot loop counter underflow.
|.if JIT
| mov LFUNC:RB, [BASE-8] // Same as curr_topL(L).
| mov RB, LFUNC:RB->pc
| movzx RD, byte [RB+PC2PROTO(framesize)] // RD = framesize byte, zero-extended.
| lea RD, [BASE+RD*8] // RD = BASE + framesize*8.
| mov L:RB, SAVE_L
| mov L:RB->base, BASE
| mov L:RB->top, RD
| mov FCARG2, PC // 2nd argument: pc.
| lea FCARG1, [DISPATCH+GG_DISP2J] // 1st argument: address at DISPATCH+GG_DISP2J (passed as jit_State *J).
| mov aword [DISPATCH+DISPATCH_J(L)], L:RBa // Store L into the J->L slot before the call.
| mov SAVE_PC, PC
| call extern lj_trace_hot@8 // (jit_State *J, const BCIns *pc)
| jmp <3 // Backward reference to a local label 3 defined outside this excerpt.
|.endif
首先获取当前的函数,并得到它的字节码指针(LFUNC->pc),再从中读取framesize(栈帧大小)并保存到RD中,接着将top的位置(BASE + framesize*8)保存到RD中,在进行一些参数设置后,调用lj_trace_hot用于跟踪热点,该函数位于lj_trace.c中:
/* A hotcount triggered. Start recording a root trace. */
/*
** J:  JIT compiler state.
** pc: interpreter bytecode PC (one past the instruction that triggered).
** ERRNO_SAVE/ERRNO_RESTORE bracket the whole body.
*/
void LJ_FASTCALL lj_trace_hot(jit_State *J, const BCIns *pc)
{
  /* Note: pc is the interpreter bytecode PC here. It's offset by 1. */
  ERRNO_SAVE
  /* Reset hotcount. */
  hotcount_set(J2GG(J), pc, J->param[JIT_P_hotloop]*HOTCOUNT_LOOP);
  /* Only start a new trace if not recording or inside __gc call or vmevent. */
  if (J->state == LJ_TRACE_IDLE &&
      !(J2G(J)->hookmask & (HOOK_GC|HOOK_VMEVENT))) {
    J->parent = 0;  /* Root trace. */
    J->exitno = 0;
    J->state = LJ_TRACE_START;
    lj_trace_ins(J, pc-1);  /* pc-1 undoes the offset noted above. */
  }
  ERRNO_RESTORE
}
它将状态设置为LJ_TRACE_START后,开始调用lj_trace_ins进行热点跟踪:
/* A bytecode instruction is about to be executed. Record it. */
/*
** Stores the current pc, function and prototype in J, then runs the
** trace_state callback through lj_vm_cpcall.
*/
void lj_trace_ins(jit_State *J, const BCIns *pc)
{
  /* Note: J->L must already be set. pc is the true bytecode PC here. */
  J->pc = pc;
  J->fn = curr_func(J->L);
  /* J->pt is NULL when the current function is not a Lua function. */
  J->pt = isluafunc(J->fn) ? funcproto(J->fn) : NULL;
  /* On a non-zero result, switch to LJ_TRACE_ERR and call trace_state again. */
  while (lj_vm_cpcall(J->L, NULL, (void *)J, trace_state) != 0)
    J->state = LJ_TRACE_ERR;
}
这里的pc是指向的字节码指令,在循环中不断执行和跟踪,这里的跟踪通过trace_state函数实现,这个函数存在7种状态:
/* Trace compiler state. */
typedef enum {LJ_TRACE_IDLE, /* Trace compiler idle. */LJ_TRACE_ACTIVE = 0x10,LJ_TRACE_RECORD, /* Bytecode recording active. */LJ_TRACE_START, /* New trace started. */LJ_TRACE_END, /* End of trace. */LJ_TRACE_ASM, /* Assemble trace. */LJ_TRACE_ERR /* Trace aborted with error. */
} TraceState;
IDLE表示空闲、RECORD表示正在跟踪记录、END表示结束、ASM表示开始编译机器指令,这个状态转换函数的实现如下:
/* State machine for the trace compiler. Protected callback. */
/*
** ud carries the jit_State; dummy is unused. Each pass of the loop handles
** the current J->state. The loop repeats while the state is greater than
** LJ_TRACE_RECORD; the ASM and ERR cases return directly. Always returns NULL.
*/
static TValue *trace_state(lua_State *L, lua_CFunction dummy, void *ud)
{
  jit_State *J = (jit_State *)ud;
  UNUSED(dummy);
  do {
  retry:
    switch (J->state) {
    case LJ_TRACE_START:
      J->state = LJ_TRACE_RECORD;  /* trace_start() may change state. */
      trace_start(J);
      lj_dispatch_update(J2G(J));
      break;

    case LJ_TRACE_RECORD:
      trace_pendpatch(J, 0);
      setvmstate(J2G(J), RECORD);
      /*
      ** NOTE(review): no ',' separates the push statements from the tmptv
      ** restore below. Confirm against the lj_vmevent_send_ macro definition,
      ** which is not shown here.
      */
      lj_vmevent_send_(L, RECORD,
        /* Save/restore tmptv state for trace recorder. */
        TValue savetv = J2G(J)->tmptv;
        TValue savetv2 = J2G(J)->tmptv2;
        /* Pushed values: trace number, function, bytecode position (or -1), frame depth. */
        setintV(L->top++, J->cur.traceno);
        setfuncV(L, L->top++, J->fn);
        setintV(L->top++, J->pt ? (int32_t)proto_bcpos(J->pt, J->pc) : -1);
        setintV(L->top++, J->framedepth);
        J2G(J)->tmptv = savetv;
        J2G(J)->tmptv2 = savetv2;
      );
      lj_record_ins(J);
      break;

    case LJ_TRACE_END:
      trace_pendpatch(J, 1);
      J->loopref = 0;
      /* Loop optimization runs only when the flag is set, the trace links to itself and both depths sum to zero. */
      if ((J->flags & JIT_F_OPT_LOOP) &&
          J->cur.link == J->cur.traceno && J->framedepth + J->retdepth == 0) {
        setvmstate(J2G(J), OPT);
        lj_opt_dce(J);
        if (lj_opt_loop(J)) {  /* Loop optimization failed? */
          J->cur.link = 0;
          J->cur.linktype = LJ_TRLINK_NONE;
          J->loopref = J->cur.nins;
          J->state = LJ_TRACE_RECORD;  /* Try to continue recording. */
          break;
        }
        J->loopref = J->chain[IR_LOOP];  /* Needed by assembler. */
      }
      lj_opt_split(J);
      lj_opt_sink(J);
      /* Without a loop reference, mark the last snapshot's count as done. */
      if (!J->loopref) J->cur.snap[J->cur.nsnap-1].count = SNAPCOUNT_DONE;
      J->state = LJ_TRACE_ASM;
      break;

    case LJ_TRACE_ASM:
      setvmstate(J2G(J), ASM);
      lj_asm_trace(J, &J->cur);
      trace_stop(J);
      setvmstate(J2G(J), INTERP);
      J->state = LJ_TRACE_IDLE;
      lj_dispatch_update(J2G(J));
      return NULL;

    default:  /* Trace aborted asynchronously. */
      setintV(L->top++, (int32_t)LJ_TRERR_RECERR);
      /* fallthrough */
    case LJ_TRACE_ERR:
      trace_pendpatch(J, 1);
      /* A non-zero result from trace_abort() re-enters the switch. */
      if (trace_abort(J))
        goto retry;
      setvmstate(J2G(J), INTERP);
      J->state = LJ_TRACE_IDLE;
      lj_dispatch_update(J2G(J));
      return NULL;
    }
  } while (J->state > LJ_TRACE_RECORD);
  return NULL;
}
它根据不同的状态执行不同的操作函数,我们可以简化为:
/* State machine for the trace compiler. Protected callback. */
/*
** Simplified listing: only the state transitions are kept.
** START -> RECORD, END -> ASM, ASM -> IDLE, ERR -> IDLE (or retry).
*/
static TValue *trace_state(lua_State *L, lua_CFunction dummy, void *ud)
{
  jit_State *J = (jit_State *)ud;
  UNUSED(dummy);
  do {
  retry:
    switch (J->state) {
    case LJ_TRACE_START:
      J->state = LJ_TRACE_RECORD;  /* trace_start() may change state. */
      trace_start(J);
      lj_dispatch_update(J2G(J));
      break;

    case LJ_TRACE_RECORD:
      /* Record the current instruction; the state is not changed here. */
      lj_record_ins(J);
      break;

    case LJ_TRACE_END:
      trace_pendpatch(J, 1);
      J->state = LJ_TRACE_ASM;
      break;

    case LJ_TRACE_ASM:
      setvmstate(J2G(J), ASM);
      lj_asm_trace(J, &J->cur);
      trace_stop(J);
      setvmstate(J2G(J), INTERP);
      J->state = LJ_TRACE_IDLE;
      lj_dispatch_update(J2G(J));
      return NULL;

    default:  /* Trace aborted asynchronously. */
      setintV(L->top++, (int32_t)LJ_TRERR_RECERR);
      /* fallthrough */
    case LJ_TRACE_ERR:
      trace_pendpatch(J, 1);
      /* A non-zero result from trace_abort() re-enters the switch. */
      if (trace_abort(J))
        goto retry;
      setvmstate(J2G(J), INTERP);
      J->state = LJ_TRACE_IDLE;
      lj_dispatch_update(J2G(J));
      return NULL;
    }
  } while (J->state > LJ_TRACE_RECORD);  /* Repeat only for states past RECORD. */
  return NULL;
}
trace_start用于初始化trace结构、分配traceno等(traceno是trace数组的下标)。其中比较重要的是lj_record_ins函数,它用于记录一个字节码指令,并将其保存为SSA形式的中间代码(IR)。IR的定义在lj_ir.h中:
/* -- IR instructions ----------------------------------------------------- */
/* IR instruction definition. Order matters, see below. ORDER IR */
#define IRDEF(_) \/* Guarded assertions. */ \/* Must be properly aligned to flip opposites (^1) and (un)ordered (^4). */ \_(LT, N , ref, ref) \_(GE, N , ref, ref) \_(LE, N , ref, ref) \_(GT, N , ref, ref) \\_(ULT, N , ref, ref) \_(UGE, N , ref, ref) \_(ULE, N , ref, ref) \_(UGT, N , ref, ref) \\_(EQ, C , ref, ref) \_(NE, C , ref, ref) \\_(ABC, N , ref, ref) \_(RETF, S , ref, ref) \\/* Miscellaneous ops. */ \_(NOP, N , ___, ___) \_(BASE, N , lit, lit) \_(PVAL, N , lit, ___) \_(GCSTEP, S , ___, ___) \_(HIOP, S , ref, ref) \_(LOOP, S , ___, ___) \_(USE, S , ref, ___) \_(PHI, S , ref, ref) \_(RENAME, S , ref, lit) \_(PROF, S , ___, ___) \\/* Constants. */ \_(KPRI, N , ___, ___) \_(KINT, N , cst, ___) \_(KGC, N , cst, ___) \_(KPTR, N , cst, ___) \_(KKPTR, N , cst, ___) \_(KNULL, N , cst, ___) \_(KNUM, N , cst, ___) \_(KINT64, N , cst, ___) \_(KSLOT, N , ref, lit) \\/* Bit ops. */ \_(BNOT, N , ref, ___) \_(BSWAP, N , ref, ___) \_(BAND, C , ref, ref) \_(BOR, C , ref, ref) \_(BXOR, C , ref, ref) \_(BSHL, N , ref, ref) \_(BSHR, N , ref, ref) \_(BSAR, N , ref, ref) \_(BROL, N , ref, ref) \_(BROR, N , ref, ref) \\/* Arithmetic ops. ORDER ARITH */ \_(ADD, C , ref, ref) \_(SUB, N , ref, ref) \_(MUL, C , ref, ref) \_(DIV, N , ref, ref) \_(MOD, N , ref, ref) \_(POW, N , ref, ref) \_(NEG, N , ref, ref) \\_(ABS, N , ref, ref) \_(ATAN2, N , ref, ref) \_(LDEXP, N , ref, ref) \_(MIN, C , ref, ref) \_(MAX, C , ref, ref) \_(FPMATH, N , ref, lit) \\/* Overflow-checking arithmetic ops. */ \_(ADDOV, CW, ref, ref) \_(SUBOV, NW, ref, ref) \_(MULOV, CW, ref, ref) \\/* Memory ops. A = array, H = hash, U = upvalue, F = field, S = stack. */ \\/* Memory references. */ \_(AREF, R , ref, ref) \_(HREFK, R , ref, ref) \_(HREF, L , ref, ref) \_(NEWREF, S , ref, ref) \_(UREFO, LW, ref, lit) \_(UREFC, LW, ref, lit) \_(FREF, R , ref, lit) \_(STRREF, N , ref, ref) \_(LREF, L , ___, ___) \\/* Loads and Stores. These must be in the same order. 
*/ \_(ALOAD, L , ref, ___) \_(HLOAD, L , ref, ___) \_(ULOAD, L , ref, ___) \_(FLOAD, L , ref, lit) \_(XLOAD, L , ref, lit) \_(SLOAD, L , lit, lit) \_(VLOAD, L , ref, ___) \\_(ASTORE, S , ref, ref) \_(HSTORE, S , ref, ref) \_(USTORE, S , ref, ref) \_(FSTORE, S , ref, ref) \_(XSTORE, S , ref, ref) \\/* Allocations. */ \_(SNEW, N , ref, ref) /* CSE is ok, not marked as A. */ \_(XSNEW, A , ref, ref) \_(TNEW, AW, lit, lit) \_(TDUP, AW, ref, ___) \_(CNEW, AW, ref, ref) \_(CNEWI, NW, ref, ref) /* CSE is ok, not marked as A. */ \\/* Buffer operations. */ \_(BUFHDR, L , ref, lit) \_(BUFPUT, L , ref, ref) \_(BUFSTR, A , ref, ref) \\/* Barriers. */ \_(TBAR, S , ref, ___) \_(OBAR, S , ref, ref) \_(XBAR, S , ___, ___) \\/* Type conversions. */ \_(CONV, NW, ref, lit) \_(TOBIT, N , ref, ref) \_(TOSTR, N , ref, lit) \_(STRTO, N , ref, ___) \\/* Calls. */ \_(CALLN, N , ref, lit) \_(CALLA, A , ref, lit) \_(CALLL, L , ref, lit) \_(CALLS, S , ref, lit) \_(CALLXS, S , ref, ref) \_(CARG, N , ref, ref) \\/* End of list. */
多种情况都会导致记录结束,例如遇到了已经编译的指令。在LJ_TRACE_ASM状态下会调用lj_asm_trace函数进行代码的编译操作,该函数位于lj_asm.c中,函数中有一个循环如下:
/* Assemble a trace in linear backwards order. */for (as->curins--; as->curins > as->stopins; as->curins--) {IRIns *ir = IR(as->curins);lua_assert(!(LJ_32 && irt_isint64(ir->t))); /* Handled by SPLIT. */if (!ra_used(ir) && !ir_sideeff(ir) && (as->flags & JIT_F_OPT_DCE))
continue; /* Dead-code elimination can be soooo easy. */if (irt_isguard(ir->t))
asm_snap_prep(as);RA_DBG_REF();checkmclim(as);asm_ir(as, ir);}
它调用asm_ir将所有的ir指令转换成机器码,在lj_asm_trace函数后,接着调用trace_stop函数结束一个跟踪,该函数实现如下:
/* Stop tracing. */
/*
** Patches the bytecode (or the parent trace) according to the opcode of the
** trace's starting instruction, links the new trace into the matching chain,
** then commits the machine code, saves the trace and sends a "stop" event.
*/
static void trace_stop(jit_State *J)
{
  BCIns *pc = mref(J->cur.startpc, BCIns);
  BCOp op = bc_op(J->cur.startins);
  GCproto *pt = &gcref(J->cur.startpt)->pt;
  TraceNo traceno = J->cur.traceno;
  GCtrace *T = J->curfinal;
  lua_State *L;

  switch (op) {
  case BC_FORL:
    setbc_op(pc+bc_j(J->cur.startins), BC_JFORI);  /* Patch FORI, too. */
    /* fallthrough */
  case BC_LOOP:
  case BC_ITERL:
  case BC_FUNCF:
    /* Patch bytecode of starting instruction in root trace. */
    /*
    ** The new opcode is op shifted by the distance BC_JLOOP - BC_LOOP.
    ** NOTE(review): presumably each J* opcode sits at that same offset from
    ** its base opcode in the opcode enum; confirm against lj_bc.h.
    */
    setbc_op(pc, (int)op+(int)BC_JLOOP-(int)BC_LOOP);
    setbc_d(pc, traceno);  /* Operand D now holds the trace number. */
  addroot:
    /* Add to root trace chain in prototype. */
    J->cur.nextroot = pt->trace;
    pt->trace = (TraceNo1)traceno;
    break;
  case BC_RET:
  case BC_RET0:
  case BC_RET1:
    /* Replace the whole instruction with a JLOOP for this trace. */
    *pc = BCINS_AD(BC_JLOOP, J->cur.snap[0].nslots, traceno);
    goto addroot;
  case BC_JMP:
    /* Patch exit branch in parent to side trace entry. */
    lua_assert(J->parent != 0 && J->cur.root != 0);
    lj_asm_patchexit(J, traceref(J, J->parent), J->exitno, J->cur.mcode);
    /* Avoid compiling a side trace twice (stack resizing uses parent exit). */
    traceref(J, J->parent)->snap[J->exitno].count = SNAPCOUNT_DONE;
    /* Add to side trace chain in root trace. */
    {
      GCtrace *root = traceref(J, J->cur.root);
      root->nchild++;
      J->cur.nextside = root->nextside;
      root->nextside = (TraceNo1)traceno;
    }
    break;
  case BC_CALLM:
  case BC_CALL:
  case BC_ITERC:
    /* Trace stitching: patch link of previous trace. */
    traceref(J, J->exitno)->link = traceno;
    break;
  default:
    /* Any other starting opcode is unexpected here. */
    lua_assert(0);
    break;
  }

  /* Commit new mcode only after all patching is done. */
  lj_mcode_commit(J, J->cur.mcode);
  J->postproc = LJ_POST_NONE;
  trace_save(J, T);

  L = J->L;
  /* Event arguments: the string "stop", the trace number and the function. */
  lj_vmevent_send(L, TRACE,
    setstrV(L, L->top++, lj_str_newlit(L, "stop"));
    setintV(L->top++, traceno);
    setfuncV(L, L->top++, J->fn);
  );
}
它通过如下两个函数:
setbc_op(pc, (int)op+(int)BC_JLOOP-(int)BC_LOOP);
setbc_d(pc, traceno);
重新设置指令的opcode,即J_op = op + BC_JLOOP - BC_LOOP。这个计算依赖于lj_bc.h中各指令的排列顺序,因此如果将lj_bc.h中的指令随意打乱,会影响到这里的正确性。
修改后的指令为:j_op traceno
同时可以看到pt->trace字段记录的是一个traceno
pt->trace = (TraceNo1)traceno;
那么接下来看解释器中对JFORL的实现:
case BC_JFORI:case BC_JFORL:
#if !LJ_HASJITbreak;
#endifcase BC_FORI:case BC_IFORL:vk = (op == BC_IFORL || op == BC_JFORL);| ins_AJ // RA = base, RD = target (after end of loop or start of loop)| lea RA, [BASE+RA*8]if (LJ_DUALNUM) {| cmp FOR_TIDX, LJ_TISNUM; jne >9if (!vk) {| cmp FOR_TSTOP, LJ_TISNUM; jne ->vmeta_for| cmp FOR_TSTEP, LJ_TISNUM; jne ->vmeta_for| mov RB, dword FOR_IDX| cmp dword FOR_STEP, 0; jl >5} else {
#ifdef LUA_USE_ASSERT| cmp FOR_TSTOP, LJ_TISNUM; jne ->assert_bad_for_arg_type| cmp FOR_TSTEP, LJ_TISNUM; jne ->assert_bad_for_arg_type
#endif| mov RB, dword FOR_STEP| test RB, RB; js >5| add RB, dword FOR_IDX; jo >1| mov dword FOR_IDX, RB}| cmp RB, dword FOR_STOP| mov FOR_TEXT, LJ_TISNUM| mov dword FOR_EXT, RBif (op == BC_FORI) {| jle >7|1:|6:| branchPC RD} else if (op == BC_JFORI) {| branchPC RD| movzx RD, PC_RD| jle =>BC_JLOOP|1:|6:} else if (op == BC_IFORL) {| jg >7|6:| branchPC RD|1:} else {| jle =>BC_JLOOP|1:|6:}
当op = JFORL时,如果循环条件仍然满足(jle),则跳转到BC_JLOOP,如下:
case BC_JLOOP:|.if JIT| ins_AD // RA = base (ignored), RD = traceno| mov RA, [DISPATCH+DISPATCH_J(trace)]| mov TRACE:RD, [RA+RD*4]| mov RDa, TRACE:RD->mcode| mov L:RB, SAVE_L| mov [DISPATCH+DISPATCH_GL(jit_base)], BASE| mov [DISPATCH+DISPATCH_GL(tmpbuf.L)], L:RB| // Save additional callee-save registers only used in compiled code.|.if X64WIN| mov TMPQ, r12| mov TMPa, r13| mov CSAVE_4, r14| mov CSAVE_3, r15| mov RAa, rsp| sub rsp, 9*16+4*8| movdqa [RAa], xmm6| movdqa [RAa-1*16], xmm7| movdqa [RAa-2*16], xmm8| movdqa [RAa-3*16], xmm9| movdqa [RAa-4*16], xmm10| movdqa [RAa-5*16], xmm11| movdqa [RAa-6*16], xmm12| movdqa [RAa-7*16], xmm13| movdqa [RAa-8*16], xmm14| movdqa [RAa-9*16], xmm15|.elif X64| mov TMPQ, r12| mov TMPa, r13| sub rsp, 16|.endif| jmp RDa|.endif
break;
先根据RD中保存的traceno获取到trace结构,并将trace结构中保存的机器码地址(mcode)赋值到RDa中,进行堆栈转换后,jmp RDa直接跳转到机器码处执行。
在x86中,当一个字节码执行结束,继续执行下一个字节码时,都会使用ins_NEXT宏,它的定义如下:
|// Fetch the instruction at PC, split out its fields, advance PC and
|// jump through the dispatch table entry selected by the opcode.
|// NOTE(review): RCL/RCH are presumably the low/high byte of RC's low
|// 16 bits; confirm against the register definitions.
|.macro ins_NEXT
| mov RC, [PC]
| movzx RA, RCH // RA = zero-extended RCH.
| movzx OP, RCL // OP = zero-extended RCL.
| add PC, 4 // Advance PC by 4 bytes.
| shr RC, 16 // Keep only the bits above the low 16 in RC.
|.if X64
| jmp aword [DISPATCH+OP*8] // Table entries are 8 bytes apart on x64.
|.else
| jmp aword [DISPATCH+OP*4] // Table entries are 4 bytes apart otherwise.
|.endif
|.endmacro
它从PC指向的字节码中获取了opcode,并跳转到DISPATCH + OP*4(x64下为OP*8)指向的地址执行,可以看出OP实质上是分发表(数组)的下标。在跟踪记录阶段,这些数组元素都指向了vm_record汇编块:
|->vm_record: // Dispatch target for recording phase.|.if JIT| movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]| test RDL, HOOK_VMEVENT // No recording while in vmevent.| jnz >5| // Decrement the hookcount for consistency, but always do the call.| test RDL, HOOK_ACTIVE| jnz >1| test RDL, LUA_MASKLINE|LUA_MASKCOUNT| jz >1| dec dword [DISPATCH+DISPATCH_GL(hookcount)]| jmp >1|.endif||->vm_rethook: // Dispatch target for return hooks.| movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]| test RDL, HOOK_ACTIVE // Hook already active?| jnz >5| jmp >1||->vm_inshook: // Dispatch target for instr/line hooks.| movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]| test RDL, HOOK_ACTIVE // Hook already active?| jnz >5|| test RDL, LUA_MASKLINE|LUA_MASKCOUNT| jz >5| dec dword [DISPATCH+DISPATCH_GL(hookcount)]| jz >1| test RDL, LUA_MASKLINE| jz >5|1:| mov L:RB, SAVE_L| mov L:RB->base, BASE| mov FCARG2, PC // Caveat: FCARG2 == BASE| mov FCARG1, L:RB| // SAVE_PC must hold the _previous_ PC. The callee updates it with PC.| call extern lj_dispatch_ins@8 // (lua_State *L, const BCIns *pc)|3:| mov BASE, L:RB->base|4:| movzx RA, PC_RA|5:| movzx OP, PC_OP| movzx RD, PC_RD|.if X64| jmp aword [DISPATCH+OP*8+GG_DISP2STATIC] // Re-dispatch to static ins.|.else| jmp aword [DISPATCH+OP*4+GG_DISP2STATIC] // Re-dispatch to static ins.|.endif
调用lj_dispatch_ins后,最终跳转到DISPATCH+OP*4+GG_DISP2STATIC这个地址继续执行,这个地址正是每个opcode对应的解释器汇编块。
Jit的正常运行还涉及堆栈状态的转换、jit模式到解释模式的跳转等(SSA守护代码),远不止这些。