【lua学习】3.字符串
- Lua字符串的概况
- 字符串实现
- 字符串结构TString
- 全局字符串表stringtable
- 新建字符串luaS_newlstr (先查表,再决定创建与否)
- 新建字符串 newlstr
- 重新设置全局字符串的大小 luaS_resize
- 全局字符串表的缩容
- 保留字是如何不被回收的
Lua字符串的概况
- Lua虚拟机中存在一个散列桶结构的全局字符串表来存放所有字符串
- 关于比较字符串:先比较hash,再比较长度,再逐字符比较。节省时间
- 同一个字符串在Lua虚拟机中仅有一个副本。节省空间
- 一旦创建则无法变更
- 变量存放的仅是字符串的引用
字符串实现
字符串结构TString
(lobject.h) TString
typedef union TString {L_Umaxalign dummy;//保证最大对其//见下文struct {CommonHeader;lu_byte reserved;//当>0时,其值-1表示保留字列表中的索引//见下文unsigned int hash;//字符串散列值,根据字符串长度和部分字符计算而来的值,见下文size_t len;//字符串长度} tsv;
} TString;
(llimits.h) L_Umaxalign
typedef LUAI_USER_ALIGNMENT_T L_Umaxalign;//LUAI_USER_ALIGNMENT_T见下文
(luaconf.h) LUAI_USER_ALIGNMENT_T
//看此定义,为8字节对齐
#define LUAI_USER_ALIGNMENT_T union { double u; void* s; long l; }
全局字符串表stringtable
(lstate.h) stringtable
typedef struct stringtable {GCObject** hash;//开散列结构(和lua table的闭散列结构是有区别的),指向一个数组,每个元素是桶(GCObject*类型),桶管理GCObject链表lu_int32 nuse;//存储的字符串数量int size;//全局字符串表的最大容量(hash桶的最大数量)
} stringtable;
新建字符串luaS_newlstr (先查表,再决定创建与否)
(lstring.c) luaS_newlstr
TString* luaS_newlstr(lua_State* L, const char *str, size_t len)
{//初始h值就是字符串的长度unsigned int h = cast(unsigned int, len);//cast就是强制转型,见下文//获得计算hash值的跨度,如果字符串很长,若逐位计算肯定非常消耗性能size_t step = (len>>5) + 1;//从最后一个字符开始,计算h值,跟后续计算的值执行异或,进而得到最终的h值for (size_t l1 = len; l1 >= step; l1 -= step){h ^= ((h<<5)+(h>>2)+cast(unsigned char, str[l1-1]));}//h值对全局字符串表的最大桶数量求余,得到桶的索引unsigned int bucket_index = lmod(h, G(L)->strt.size);//lmod见下文,G见下文//遍历该桶管理的链表,查找有没有相等的字符串for (GCObject* gco = G(L)->strt.hash[bucket_index];gco != NULL;gco = gco->gch.next){TString* ts = rawgco2ts(gco);//rawgco2ts见下文if (ts->tsv.len == len//先比较长度&& (memcmp(str, getstr(ts), len) == 0))//再逐位比较。getstr见下文{if (isdead(G(L), gco))//若要被GC,则把标记标为另一种白色,防止被GC。isdead见下文{changewhite(gco);}//找到了,就不需要新建了,直接返回即可return ts;}}//新建字符串return newlstr(L, str, len, h);
}
(llimits.h) cast宏
#define cast(t, exp) ((t)(exp))
(lobject.h) lmod宏
//针对size为2次幂的 优化的 取模算法
#define lmod(s,size) \(check_exp((size&(size-1))==0, (cast(int, (s) & ((size)-1)))))
(lstate.h) G宏
#define G(L) (L->l_G)
(lstate.h) rawgco2ts宏
//根据GCObject*获取TString*
#define rawgco2ts(gco) check_exp((gco)->gch.tt == LUA_TSTRING, &((gco)->ts))
(lobject.h) getstr宏
//根据TString* 获取 字符串的首地址,注意:字符串首地址并不在TString内部,而在TString对象最后一个字节的下一个字节,这也解释了为何TString一定要对齐,就是为了提高CPU读取性能
#define getstr(ts) cast(const char *, (ts) + 1)
(lgc.h) isdead宏
//判断是否在当前GC阶段被判定为需要回收,todo以后讨论
#define isdead(g, gco) ((gco)->gch.marked & otherwhite(g) & WHITEBITS)
(lgc.h) changewhite宏
//改变GCObject的当前白色标记,todo以后讨论
#define changewhite(gco) ((gco)->gch.marked ^= WHITEBITS)
新建字符串 newlstr
(lstring.c) newlstr
static TString* newlstr(lua_State* L, const char* str, size_t len, unsigned int h)
{//若字符串太长,则luaM_toobig。luaM_toobig见下文if (len + 1 > (MAX_SIZE - sizeof(TString)/sizeof(char)){luaM_toobig(L);}//为TString对象分配连续的空间,这个空间首部是TString结构,后面紧接着是字符串实际内容TString* ts = cast(TString*, luaM_malloc(L, (len + 1)*sizeof(char) + sizeof(TString)));ts.tsv.len = len;ts.tsv.hash = h;ts.tsv.marked = luaC_white(G(L));ts.tsv.reserved = 0;//获取字符串内容的首地址pstrchar* pstr = (char*)(ts + 1);//拷贝str到pstrmemcpy(pstr , str, len * sizeof(char));//字符串最后一个字符当然是'\0'pstr[len] = '\0';//获取全局字符串表stringtable* tb = &G(L)->strt;//计算桶索引h = lmod(h, tb->size);//用头插法将字符串插入桶中ts->tsv.next = tb->hash[h];tb->hash[h] = obj2gco(ts);//obj2gco见下文//全局字符串表的字符串数量+1tb->nuse++;//若字符串总数 超过了 全局字符串表的最大桶数 且 最大桶数 <= MAX_INT/2,则对全局字符串表扩容if (tb->nuse > cast(lu_int32, tb->size) && tb->size <= MAX_INT/2){luaS_resize(L, tb->size*2);//luaS_resize,重新分配全局字符串表的大小,见下文}
}
(llimits.h) MAX_SIZET宏
#define MAX_SIZET ((size_t)(~(size_t)0)-2)
(lmem.c) luaM_toobig报告要分配的内存过大
void* luaM_toobig(lua_State* L)
{//luaG_runerror, todo后面讨论luaG_runerror(L, "memory allocation error: blobk too big");return NULL;
}
(lmem.h) luaM_malloc宏
//请求分配needbytes字节内存,luaM_realloc_见下文
#define luaM_malloc(L, bytes_to_allocate) luaM_realloc_(L, NULL, 0, (bytes_to_allocate))
(lmem.c) luaM_realloc_分配内存
void* luaM_relloc_(lua_State* L, void* address_to_free, size_t bytes_to_free, size_t bytes_to_allocate)
{lua_assert((bytes_to_free==0)==(address_to_free==NULL));//#llimits.h中define lua_assert(c) ((void)0) 什么也不做,所以忽略global_State* g = G(L);//调用全局表的内存分配函数void* address_to_allocate = (*g->frealloc)(g->ud, address_to_free, bytes_to_free, bytes_to_allocate);//若分配失败且需要的字节数>0,抛出内存分配错误if (address_to_allocate == NULL && bytes_to_allocate> 0){//luaD_throw,todo后面讨论luaD_throw(L, LUA_ERRMEM);//lua.h中#define LUA_ERRMEM 4}lua_assert((bytes_to_allocate==0)==(address_to_allocate==NULL));//分配的内存总字节数 发生变化g->totalbytes += bytes_to_allocate - bytes_to_free;return address_to_allocate;
}
(lgc.h) luaC_white获取当前GC白色
//获取当前gc的白色,todo后面讨论
#define luaC_white(g) cast(lu_byte, (g)->currentwhite & WHITEBITS)
(lstate.h) obj2gco宏
//将对象指针强制转为GCObject*
#define obj2gco(v) (cast(GCObject *, (v)))
(llimits.h) MAX_INT宏
#define MAX_INT (INT_MAX-2)
重新设置全局字符串的大小 luaS_resize
(lstring.c) luaS_resize
void luaS_resize(lua_State* L, int newsize)
{//若GC正处于扫描字符串阶段,则不处理。GCSweepingstring见下文if (G(L)->gcstate == GCSweepingstring){return;}//新分配hash结构GCObject** newhash = luaM_newvector(L, newsize, GCObject*);//初始化每个桶为空指针for (int i = 0; i < newsize; i++){newhash[i] = NULL;}//获取全局字符串表指针stringtable* tb = &G(L)->strt;//遍历每个桶,遍历每个桶管理的链表,全部夺舍到新的hash结构中for (int i = 0; i < tb.size; i+){GCObject* p = tb->hash[i];//遍历桶管理的链表while(p){//以next指向下一个元素GCObject* next = p->gch.next;//根据的hash计算新的 hashunsigned int oldh = gco2ts(p)->hash;int newh = lmod(oldh, newsize);lua_assert(cast_int(oldh%newsize)==lmod(oldh,newsize))//用头插法将元素加入桶管理的链表p->gch.next = newhash[newh];newhash[newh] = p;//p设为next,以便循环的下一轮p = next;}}//释放旧的hash结构luaM_freearray(L, tb->hash, tb_size, TString*);//更新全局字符串表tb->size = newsize;tb->hash = newhash;
}
(lgc.h) GCSsweepstring宏
//gc的几个阶段,todo后面再说
#define GCSpause 0
#define GCSpropagate 1
#define GCSsweepstring 2
#define GCSsweep 3
#define GCSfinalize 4
(lmem.h) luaM_newvector宏
//分配count个类型为datatype的连续内存空间,获得的数据强制转型为datatype*
#define luaM_newvector(L,count_to_allocate,datatype) \cast(datatype*, luaM_reallocv(L, NULL, 0, count_to_allocate, sizeof(datatype)))
(lmem.h) luaM_reallocv宏
//分配count个singlebytes大小的连续内存空间,若空间足够则分配,否则报错
#define luaM_reallocv(L,address_to_free,count_to_free,count_to_allocate,singlebytes) \((cast(size_t, (count)+1) <= MAX_SIZET/(singlebytes)) ? luaM_realloc_(L, (address_to_free), (count_to_free)*(singlebytes), (count_to_allocate)*(singlebytes)) : \luaM_toobig(L))
(lmem.h) luaM_freearray宏
//释放address_to_free处的count_to_free个datatype类型的连续内存空间
#define luaM_freearray(L, address_to_free, count_to_free, datatype) luaM_reallocv(L, (address_to_free), count_to_free, 0, sizeof(datatype))
全局字符串表的缩容
- 缩容的时机:垃圾回收的GCSweep阶段
- 缩容的原则:全局字符串表的字符串总数<桶的最大数量 且 桶的最大数量>MINSTRTABSIZE*2 (llimits.h中#define MINSTRTABSIZE 32)
(lgc.c) 看checkSize
static void checkSize(lua_State* L)
{global_State* g = G(L);//当全局字符串表的字符串总数小于桶最大数量的四分之一 且 桶的最大数量大于MINSTRTABSIZE*2, 则缩容if (g-<strt.nuse < cast(lu_int32, g->strt.size/4)&& g->strt.size > MINSTRTABSIZE*2){luaS_resize(L, g->strt.size/2);}//...无关内容省略
}
(lgc.c) 看singlestep
//一次单步GC,todo后面再说
static l_mem singlestep(lua_State* L)
{global_State* g = G(L);switch(g->gcstate){//...无关内容省略case GCSweep:{lu_mem old = g->totalbytes;g->sweepgc = sweeplist(L, g->sweepgc, GCSWEEPMAX);if (*g->sweepgc == NULL){//包含有全局字符串表缩容的操作checkSizes(L);g->gcstate = GCSfinalize;}lua_assert(old >= g->totalbytes);g->estimate -= old - g->totalbytes;return GCSWEEPMAX*GCSWEEPCOST;}//...无关内容省略}//...无关内容省略
}
保留字是如何不被回收的
- 不被回收的原则:被luaS_fix操作,其tsv.marked被改成了FIXEDBIT
(llex.c) luaX_init
void luaX_init(lua_State* L)
{for (int i=0; i<NUM_RESERVED; i++)//NUM_RESERVED见下文{//尝试新建每个保留字字符串TString* ts = luaS_new(L, luaX_tokens[i]);//luaS_new luaX_tokens 见下文//标记不会被GC,修改ts->tsv.marked为FIXEDBITluaS_fix(ts);//luaS_fix见下文lua_assert(strlen(luaX_tokens[i]) + 1 <= TOKEN_LEN);//TOKEN_LEN见下文//记录在保留字数组的索引+1值ts->tsv.reserved = cast_byte(i + 1);}
}
(llex.h) NUM_RESERVED宏
//表示多少个保留字
//TK_WHILE FIRST_RESERVED 见下文
#define NUM_RESERVED (cast(int, TK_WHILE-FIRST_RESERVED+1))
(llex.h) RESERVED 枚举
enum RESERVED {/* terminal symbols denoted by reserved words */TK_AND = FIRST_RESERVED, TK_BREAK,TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION,TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT,TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE,/* other terminal symbols */TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE, TK_NUMBER,TK_NAME, TK_STRING, TK_EOS
};
(lstring.h) luaS_new宏
//尝试新建一个字符串
#define luaS_new(L, s) (luaS_newlstr(L, s, strlen(s)))
(llex.c) luaX_tokens
//终结符数组
const char *const luaX_tokens [] = {"and", "break", "do", "else", "elseif","end", "false", "for", "function", "if","in", "local", "nil", "not", "or", "repeat","return", "then", "true", "until", "while","..", "...", "==", ">=", "<=", "~=","<number>", "<name>", "<string>", "<eof>",NULL
};
(lstring.h) luaS_fix宏
//设置字符串不会被GC
//l_setbit FIXEDBIT见下文
#define luaS_fix(s) l_setbit((s)->tsv.marked, FIXEDBIT)
(lgc.h) l_setbit宏
//x与m求或
#define setbits(x,m) ((x) |= (m))
//求2的b-1次方,也就是第b位为1,其余为0
#define bitmask(b) (1<<(b))
//将b1和b2位设为1,其余为0
#define bit2mask(b1,b2) (bitmask(b1) | bitmask(b2))
//将x的第b位置为1
#define l_setbit(x,b) setbits(x, bitmask(b))
(lgc.h) FIXEDBIT宏
//todo后面再说
#define WHITE0BIT 0
#define WHITE1BIT 1
#define BLACKBIT 2
#define FINALIZEDBIT 3
#define KEYWEAKBIT 3
#define VALUEWEAKBIT 4
#define FIXEDBIT 5
#define SFIXEDBIT 6
#define WHITEBITS bit2mask(WHITE0BIT, WHITE1BIT)
(llex.h) TOKEN_LEN宏
//保留字选function为最长
#define TOKEN_LEN (sizeof("function")/sizeof(char))