fid介绍
fid是Lustre文件系统中文件(对象)的唯一标识,总共128位,由三部分组成:64位的fid序列号(f_seq)、32位的序列内编号(f_oid)、32位的fid版本号(f_ver,目前未使用,默认为0)
/**
 * File IDentifier.
 *
 * A FID identifies a file or an object (stripe) uniquely across the whole
 * cluster. A FID is never reused.
 *
 * Layout: 64-bit sequence + 32-bit object id + 32-bit version = 128 bits.
 */
struct lu_fid {
	/**
	 * FID sequence. The sequence is the unit of migration: every file
	 * (object) whose FID belongs to a given sequence is stored on the
	 * same server. Lustre should support 2^64 objects, so even with a
	 * single object per sequence 2^64 objects can still be enumerated.
	 */
	__u64 f_seq;

	/** Object number of this FID inside its sequence. */
	__u32 f_oid;

	/**
	 * FID version, meant to tell apart different versions (snapshots
	 * and the like) of the same file system object. Not currently used.
	 */
	__u32 f_ver;
};
fid获取流程
fld:fid location database
sequence controller: 运行在MDT0上,拥有全量的fld信息
sequence server:运行在每个MDT(包括MDT0,见下文mdt_seq_init)和OST上,各个sequence server持有的sequence范围互不重叠,是sequence controller所管理范围的子集
sequence client:每个客户端在挂载文件系统时会提前申请一部分sequence,每个客户端拿到的sequence不会有重叠
管理fid范围的结构体:
/**
 * Describes a range of sequences as the half-open interval
 * [lsr_start, lsr_end): lsr_start belongs to the range, lsr_end does not.
 *
 * The same structure is used in the fld module, where lsr_index holds the
 * mdt id of the home mdt.
 */
struct lu_seq_range {
	/* First sequence number of the range (inclusive). */
	__u64 lsr_start;
	/* End of the range (exclusive). */
	__u64 lsr_end;
	/* In the fld module: mdt id of the home mdt. */
	__u32 lsr_index;
	/* Range flags; meaning not shown in this excerpt. */
	__u32 lsr_flags;
};
在同一个MDT上创建的文件,如果序列号未使用完,则这些文件的序列号相同,fid序列内编号依次递增。
如果序列号使用完,则客户端会向服务端申请下一批序列号
例:假设同一客户端依次在MDT0上创建test1和test2,那么test1的fid为[0x20001:0x1:0x0],test2的fid为[0x20001:0x2:0x0](序列号相同,序列内编号递增)
fid申请流程
服务端初始化阶段
mdt:
//运行于mdt上的sequnce服务
static int mdt_seq_init(const struct lu_env *env, struct mdt_device *mdt)
{struct seq_server_site *ss;int rc;ENTRY;ss = mdt_seq_site(mdt);/* init sequence controller server(MDT0) */if (ss->ss_node_id == 0) {OBD_ALLOC_PTR(ss->ss_control_seq);if (ss->ss_control_seq == NULL)RETURN(-ENOMEM);//在mdt0上运行sequnce controller,分配seq范围给sequence serverrc = seq_server_init(env, ss->ss_control_seq, mdt->mdt_bottom,mdt_obd_name(mdt), LUSTRE_SEQ_CONTROLLER,ss);if (rc)GOTO(out_seq_fini, rc);}/* Init normal sequence server */OBD_ALLOC_PTR(ss->ss_server_seq);if (ss->ss_server_seq == NULL)GOTO(out_seq_fini, rc = -ENOMEM);//其他的mdt会执行下面的代码,运行sequence server,给sequence server分配seq来构建fidrc = seq_server_init(env, ss->ss_server_seq, mdt->mdt_bottom,mdt_obd_name(mdt), LUSTRE_SEQ_SERVER, ss);if (rc)GOTO(out_seq_fini, rc);/* init seq client for seq server to talk to seq controller(MDT0) */rc = mdt_seq_init_cli(env, mdt);if (rc != 0)GOTO(out_seq_fini, rc);if (ss->ss_node_id != 0)/* register controller export through lwp */rc = mdt_register_seq_exp(mdt);EXIT;
out_seq_fini:if (rc)mdt_seq_fini(env, mdt);return rc;
}
ost:
//运行于ost上的sequnce服务
int ofd_fid_init(const struct lu_env *env, struct ofd_device *ofd)
{struct seq_server_site *ss = &ofd->ofd_seq_site;struct lu_device *lu = &ofd->ofd_dt_dev.dd_lu_dev;char *obd_name = ofd_name(ofd);char *name = NULL;int len = strlen(obd_name) + 7;int rc = 0;ss = &ofd->ofd_seq_site;lu->ld_site->ld_seq_site = ss;ss->ss_lu = lu->ld_site;ss->ss_node_id = ofd->ofd_lut.lut_lsd.lsd_osd_index;OBD_ALLOC(name, len);if (name == NULL)return -ENOMEM;OBD_ALLOC_PTR(ss->ss_server_seq);if (ss->ss_server_seq == NULL)GOTO(out_name, rc = -ENOMEM);//在ost上运行sequence serverrc = seq_server_init(env, ss->ss_server_seq, ofd->ofd_osd, obd_name,LUSTRE_SEQ_SERVER, ss);if (rc) {CERROR("%s: seq server init error: rc = %d\n", obd_name, rc);GOTO(out_server, rc);}ss->ss_server_seq->lss_space.lsr_index = ss->ss_node_id;OBD_ALLOC_PTR(ss->ss_client_seq);if (ss->ss_client_seq == NULL)GOTO(out_server, rc = -ENOMEM);snprintf(name, len, "%s-super", obd_name);//初始化ost上的seq clientrc = seq_client_init(ss->ss_client_seq, NULL, LUSTRE_SEQ_DATA,name, NULL);if (rc) {CERROR("%s: seq client init error: rc = %d\n", obd_name, rc);GOTO(out_client, rc);}rc = seq_server_set_cli(env, ss->ss_server_seq, ss->ss_client_seq);if (rc) {
out_client:seq_client_fini(ss->ss_client_seq);OBD_FREE_PTR(ss->ss_client_seq);ss->ss_client_seq = NULL;
out_server:seq_server_fini(ss->ss_server_seq, env);OBD_FREE_PTR(ss->ss_server_seq);ss->ss_server_seq = NULL;}
out_name:OBD_FREE(name, len);return rc;
}
当客户端创建新文件时,会先检查本地已申请到的sequence是否够用;够用的话直接在本地分配fid,并在向MDT发送创建请求时将新分配的fid一并发送给MDT,由MDT进行处理。
当客户端本地的sequence不够用时,会向sequence server申请新的sequence,如果sequence server上的sequence也不够用了,那么sequence server会向sequence controller申请新的sequence,最终返回新的sequence给客户端。
seq cli <–> seq svr
当client中申请的seq使用完之后会向server申请新的seq
/*
 * Excerpt: allocate the next sequence on the client side.
 * Code elided by the article is marked with "...".
 *
 * When the client's current range (seq->lcs_space) is exhausted, a new
 * meta-sequence is requested through seq_client_alloc_meta().
 *
 * \retval 0		success
 * \retval negative	error from seq_client_alloc_meta(); -EINPROGRESS
 *			is returned without logging an error
 */
static int seq_client_alloc_seq(const struct lu_env *env,
				struct lu_client_seq *seq, u64 *seqnr)
{
	/* ... */

	/* Local range used up: fetch a new one with seq_client_alloc_meta(). */
	if (lu_seq_range_is_exhausted(&seq->lcs_space)) {
		rc = seq_client_alloc_meta(env, seq);
		if (rc) {
			if (rc != -EINPROGRESS)
				CERROR("%s: Can't allocate new meta-sequence,"
				       "rc = %d\n", seq->lcs_name, rc);
			RETURN(rc);
		}

		CDEBUG(D_INFO, "%s: New range - "DRANGE"\n",
		       seq->lcs_name, PRANGE(&seq->lcs_space));
	} else {
		rc = 0;
	}

	/* ... */

	RETURN(rc);
}

/*
 * Excerpt: send a sequence-allocation RPC and copy the granted range
 * into *output. Code elided by the article is marked with "...".
 *
 * The portals are chosen by client type: LUSTRE_SEQ_METADATA clients use
 * the MDC reply / SEQ_METADATA request portals, all other clients use the
 * OSC reply / SEQ_DATA request portals.
 */
static int seq_client_rpc(struct lu_client_seq *seq,
			  struct lu_seq_range *output, __u32 opc,
			  const char *opcname)
{
	/* ... */

	if (seq->lcs_type == LUSTRE_SEQ_METADATA) {
		req->rq_reply_portal = MDC_REPLY_PORTAL;
		req->rq_request_portal = SEQ_METADATA_PORTAL;
	} else {
		req->rq_reply_portal = OSC_REPLY_PORTAL;
		req->rq_request_portal = SEQ_DATA_PORTAL;
	}

	/* ... */

	rc = ptlrpc_queue_wait(req);
	if (rc)
		GOTO(out_req, rc);

	/* The reply carries the newly granted sequence range. */
	out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE);
	*output = *out;

	/* ... */
}
seq server对应处理函数:
static int seq_handler(struct tgt_session_info *tsi)
{struct lu_seq_range *out, *tmp;struct lu_site *site;int rc;__u32 *opc;ENTRY;LASSERT(!(lustre_msg_get_flags(tgt_ses_req(tsi)->rq_reqmsg) & MSG_REPLAY));site = tsi->tsi_exp->exp_obd->obd_lu_dev->ld_site;LASSERT(site != NULL);opc = req_capsule_client_get(tsi->tsi_pill, &RMF_SEQ_OPC);if (opc != NULL) {out = req_capsule_server_get(tsi->tsi_pill, &RMF_SEQ_RANGE);if (out == NULL)RETURN(err_serious(-EPROTO));tmp = req_capsule_client_get(tsi->tsi_pill, &RMF_SEQ_RANGE);/* seq client passed mdt id, we need to pass that using out* range parameter */out->lsr_index = tmp->lsr_index;out->lsr_flags = tmp->lsr_flags;//走这个函数申请新seqrc = seq_server_handle(site, tsi->tsi_env, *opc, out);} else {rc = err_serious(-EPROTO);}RETURN(rc);
}static int seq_server_handle(struct lu_site *site,const struct lu_env *env,__u32 opc, struct lu_seq_range *out)
{switch (opc) {case SEQ_ALLOC_META:if (!ss_site->ss_server_seq) {CERROR("Sequence server is not ""initialized\n");RETURN(-EINVAL);}dev = lu2dt_dev(ss_site->ss_server_seq->lss_obj->do_lu.lo_dev);if (dev->dd_rdonly)RETURN(-EROFS);rc = seq_server_alloc_meta(ss_site->ss_server_seq, out, env);break;......
}int seq_server_alloc_meta(struct lu_server_seq *seq,struct lu_seq_range *out,const struct lu_env *env)
{......mutex_lock(&seq->lss_mutex);rc = __seq_server_alloc_meta(seq, out, env);mutex_unlock(&seq->lss_mutex);RETURN(rc);
}static int __seq_server_alloc_meta(struct lu_server_seq *seq,struct lu_seq_range *out,const struct lu_env *env)
{struct lu_seq_range *space = &seq->lss_space;int rc = 0;......// 检查server中seq是否够用,不够用会向controller新申请rc = seq_server_check_and_alloc_super(env, seq){if 耗尽{// 向mdt0申请新的可用的sequence rangerc = seq_client_alloc_super(seq->lss_cli, env);// 插入到本地的sequence server的fldrc = fld_insert_entry(env, fld, space);}}......// 更新server中的seq、将seq server持久化到ldiskfs、赋值给outrc = range_alloc_set(env, out, seq);......
}
seq svr <–> seq controller
当seq server中seq不足时,会向seq controller申请新的seq
seq server :
int seq_server_check_and_alloc_super(const struct lu_env *env,struct lu_server_seq *seq)
{struct lu_seq_range *space = &seq->lss_space;int rc = 0;ENTRY;/* Check if available space ends and allocate new super seq */if (lu_seq_range_is_exhausted(space)) {// 向mdt0申请seqrc = seq_client_alloc_super(seq->lss_cli, env);if (rc) {CDEBUG(D_HA, "%s: Can't allocate super-sequence:"" rc %d\n", seq->lss_name, rc);RETURN(rc);}/* Saving new range to allocation space. */*space = seq->lss_cli->lcs_space;LASSERT(lu_seq_range_is_sane(space));if (seq->lss_cli->lcs_srv == NULL) {struct lu_server_fld *fld;/* Insert it to the local FLDB */fld = seq->lss_site->ss_server_fld;mutex_lock(&fld->lsf_lock);// 将申请好的seq插入到本地的sequence server的fldrc = fld_insert_entry(env, fld, space);mutex_unlock(&fld->lsf_lock);}}if (lu_seq_range_is_zero(&seq->lss_lowater_set))__seq_set_init(env, seq);RETURN(rc);
}
controller:
static int seq_server_handle(struct lu_site *site,const struct lu_env *env,__u32 opc, struct lu_seq_range *out)
{int rc;struct seq_server_site *ss_site;struct dt_device *dev;ENTRY;ss_site = lu_site2seq(site);switch (opc) {case SEQ_ALLOC_META:......break;case SEQ_ALLOC_SUPER:if (!ss_site->ss_control_seq) {CERROR("Sequence controller is not ""initialized\n");RETURN(-EINVAL);}dev = lu2dt_dev(ss_site->ss_control_seq->lss_obj->do_lu.lo_dev);if (dev->dd_rdonly)RETURN(-EROFS);rc = seq_server_alloc_super(ss_site->ss_control_seq, out, env);break;default:rc = -EINVAL;break;}RETURN(rc);
}int seq_server_alloc_super(struct lu_server_seq *seq,struct lu_seq_range *out,const struct lu_env *env)
{int rc;ENTRY;mutex_lock(&seq->lss_mutex);rc = __seq_server_alloc_super(seq, out, env);mutex_unlock(&seq->lss_mutex);RETURN(rc);
}static int __seq_server_alloc_super(struct lu_server_seq *seq,struct lu_seq_range *out,const struct lu_env *env)
{struct lu_seq_range *space = &seq->lss_space;int rc;ENTRY;LASSERT(lu_seq_range_is_sane(space));if (lu_seq_range_is_exhausted(space)) {CERROR("%s: Sequences space is exhausted\n",seq->lss_name);RETURN(-ENOSPC);} else {//在mdt0内分配seqrange_alloc(out, space, seq->lss_width);}//将新申请的seq更新到fldrc = seq_store_update(env, seq, out, 1 /* sync */);LCONSOLE_INFO("%s: super-sequence allocation rc = %d " DRANGE"\n",seq->lss_name, rc, PRANGE(out));RETURN(rc);
}
上述理解如果有理解不正确的地方,欢迎各位大佬指正[手动抱拳]
参考了一位大佬的文章,链接如下:https://cloud.tencent.com/developer/article/2074601