querycoord启动源码分析
结构体
// Server is the grpc server of QueryCoord.
type Server struct {wg sync.WaitGrouploopCtx context.ContextloopCancel context.CancelFuncgrpcServer *grpc.ServerserverID atomic.Int64grpcErrChan chan error// 是一个接口类型queryCoord types.QueryCoordComponentfactory dependency.FactoryetcdCli *clientv3.ClienttikvCli *txnkv.ClientdataCoord types.DataCoordClientrootCoord types.RootCoordClient
}
下面分析字段dataCoord、rootCoord是在何时被赋值的。
queryCoord是一个接口,实现queryCoord api功能。
func (mr *MilvusRoles) runQueryCoord(ctx context.Context, localMsg bool, wg *sync.WaitGroup) component {wg.Add(1)return runComponent(ctx, localMsg, wg, components.NewQueryCoord, metrics.RegisterQueryCoord)
}// creator用NewQueryCoord替换
role, err = creator(ctx, factory)
components.NewQueryCoord是一个函数。
NewQueryCoord()用来创建QueryCoord结构体。
// NewQueryCoord creates a new QueryCoord
func NewQueryCoord(ctx context.Context, factory dependency.Factory) (*QueryCoord, error) {svr, err := grpcquerycoord.NewServer(ctx, factory)if err != nil {return nil, err}return &QueryCoord{ctx: ctx,svr: svr,}, nil
}
grpcquerycoord.NewServer()产生的是本结构体Server。
进入NewServer:
// NewServer create a new QueryCoord grpc server.
func NewServer(ctx context.Context, factory dependency.Factory) (*Server, error) {ctx1, cancel := context.WithCancel(ctx)svr, err := qc.NewQueryCoord(ctx1)if err != nil {cancel()return nil, err}return &Server{queryCoord: svr,loopCtx: ctx1,loopCancel: cancel,factory: factory,grpcErrChan: make(chan error),}, nil
}
qc.NewQueryCoord()返回一个结构体,是types.QueryCoordComponent接口的实现。
执行Run()
Server结构体创建后,调用结构体的Run()方法。
func runComponent[T component](ctx context.Context,localMsg bool,runWg *sync.WaitGroup,creator func(context.Context, dependency.Factory) (T, error),metricRegister func(*prometheus.Registry),
) component {var role Tsign := make(chan struct{})go func() {factory := dependency.NewFactory(localMsg)var err errorrole, err = creator(ctx, factory)if localMsg {paramtable.SetRole(typeutil.StandaloneRole)} else {paramtable.SetRole(role.GetName())}if err != nil {panic(err)}close(sign)// 在这里调用对应组件结构体的Run()方法,这里是QueryCoord结构体if err := role.Run(); err != nil {panic(err)}runWg.Done()}()......
}
runComponent是一个包裹函数。
// Run starts service
func (qs *QueryCoord) Run() error {if err := qs.svr.Run(); err != nil {log.Error("QueryCoord starts error", zap.Error(err))return err}log.Debug("QueryCoord successfully started")return nil
}
Run()方法调用qs.svr.Run()方法。svr是grpcquerycoord.NewServer()返回的Server结构体(见上文components.NewQueryCoord)。
// Run initializes and starts QueryCoord's grpc service.
func (s *Server) Run() error {if err := s.init(); err != nil {return err}log.Debug("QueryCoord init done ...")if err := s.start(); err != nil {return err}log.Debug("QueryCoord start done ...")return nil
}
接下来分析s.init()和s.start()方法。
s.init()
// init initializes QueryCoord's grpc service.
func (s *Server) init() error {params := paramtable.Get()etcdConfig := ¶ms.EtcdCfgrpcParams := ¶ms.QueryCoordGrpcServerCfgetcdCli, err := etcd.GetEtcdClient(etcdConfig.UseEmbedEtcd.GetAsBool(),etcdConfig.EtcdUseSSL.GetAsBool(),etcdConfig.Endpoints.GetAsStrings(),etcdConfig.EtcdTLSCert.GetValue(),etcdConfig.EtcdTLSKey.GetValue(),etcdConfig.EtcdTLSCACert.GetValue(),etcdConfig.EtcdTLSMinVersion.GetValue())if err != nil {log.Debug("QueryCoord connect to etcd failed", zap.Error(err))return err}s.etcdCli = etcdClis.SetEtcdClient(etcdCli)s.queryCoord.SetAddress(rpcParams.GetAddress())if params.MetaStoreCfg.MetaStoreType.GetValue() == util.MetaStoreTypeTiKV {......}s.wg.Add(1)// 启动grpc,默认为19531go s.startGrpcLoop(rpcParams.Port.GetAsInt())// wait for grpc server loop starterr = <-s.grpcErrChanif err != nil {return err}// --- Master Server Client ---// 创建rootCoord客户端if s.rootCoord == nil {s.rootCoord, err = rcc.NewClient(s.loopCtx, qc.Params.EtcdCfg.MetaRootPath.GetValue(), s.etcdCli)if err != nil {log.Error("QueryCoord try to new RootCoord client failed", zap.Error(err))panic(err)}}// wait for master init or healthy// 等待rootcoord服务正常log.Debug("QueryCoord try to wait for RootCoord ready")err = componentutil.WaitForComponentHealthy(s.loopCtx, s.rootCoord, "RootCoord", 1000000, time.Millisecond*200)if err != nil {log.Error("QueryCoord wait for RootCoord ready failed", zap.Error(err))panic(err)}if err := s.SetRootCoord(s.rootCoord); err != nil {panic(err)}log.Debug("QueryCoord report RootCoord ready")// --- Data service client ---// 创建dataCoord客户端if s.dataCoord == nil {s.dataCoord, err = dcc.NewClient(s.loopCtx, qc.Params.EtcdCfg.MetaRootPath.GetValue(), s.etcdCli)if err != nil {log.Error("QueryCoord try to new DataCoord client failed", zap.Error(err))panic(err)}}// 等待datacoord服务正常log.Debug("QueryCoord try to wait for DataCoord ready")err = componentutil.WaitForComponentHealthy(s.loopCtx, s.dataCoord, "DataCoord", 1000000, time.Millisecond*200)if err != nil 
{log.Error("QueryCoord wait for DataCoord ready failed", zap.Error(err))panic(err)}if err := s.SetDataCoord(s.dataCoord); err != nil {panic(err)}log.Debug("QueryCoord report DataCoord ready")// 执行真正的初始化if err := s.queryCoord.Init(); err != nil {return err}return nil
}
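从这段代码可以看出:创建了etcdCli并赋值给s.etcdCli;同时,当s.rootCoord、s.dataCoord为nil时,分别通过rcc.NewClient()、dcc.NewClient()创建客户端并赋值,这就是前面提到的rootCoord、dataCoord字段被赋值的时机。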
s.startGrpcLoop()启动grpc端口服务。
最终调用s.queryCoord.Init()进行初始化,代码位置:internal\querycoordv2\server.go
s.queryCoord是接口类型types.QueryCoordComponent。Go没有继承,这里是接口内嵌:QueryCoordComponent内嵌QueryCoord接口,QueryCoord又内嵌Component接口。
// QueryCoord is the interface `querycoord` package implements
type QueryCoord interface {Componentquerypb.QueryCoordServer
}// Component is the interface all services implement
type Component interface {Init() errorStart() errorStop() errorRegister() error
}
接口套接口:
RootCoordComponent -> RootCoord -> Component
DataCoordComponent -> DataCoord -> Component
QueryCoordComponent -> QueryCoord -> Component
ProxyComponent -> Proxy -> Component
QueryNodeComponent -> QueryNode -> Component
IndexNodeComponent -> IndexNode -> Component
DataNodeComponent -> DataNode -> Component
各组件最终的Init()初始化代码路径:
internal\rootcoord\root_coord.go->Init()
internal\datacoord\server.go->Init()
internal\querycoordv2\server.go->Init()
internal\datanode\data_node.go->Init()
internal\indexnode\indexnode.go->Init()
internal\querynodev2\server.go->Init()
internal\proxy\proxy.go->Init()
回过头来继续querycoord的init。
func (s *Server) Init() error {log.Info("QueryCoord start init",zap.String("meta-root-path", Params.EtcdCfg.MetaRootPath.GetValue()),zap.String("address", s.address))if err := s.initSession(); err != nil {return err}if s.enableActiveStandBy {......}// 真正执行初始化return s.initQueryCoord()
}
继续进入s.initQueryCoord():
func (s *Server) initQueryCoord() error {s.UpdateStateCode(commonpb.StateCode_Initializing)log.Info("QueryCoord", zap.Any("State", commonpb.StateCode_Initializing))// Init KV and ID allocatormetaType := Params.MetaStoreCfg.MetaStoreType.GetValue()var idAllocatorKV kv.TxnKVlog.Info(fmt.Sprintf("query coordinator connecting to %s.", metaType))if metaType == util.MetaStoreTypeTiKV {s.kv = tikv.NewTiKV(s.tikvCli, Params.TiKVCfg.MetaRootPath.GetValue())idAllocatorKV = tsoutil.NewTSOTiKVBase(s.tikvCli, Params.TiKVCfg.KvRootPath.GetValue(), "querycoord-id-allocator")} else if metaType == util.MetaStoreTypeEtcd {s.kv = etcdkv.NewEtcdKV(s.etcdCli, Params.EtcdCfg.MetaRootPath.GetValue())idAllocatorKV = tsoutil.NewTSOKVBase(s.etcdCli, Params.EtcdCfg.KvRootPath.GetValue(), "querycoord-id-allocator")} else {return fmt.Errorf("not supported meta store: %s", metaType)}log.Info(fmt.Sprintf("query coordinator successfully connected to %s.", metaType))idAllocator := allocator.NewGlobalIDAllocator("idTimestamp", idAllocatorKV)err := idAllocator.Initialize()if err != nil {log.Error("query coordinator id allocator initialize failed", zap.Error(err))return err}s.idAllocator = func() (int64, error) {return idAllocator.AllocOne()}// Init metrics cache managers.metricsCacheManager = metricsinfo.NewMetricsCacheManager()// Init metas.nodeMgr = session.NewNodeManager()err = s.initMeta()if err != nil {return err}// Init sessionlog.Info("init session")s.cluster = session.NewCluster(s.nodeMgr, s.queryNodeCreator)// Init schedulerslog.Info("init schedulers")s.jobScheduler = job.NewScheduler()s.taskScheduler = task.NewScheduler(s.ctx,s.meta,s.dist,s.targetMgr,s.broker,s.cluster,s.nodeMgr,)// Init heartbeatlog.Info("init dist controller")s.distController = dist.NewDistController(s.cluster,s.nodeMgr,s.dist,s.targetMgr,s.taskScheduler,)// Init balancer map and balancerlog.Info("init all available balancer")s.balancerMap = make(map[string]balance.Balance)s.balancerMap[balance.RoundRobinBalancerName] = 
balance.NewRoundRobinBalancer(s.taskScheduler, s.nodeMgr)s.balancerMap[balance.RowCountBasedBalancerName] = balance.NewRowCountBasedBalancer(s.taskScheduler,s.nodeMgr, s.dist, s.meta, s.targetMgr)s.balancerMap[balance.ScoreBasedBalancerName] = balance.NewScoreBasedBalancer(s.taskScheduler,s.nodeMgr, s.dist, s.meta, s.targetMgr)if balancer, ok := s.balancerMap[params.Params.QueryCoordCfg.Balancer.GetValue()]; ok {s.balancer = balancerlog.Info("use config balancer", zap.String("balancer", params.Params.QueryCoordCfg.Balancer.GetValue()))} else {s.balancer = s.balancerMap[balance.RowCountBasedBalancerName]log.Info("use rowCountBased auto balancer")}// Init checker controllerlog.Info("init checker controller")s.checkerController = checkers.NewCheckerController(s.meta,s.dist,s.targetMgr,s.balancer,s.nodeMgr,s.taskScheduler,s.broker,)// Init observerss.initObserver()// Init load status cachemeta.GlobalFailedLoadCache = meta.NewFailedLoadCache()log.Info("QueryCoord init success")return err
}
从代码可以看出初始化是在填充querycoord结构体。
s.start()
启动组件的逻辑。
// start starts QueryCoord's grpc service.
func (s *Server) start() error {err := s.queryCoord.Register()if err != nil {return err}return s.queryCoord.Start()
}
s.queryCoord的接口类型内嵌了Component接口,因此具有Init()、Start()、Stop()、Register()四个方法。
Register():向元数据etcd注册。
Start():用来启动组件。
进入s.queryCoord.Start():
func (s *Server) Start() error {if !s.enableActiveStandBy {if err := s.startQueryCoord(); err != nil {return err}log.Info("QueryCoord started")}return nil
}
真正执行启动逻辑在s.startQueryCoord()。
func (s *Server) startQueryCoord() error {log.Info("start watcher...")sessions, revision, err := s.session.GetSessions(typeutil.QueryNodeRole)if err != nil {return err}for _, node := range sessions {s.nodeMgr.Add(session.NewNodeInfo(node.ServerID, node.Address))s.taskScheduler.AddExecutor(node.ServerID)if node.Stopping {s.nodeMgr.Stopping(node.ServerID)}}s.checkReplicas()for _, node := range sessions {s.handleNodeUp(node.ServerID)}s.wg.Add(2)go s.handleNodeUpLoop()go s.watchNodes(revision)// Recover dist, to avoid generate too much task when dist not ready after restarts.distController.SyncAll(s.ctx)s.startServerLoop()s.afterStart()s.UpdateStateCode(commonpb.StateCode_Healthy)sessionutil.SaveServerInfo(typeutil.QueryCoordRole, s.session.ServerID)return nil
}
要详细知道启动querycoord组件做了什么事情,研究这个函数。