criService 实现了接口 runtime.RuntimeServiceServer
1. RunPodSandbox 函数
路径 pkg/server/sandbox_run.go,创建以及启动 sandbox,确认成功是 sandbox 状态为 ready
// RunPodSandbox creates and starts a pod-level sandbox. Runtimes should ensure
// the sandbox is in ready state.
func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandboxRequest) (_ *runtime.RunPodSandboxResponse, retErr error) {
config := r.GetConfig()
log.G(ctx).Debugf("Sandbox config %+v", config)
1.1 生成 ID,生成 name,注册 name <--> key 映射关系,防治并行创建
// Generate unique id and name for the sandbox and reserve the name.
id := util.GenerateID()
metadata := config.GetMetadata()
if metadata == nil {
return nil, errors.New("sandbox config must include metadata")
}
name := makeSandboxName(metadata)
log.G(ctx).Debugf("Generated id %q for sandbox %q", id, name)
// Reserve the sandbox name to avoid concurrent `RunPodSandbox` request starting the
// same sandbox.
if err := c.sandboxNameIndex.Reserve(name, id); err != nil {
return nil, errors.Wrapf(err, "failed to reserve sandbox name %q", name)
}
1.2 实例化 Sandbox,初始状态为 unknown
// Create initial internal sandbox object.
sandbox := sandboxstore.NewSandbox(
sandboxstore.Metadata{
ID: id,
Name: name,
Config: config,
RuntimeHandler: r.GetRuntimeHandler(),
},
sandboxstore.Status{
State: sandboxstore.StateUnknown,
},
)
1.3 确保有镜像,如果没有镜像则 pull 镜像
// Ensure sandbox container image snapshot.
image, err := c.ensureImageExists(ctx, c.config.SandboxImage, config)
if err != nil {
return nil, errors.Wrapf(err, "failed to get sandbox image %q", c.config.SandboxImage)
}
containerdImage, err := c.toContainerdImage(ctx, *image)
if err != nil {
return nil, errors.Wrapf(err, "failed to get image from containerd %q", image.ID)
}
1.4 获取 sandbox runtime
注解 io.kubernetes.cri.untrusted-workload = true,设置这个 untrusted 返回 untrusted runtime,否则返回默认 runtime io.containerd.runc.v1
[plugins."io.containerd.grpc.v1.cri".containerd]
snapshotter = "overlayfs"
default_runtime_name = "runc"
no_pivot = false
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
runtime_type = ""
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
runtime_type = ""
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v1"
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
// getSandboxRuntime returns the runtime configuration for sandbox.
// If the sandbox contains untrusted workload, runtime for untrusted workload will be returned,
// or else default runtime will be returned.
func (c *criService) getSandboxRuntime(config *runtime.PodSandboxConfig, runtimeHandler string) (criconfig.Runtime, error) {
if untrustedWorkload(config) {
// If the untrusted annotation is provided, runtimeHandler MUST be empty.
if runtimeHandler != "" && runtimeHandler != criconfig.RuntimeUntrusted {
return criconfig.Runtime{}, errors.New("untrusted workload with explicit runtime handler is not allowed")
}
// If the untrusted workload is requesting access to the host/node, this request will fail.
//
// Note: If the workload is marked untrusted but requests privileged, this can be granted, as the
// runtime may support this. For example, in a virtual-machine isolated runtime, privileged
// is a supported option, granting the workload to access the entire guest VM instead of host.
// TODO(windows): Deprecate this so that we don't need to handle it for windows.
if hostAccessingSandbox(config) {
return criconfig.Runtime{}, errors.New("untrusted workload with host access is not allowed")
}
runtimeHandler = criconfig.RuntimeUntrusted
}
if runtimeHandler == "" {
runtimeHandler = c.config.ContainerdConfig.DefaultRuntimeName
}
handler, ok := c.config.ContainerdConfig.Runtimes[runtimeHandler]
if !ok {
return criconfig.Runtime{}, errors.Errorf("no runtime for %q is configured", runtimeHandler)
}
return handler, nil
}
1.5 需要为 pod 设置网络
如果不是 host 网络模式,需要创建 namespace
NewNetNS 创建网络 namespace,在目录 /var/run/netns/cni-%x-%x-%x-%x-%x
if podNetwork {
// If it is not in host network namespace then create a namespace and set the sandbox
// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
// namespaces. If the pod is in host network namespace then both are empty and should not
// be used.
sandbox.NetNS, err = netns.NewNetNS()
if err != nil {
return nil, errors.Wrapf(err, "failed to create network namespace for sandbox %q", id)
}
sandbox.NetNSPath = sandbox.NetNS.GetPath()
2. setupPodNetwork 为 sandbox 创建网络
整理传给 CNI 插件的配置,包括 sandbox ID,网络 namespace,以及基本配置,如果包括 bandwidth,dns
// setupPodNetwork setups up the network for a pod
func (c *criService) setupPodNetwork(ctx context.Context, sandbox *sandboxstore.Sandbox) error {
var (
id = sandbox.ID
config = sandbox.Config
path = sandbox.NetNSPath
)
if c.netPlugin == nil {
return errors.New("cni config not initialized")
}
opts, err := cniNamespaceOpts(id, config)
if err != nil {
return errors.Wrap(err, "get cni namespace options")
}
2..1 netPlugin.Setup 最终调用 AddNetworkList CNI 插件接口为 sandbox 配置网络
最终调用 plugin 二进制为 sandbox 配置网络
这里轻描淡写,知道个过程过,假设配置配置网络成功,接着看看做了哪些工作
result, err := c.netPlugin.Setup(ctx, id, path, opts...)
if err != nil {
return err
}
logDebugCNIResult(ctx, id, result)
// Check if the default interface has IP config
if configs, ok := result.Interfaces[defaultIfName]; ok && len(configs.IPConfigs) > 0 {
sandbox.IP, sandbox.AdditionalIPs = selectPodIPs(configs.IPConfigs)
sandbox.CNIResult = result
return nil
}
3. 生成 runtime spec 配置
可以使用 crictl pods,crictl inspectp $id 查看配置
func (c *criService) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig,
imageConfig *imagespec.ImageConfig, nsPath string, runtimePodAnnotations []string) (*runtimespec.Spec, error) {
// Creates a spec Generator with the default spec.
// TODO(random-liu): [P1] Compare the default settings with docker and containerd default.
specOpts := []oci.SpecOpts{
customopts.WithoutRunMount,
customopts.WithoutDefaultSecuritySettings,
customopts.WithRelativeRoot(relativeRootfsPath),
oci.WithEnv(imageConfig.Env),
oci.WithRootFSReadonly(),
oci.WithHostname(config.GetHostname()),
}
if imageConfig.WorkingDir != "" {
specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir))
}
3.1 label 的类型为 sandbox
// Generate spec options that will be applied to the spec later.
specOpts, err := c.sandboxContainerSpecOpts(config, &image.ImageSpec.Config)
if err != nil {
return nil, errors.Wrap(err, "failed to generate sanbdox container spec options")
}
sandboxLabels := buildLabels(config.Labels, containerKindSandbox)
4. 存储 sandbox 信息,创建 root 工作目录
container, err := c.client.NewContainer(ctx, id, opts...)
if err != nil {
return nil, errors.Wrap(err, "failed to create containerd container")
}
// Create sandbox container root directories.
sandboxRootDir := c.getSandboxRootDir(id)
if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil {
return nil, errors.Wrapf(err, "failed to create sandbox root directory %q",
sandboxRootDir)
}
4.1 setupSandboxFiles 主要创建 hostname resolv.conf hosts 等文件
// Setup files required for the sandbox.
if err = c.setupSandboxFiles(id, config); err != nil {
return nil, errors.Wrapf(err, "failed to setup sandbox files")
}
5. 创建 sandbox 任务
这个其实最终是发送 task 请求,分别为 CreateTaskRequest,StartRequest,创建以及启动任务
taskOpts := c.taskOpts(ociRuntime.Type)
// We don't need stdio for sandbox container.
task, err := container.NewTask(ctx, containerdio.NullIO, taskOpts...)
if err != nil {
return nil, errors.Wrap(err, "failed to create containerd task")
}
// wait is a long running background request, no timeout needed.
exitCh, err := task.Wait(ctrdutil.NamespacedContext())
if err != nil {
return nil, errors.Wrap(err, "failed to wait for sandbox container task")
}
if err := task.Start(ctx); err != nil {
return nil, errors.Wrapf(err, "failed to start sandbox container task %q", id)
}
5.1 比如使用默认 tasks-service io.containerd.service.v1
func (l *local) Create(ctx context.Context, r *api.CreateTaskRequest, _ ...grpc.CallOption) (*api.CreateTaskResponse, error) {
container, err := l.getContainer(ctx, r.ContainerID)
if err != nil {
return nil, errdefs.ToGRPC(err)
}
checkpointPath, err := getRestorePath(container.Runtime.Name, r.Options)
if err != nil {
return nil, err
}
5.2 比如 io.containerd.runc.v1
实现路径为 contaienrd/runtime/v1/runtime.go
// Create a new task
func (r *Runtime) Create(ctx context.Context, id string, opts runtime.CreateOpts) (_ runtime.Task, err error) {
namespace, err := namespaces.NamespaceRequired(ctx)
if err != nil {
return nil, err
}
if err := identifiers.Validate(id); err != nil {
return nil, errors.Wrapf(err, "invalid task id")
}
ropts, err := r.getRuncOptions(ctx, id)
if err != nil {
return nil, err
}
启动 shim 进程
/usr/bin/containerd-shim-runc-v1 -namespace k8s.io -id d84185af26fcc146b4787ed08543c49d327bb97171ed6b669618f9793a8545fc -address /run/containerd/containerd.sock
shimopt := ShimLocal(r.config, r.events)
if !r.config.NoShim {
var cgroup string
if opts.TaskOptions != nil {
v, err := typeurl.UnmarshalAny(opts.TaskOptions)
if err != nil {
return nil, err
}
cgroup = v.(*runctypes.CreateOptions).ShimCgroup
}
exitHandler := func() {
log.G(ctx).WithField("id", id).Info("shim reaped")
if _, err := r.tasks.Get(ctx, id); err != nil {
// Task was never started or was already successfully deleted
return
}
if err = r.cleanupAfterDeadShim(context.Background(), bundle, namespace, id); err != nil {
log.G(ctx).WithError(err).WithFields(logrus.Fields{
"id": id,
"namespace": namespace,
}).Warn("failed to clean up after killed shim")
}
}
shimopt = ShimRemote(r.config, r.address, cgroup, exitHandler)
}
与 shim 建立GRPC 连接,发送 CreateTaskRequest
sopts := &shim.CreateTaskRequest{
ID: id,
Bundle: bundle.path,
Runtime: rt,
Stdin: opts.IO.Stdin,
Stdout: opts.IO.Stdout,
Stderr: opts.IO.Stderr,
Terminal: opts.IO.Terminal,
Checkpoint: opts.Checkpoint,
Options: opts.TaskOptions,
}
for _, m := range opts.Rootfs {
sopts.Rootfs = append(sopts.Rootfs, &types.Mount{
Type: m.Type,
Source: m.Source,
Options: m.Options,
})
}
cr, err := s.Create(ctx, sopts)
if err != nil {
return nil, errdefs.FromGRPC(err)
}
startTaskRequest 一样的流程
6. 更新 sandbox 状态为 ready
if err := sandbox.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) {
// Set the pod sandbox as ready after successfully start sandbox container.
status.Pid = task.Pid()
status.State = sandboxstore.StateReady
status.CreatedAt = info.CreatedAt
return status, nil
}); err != nil {
return nil, errors.Wrap(err, "failed to update sandbox status")
}
总结:
RunPodSandbox 获取配置,生成 ID,name 注册 name <--> 映射关系,防止重复并发创建
确保 image 本地节点存在,不存在册 pull image
获取 runtime,根据 pod 注解,以及配置文件,如果 untrusted 则返回该 runtime,否则返回默认 runtime
为 sandbox 创建网络,与 docker-shim 不一样的是这个先创建网络
生成 spec 配置
发送 GRPC 创建以及启动请求,成功将 sandbox 状态改为 ready