本文链接： https://blog.csdn.net/zhonglinzhang/article/details/101753780

criService 实现了接口 runtime.RuntimeServiceServer

1. RunPodSandbox 函数

路径 pkg/server/sandbox_run.go，创建并启动 sandbox，成功后需确保 sandbox 状态为 ready

// RunPodSandbox creates and starts a pod-level sandbox. Runtimes should ensure
// the sandbox is in ready state.
func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandboxRequest) (_ *runtime.RunPodSandboxResponse, retErr error) {
	config := r.GetConfig()
	log.G(ctx).Debugf("Sandbox config %+v", config)

1.1 生成 ID，生成 name，注册 name <--> id 映射关系，防止并发创建

// Generate unique id and name for the sandbox and reserve the name.
id := util.GenerateID()
metadata := config.GetMetadata()
if metadata == nil {
	return nil, errors.New("sandbox config must include metadata")
}
name := makeSandboxName(metadata)
log.G(ctx).Debugf("Generated id %q for sandbox %q", id, name)
// Reserve the sandbox name to avoid concurrent `RunPodSandbox` request starting the
// same sandbox.
if err := c.sandboxNameIndex.Reserve(name, id); err != nil {
	return nil, errors.Wrapf(err, "failed to reserve sandbox name %q", name)
}

1.2 实例化 Sandbox，初始状态为 unknown

// Create initial internal sandbox object.
sandbox := sandboxstore.NewSandbox(
	sandboxstore.Metadata{
		ID:             id,
		Name:           name,
		Config:         config,
		RuntimeHandler: r.GetRuntimeHandler(),
	},
	sandboxstore.Status{
		State: sandboxstore.StateUnknown,
	},
)

1.3 确保有镜像，如果没有镜像则 pull 镜像

// Ensure sandbox container image snapshot.
image, err := c.ensureImageExists(ctx, c.config.SandboxImage, config)
if err != nil {
	return nil, errors.Wrapf(err, "failed to get sandbox image %q", c.config.SandboxImage)
}
containerdImage, err := c.toContainerdImage(ctx, *image)
if err != nil {
	return nil, errors.Wrapf(err, "failed to get image from containerd %q", image.ID)
}

1.4 获取 sandbox runtime

如果 pod 设置了注解 io.kubernetes.cri.untrusted-workload = true，则返回 untrusted runtime；否则使用请求中指定的 runtimeHandler，未指定时使用默认 runtime（default_runtime_name，此处为 runc，对应 io.containerd.runc.v1）

[plugins."io.containerd.grpc.v1.cri".containerd]
snapshotter = "overlayfs"
default_runtime_name = "runc"
no_pivot = false
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
runtime_type = ""
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
runtime_type = ""
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v1"
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false

// getSandboxRuntime picks the runtime configuration used to run a sandbox.
//
// When the sandbox config is flagged as an untrusted workload, the handler is
// forced to criconfig.RuntimeUntrusted; the request is rejected if it names any
// other handler or if the sandbox asks for host access. Otherwise the requested
// runtimeHandler is used, with an empty handler falling back to the configured
// default runtime name. An error is returned when the resulting handler has no
// entry in the configured runtimes.
func (c *criService) getSandboxRuntime(config *runtime.PodSandboxConfig, runtimeHandler string) (criconfig.Runtime, error) {
	// Zero-value runtime returned alongside every error.
	var none criconfig.Runtime

	name := runtimeHandler
	if untrustedWorkload(config) {
		// Only an empty handler, or the untrusted handler itself, may accompany
		// an untrusted workload.
		if !(name == "" || name == criconfig.RuntimeUntrusted) {
			return none, errors.New("untrusted workload with explicit runtime handler is not allowed")
		}

		// Host/node access is refused for untrusted workloads.
		//
		// Note: a privileged request is not rejected here, because the runtime
		// may support it. For example, in a virtual-machine isolated runtime
		// privileged grants access to the whole guest VM rather than the host.
		// TODO(windows): Deprecate this so that we don't need to handle it for windows.
		if hostAccessingSandbox(config) {
			return none, errors.New("untrusted workload with host access is not allowed")
		}

		name = criconfig.RuntimeUntrusted
	}

	// No handler selected so far: fall back to the configured default.
	if name == "" {
		name = c.config.ContainerdConfig.DefaultRuntimeName
	}

	if rt, found := c.config.ContainerdConfig.Runtimes[name]; found {
		return rt, nil
	}
	return none, errors.Errorf("no runtime for %q is configured", name)
}

1.5 需要为 pod 设置网络

如果不是 host 网络模式，需要创建 namespace

NewNetNS 创建网络 namespace，在目录 /var/run/netns/cni-%x-%x-%x-%x-%x

if podNetwork {
	// If it is not in host network namespace then create a namespace and set the sandbox
	// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
	// namespaces. If the pod is in host network namespace then both are empty and should not
	// be used.
	sandbox.NetNS, err = netns.NewNetNS()
	if err != nil {
		return nil, errors.Wrapf(err, "failed to create network namespace for sandbox %q", id)
	}
	sandbox.NetNSPath = sandbox.NetNS.GetPath()

2. setupPodNetwork 为 sandbox 创建网络

整理传给 CNI 插件的配置，包括 sandbox ID、网络 namespace 路径，以及基本配置，例如 bandwidth、dns 等

// setupPodNetwork setups up the network for a pod
func (c *criService) setupPodNetwork(ctx context.Context, sandbox *sandboxstore.Sandbox) error {
	var (
		id     = sandbox.ID
		config = sandbox.Config
		path   = sandbox.NetNSPath
	)
	if c.netPlugin == nil {
		return errors.New("cni config not initialized")
	}

	opts, err := cniNamespaceOpts(id, config)
	if err != nil {
		return errors.Wrap(err, "get cni namespace options")
	}

2.1 netPlugin.Setup 最终调用 AddNetworkList CNI 插件接口为 sandbox 配置网络

最终调用 plugin 二进制为 sandbox 配置网络

这里一笔带过，了解大致过程即可；假设网络配置成功，接着看看后面做了哪些工作

result, err := c.netPlugin.Setup(ctx, id, path, opts...)
if err != nil {
	return err
}
logDebugCNIResult(ctx, id, result)
// Check if the default interface has IP config
if configs, ok := result.Interfaces[defaultIfName]; ok && len(configs.IPConfigs) > 0 {
	sandbox.IP, sandbox.AdditionalIPs = selectPodIPs(configs.IPConfigs)
	sandbox.CNIResult = result
	return nil
}

3. 生成 runtime spec 配置

可以使用 crictl pods，crictl inspectp $id 查看配置

func (c *criService) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig,
	imageConfig *imagespec.ImageConfig, nsPath string, runtimePodAnnotations []string) (*runtimespec.Spec, error) {
	// Creates a spec Generator with the default spec.
	// TODO(random-liu): [P1] Compare the default settings with docker and containerd default.
	specOpts := []oci.SpecOpts{
		customopts.WithoutRunMount,
		customopts.WithoutDefaultSecuritySettings,
		customopts.WithRelativeRoot(relativeRootfsPath),
		oci.WithEnv(imageConfig.Env),
		oci.WithRootFSReadonly(),
		oci.WithHostname(config.GetHostname()),
	}
	if imageConfig.WorkingDir != "" {
		specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir))
	}

3.1 label 的类型为 sandbox

// Generate spec options that will be applied to the spec later.
specOpts, err := c.sandboxContainerSpecOpts(config, &image.ImageSpec.Config)
if err != nil {
	return nil, errors.Wrap(err, "failed to generate sanbdox container spec options")
}

sandboxLabels := buildLabels(config.Labels, containerKindSandbox)

4. 存储 sandbox 信息，创建 root 工作目录

	container, err := c.client.NewContainer(ctx, id, opts...)
	if err != nil {
		return nil, errors.Wrap(err, "failed to create containerd container")
	}


	// Create sandbox container root directories.
	sandboxRootDir := c.getSandboxRootDir(id)
	if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil {
		return nil, errors.Wrapf(err, "failed to create sandbox root directory %q",
			sandboxRootDir)
	}

4.1 setupSandboxFiles 主要创建 hostname resolv.conf hosts 等文件

// Setup files required for the sandbox.
if err = c.setupSandboxFiles(id, config); err != nil {
	return nil, errors.Wrapf(err, "failed to setup sandbox files")
}

5. 创建 sandbox 任务

这个其实最终是发送 task 请求，分别为 CreateTaskRequest，StartRequest，创建以及启动任务

taskOpts := c.taskOpts(ociRuntime.Type)
// We don't need stdio for sandbox container.
task, err := container.NewTask(ctx, containerdio.NullIO, taskOpts...)
if err != nil {
	return nil, errors.Wrap(err, "failed to create containerd task")
}


// wait is a long running background request, no timeout needed.
exitCh, err := task.Wait(ctrdutil.NamespacedContext())
if err != nil {
	return nil, errors.Wrap(err, "failed to wait for sandbox container task")
}

if err := task.Start(ctx); err != nil {
	return nil, errors.Wrapf(err, "failed to start sandbox container task %q", id)
}

5.1 比如使用默认 tasks-service io.containerd.service.v1

func (l *local) Create(ctx context.Context, r *api.CreateTaskRequest, _ ...grpc.CallOption) (*api.CreateTaskResponse, error) {
	container, err := l.getContainer(ctx, r.ContainerID)
	if err != nil {
		return nil, errdefs.ToGRPC(err)
	}
	checkpointPath, err := getRestorePath(container.Runtime.Name, r.Options)
	if err != nil {
		return nil, err
	}

5.2 比如 io.containerd.runc.v1

实现路径为 containerd/runtime/v1/runtime.go

// Create a new task
func (r *Runtime) Create(ctx context.Context, id string, opts runtime.CreateOpts) (_ runtime.Task, err error) {
	namespace, err := namespaces.NamespaceRequired(ctx)
	if err != nil {
		return nil, err
	}

	if err := identifiers.Validate(id); err != nil {
		return nil, errors.Wrapf(err, "invalid task id")
	}

	ropts, err := r.getRuncOptions(ctx, id)
	if err != nil {
		return nil, err
	}

启动 shim 进程

/usr/bin/containerd-shim-runc-v1 -namespace k8s.io -id d84185af26fcc146b4787ed08543c49d327bb97171ed6b669618f9793a8545fc -address /run/containerd/containerd.sock

shimopt := ShimLocal(r.config, r.events)
if !r.config.NoShim {
	var cgroup string
	if opts.TaskOptions != nil {
		v, err := typeurl.UnmarshalAny(opts.TaskOptions)
		if err != nil {
			return nil, err
		}
		cgroup = v.(*runctypes.CreateOptions).ShimCgroup
	}
	exitHandler := func() {
		log.G(ctx).WithField("id", id).Info("shim reaped")

		if _, err := r.tasks.Get(ctx, id); err != nil {
			// Task was never started or was already successfully deleted
			return
		}

		if err = r.cleanupAfterDeadShim(context.Background(), bundle, namespace, id); err != nil {
			log.G(ctx).WithError(err).WithFields(logrus.Fields{
				"id":        id,
				"namespace": namespace,
			}).Warn("failed to clean up after killed shim")
		}
	}
	shimopt = ShimRemote(r.config, r.address, cgroup, exitHandler)
}

与 shim 建立 gRPC 连接，发送 CreateTaskRequest

sopts := &shim.CreateTaskRequest{
	ID:         id,
	Bundle:     bundle.path,
	Runtime:    rt,
	Stdin:      opts.IO.Stdin,
	Stdout:     opts.IO.Stdout,
	Stderr:     opts.IO.Stderr,
	Terminal:   opts.IO.Terminal,
	Checkpoint: opts.Checkpoint,
	Options:    opts.TaskOptions,
}
for _, m := range opts.Rootfs {
	sopts.Rootfs = append(sopts.Rootfs, &types.Mount{
		Type:    m.Type,
		Source:  m.Source,
		Options: m.Options,
	})
}
cr, err := s.Create(ctx, sopts)
if err != nil {
	return nil, errdefs.FromGRPC(err)
}

启动任务的 StartRequest 走的是同样的流程

6. 更新 sandbox 状态为 ready

if err := sandbox.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) {
	// Set the pod sandbox as ready after successfully start sandbox container.
	status.Pid = task.Pid()
	status.State = sandboxstore.StateReady
	status.CreatedAt = info.CreatedAt
	return status, nil
}); err != nil {
	return nil, errors.Wrap(err, "failed to update sandbox status")
}

总结：

RunPodSandbox 获取配置，生成 ID 和 name，注册 name <--> id 映射关系，防止重复并发创建

确保 image 在本地节点存在，不存在则 pull image

获取 runtime，根据 pod 注解，以及配置文件，如果 untrusted 则返回该 runtime，否则返回默认 runtime

为 sandbox 创建网络，与 docker-shim 不一样的是这个先创建网络

生成 spec 配置

发送 gRPC 创建以及启动请求，成功后将 sandbox 状态改为 ready

【containerd 源码分析】containerd cri PodRunSandbox 源码分析之二