1 概述:
1.1 环境
版本信息如下:
a、操作系统:centos 7.6
c、kubernetes版本:v1.15.0
1.2 exec原理概述
为进入目标pod的目标容器中执行命令(挂载标准输入和输出、标准错误的情景),kubectl exec访问kube-apiserver的connect接口(中间过程是通过http协议来握手,之后升级为spdy协议),kube-apiserver把请求转发至对应节点的kubelet进程,而kubelet进程此时是一个反向代理,再把请求转发至cri shim程序(kubelet进程中实现了docker shim),cri shim程序再调用容器运行时的exec接口。当cri是docker时,kubelet进程和docker shim可通过localhost网卡来通信。
补充说明:
1)v1.18版本开始,必须是通过kubelet代理客户端的streaming请求。
2 源码简析:
2.1 kube-apiserver侧
kube-apiserver是无状态的web服务,exec接口的注册:
// 注册http handler,重点看restfulConnectResource(...)。
func (a *APIInstaller) registerResourceHandlers(path string, storage rest.Storage, ws *restful.WebService) (*metav1.APIResource, error) {
/*
其他代码
*/
switch action.Verb {
case "CONNECT":
for _, method := range connecter.ConnectMethods() {
connectProducedObject := storageMeta.ProducesObject(method)
if connectProducedObject == nil {
connectProducedObject = "string"
}
doc := "connect " + method + " requests to " + kind
if isSubresource {
doc = "connect " + method + " requests to " + subresource + " of " + kind
}
// http handler,主要是restfulConnectResource(...)
handler := metrics.InstrumentRouteFunc(action.Verb, group, version, resource, subresource, requestScope, metrics.APIServerComponent, restfulConnectResource(connecter, reqScope, admit, path, isSubresource))
// 创建http路由
route := ws.Method(method).Path(action.Path).
To(handler).
Doc(doc).
Operation("connect" + strings.Title(strings.ToLower(method)) + namespaced + kind + strings.Title(subresource) + operationSuffix).
Produces("*/*").
Consumes("*/*").
Writes(connectProducedObject)
// 新增的http路由放入切片
routes = append(routes, route)
}
}
for _, route := range routes {
route.Metadata(ROUTE_META_GVK, metav1.GroupVersionKind{
Group: reqScope.Kind.Group,
Version: reqScope.Kind.Version,
Kind: reqScope.Kind.Kind,
})
route.Metadata(ROUTE_META_ACTION, strings.ToLower(action.Verb)
// 把http路由注册到ws对象
ws.Route(route)
}
/*
其他代码
*/
}
// restfulConnectResource adapts handlers.ConnectResource to the go-restful
// RouteFunction signature. A fresh connect handler is built for every request
// and invoked with the underlying ResponseWriter and *http.Request.
func restfulConnectResource(connecter rest.Connecter, scope handlers.RequestScope, admit admission.Interface, restPath string, isSubresource bool) restful.RouteFunction {
	return func(req *restful.Request, res *restful.Response) {
		serve := handlers.ConnectResource(connecter, &scope, admit, restPath, isSubresource)
		serve(res.ResponseWriter, req.Request)
	}
}
// ConnectResource builds the http.HandlerFunc that serves CONNECT requests
// for the given connecter. The excerpt elides the code that derives ctx, name
// and opts from the incoming request.
func ConnectResource(connecter rest.Connecter, scope *RequestScope, admit admission.Interface, restPath string, isSubresource bool) http.HandlerFunc {
return func(w http.ResponseWriter, req *http.Request) {
/*
other code omitted
*/
requestInfo, _ := request.RequestInfoFrom(ctx)
metrics.RecordLongRunning(req, requestInfo, metrics.APIServerComponent, func() {
// For pod exec the connecter is an ExecREST (per the original author's note).
// Its Connect(...) method returns the http handler that serves the request.
handler, err := connecter.Connect(ctx, name, opts, &responder{scope: scope, req: req, w: w})
if err != nil {
scope.err(err, w, req)
return
}
// Serve the kubectl exec request.
handler.ServeHTTP(w, req)
})
}
}
// Connect returns the http.Handler that serves a pod exec request.
// It rejects options that are not *api.PodExecOptions, resolves the backend
// location and transport through pod.ExecLocation, and wraps both in an
// upgrade-aware proxy handler, so kube-apiserver acts as a reverse proxy
// in front of the target kubelet.
func (r *ExecREST) Connect(ctx context.Context, name string, opts runtime.Object, responder rest.Responder) (http.Handler, error) {
	podExecOpts, isExecOpts := opts.(*api.PodExecOptions)
	if !isExecOpts {
		return nil, fmt.Errorf("invalid options object: %#v", opts)
	}

	// target is the URL of the kubelet endpoint to proxy to.
	target, rt, err := pod.ExecLocation(r.Store, r.KubeletConn, ctx, name, podExecOpts)
	if err != nil {
		return nil, err
	}

	// wrapTransport=false, upgradeRequired=true, interceptRedirects=true.
	proxyHandler := newThrottledUpgradeAwareProxyHandler(target, rt, false, true, true, responder)
	return proxyHandler, nil
}
// newThrottledUpgradeAwareProxyHandler creates a *proxy.UpgradeAwareHandler via
// proxy.NewUpgradeAwareHandler and then configures redirect interception,
// same-host redirect validation (both subject to feature gates) and the
// per-connection bandwidth limit before returning it.
func newThrottledUpgradeAwareProxyHandler(location *url.URL, transport http.RoundTripper, wrapTransport, upgradeRequired, interceptRedirects bool, responder rest.Responder) *proxy.UpgradeAwareHandler {
	h := proxy.NewUpgradeAwareHandler(location, transport, wrapTransport, upgradeRequired, proxy.NewErrorResponder(responder))

	gates := utilfeature.DefaultFeatureGate
	// Redirects are intercepted only when the caller asks for it AND the gate is on.
	h.InterceptRedirects = interceptRedirects && gates.Enabled(genericfeatures.StreamingProxyRedirects)
	h.RequireSameHostRedirects = gates.Enabled(genericfeatures.ValidateProxyRedirects)
	h.MaxBytesPerSec = capabilities.Get().PerConnectionBandwidthLimitBytesPerSec

	return h
}
2.2 kubelet侧
kubelet除了启动很多协程做for循环,还会启动web服务,web服务中就包含了/exec接口。
// InstallDebuggingHandlers registers the container-debugging endpoints
// (for example /exec and /log) on the kubelet web server.
// Every /exec route shown here is served by getExec(...).
// When criHandler is non-nil it is additionally mounted under "/cri/".
func (s *Server) InstallDebuggingHandlers(criHandler http.Handler) {
klog.Infof("Adding debug handlers to kubelet server.")
ws := new(restful.WebService)
/*
other code omitted
*/
// A fresh WebService rooted at /exec; GET and POST are accepted both with
// and without the pod uid path segment.
ws = new(restful.WebService)
ws.
Path("/exec")
ws.Route(ws.GET("/{podNamespace}/{podID}/{containerName}").
To(s.getExec).
Operation("getExec"))
ws.Route(ws.POST("/{podNamespace}/{podID}/{containerName}").
To(s.getExec).
Operation("getExec"))
ws.Route(ws.GET("/{podNamespace}/{podID}/{uid}/{containerName}").
To(s.getExec).
Operation("getExec"))
ws.Route(ws.POST("/{podNamespace}/{podID}/{uid}/{containerName}").
To(s.getExec).
Operation("getExec"))
s.restfulCont.Add(ws)
/*
other code omitted
*/
// NOTE(review): the elided code above presumably reassigns ws to another
// WebService before this Add — confirm against the full source.
s.restfulCont.Add(ws)
if criHandler != nil {
s.restfulCont.Handle("/cri/", criHandler)
}
}
kubelet是一个反向代理,代理来自kube-apiserver的streaming请求
// getExec handles the kubelet /exec endpoint. It resolves the pod, asks the
// host for a streaming URL, and then either redirects the client to that URL
// or proxies the stream itself. Validation code has been elided.
func (s *Server) getExec(request *restful.Request, response *restful.Response) {
params := getExecRequestParams(request)
streamOpts, err := remotecommandserver.NewOptions(request.Request)
/*
validation code omitted
*/
pod, ok := s.host.GetPodByName(params.podNamespace, params.podName)
/*
validation code omitted
*/
podFullName := kubecontainer.GetPodFullName(pod)
// Author's note: url is effectively 127.0.0.1:<port>/exec/{token}, i.e. the
// docker shim streaming endpoint.
url, err := s.host.GetExec(podFullName, params.podUID, params.containerName, params.cmd, *streamOpts)
/*
validation code omitted
*/
// Tell the client to redirect to url.
if s.redirectContainerStreaming {
http.Redirect(response.ResponseWriter, request.Request, url.String(), http.StatusFound)
return
}
// Here the kubelet acts as a reverse proxy (comparable to nginx or traefik);
// url is the concrete service instance behind the proxy.
// Author's note: from v1.18 on execution always reaches this point, i.e. the
// client's streaming request must be proxied by the kubelet.
proxyStream(response.ResponseWriter, request.Request, url)
}
// proxyStream proxies the streaming request r to url. It follows the same
// pattern as kube-apiserver: build an UpgradeAwareHandler and call ServeHTTP.
func proxyStream(w http.ResponseWriter, r *http.Request, url *url.URL) {
	// Arguments after url: transport=nil, wrapTransport=false, upgradeRequired=true.
	upgradeProxy := proxy.NewUpgradeAwareHandler(url, nil, false, true, &responder{})
	upgradeProxy.ServeHTTP(w, r)
}
2.3 UpgradeAwareHandler
这是一个工具包下的一个结构体,专门作为反向代理。
// ServeHTTP first lets tryUpgrade handle the request; if that reports the
// request as handled, nothing else is done. Otherwise the request is forwarded
// through a standard single-host reverse proxy. Unrelated code has been elided.
func (h *UpgradeAwareHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
// h.tryUpgrade(...) handles the request when it is a protocol-upgrade
// (long-lived connection) request.
// Author's note: for kubectl exec, h.tryUpgrade(...) returns true.
if h.tryUpgrade(w, req) {
return
}
/*
other code omitted
*/
// Reaching this point means the request was not an upgrade request.
newReq := req.WithContext(context.Background())
newReq.Header = utilnet.CloneHeader(req.Header)
if !h.UseRequestLocation {
// loc is defined in the elided code above.
newReq.URL = &loc
}
// Classic reverse-proxy usage.
proxy := httputil.NewSingleHostReverseProxy(&url.URL{Scheme: h.Location.Scheme, Host: h.Location.Host})
proxy.Transport = h.Transport
proxy.FlushInterval = h.FlushInterval
proxy.ServeHTTP(w, newReq)
}
重点是看UpgradeAwareHandler对象的tryUpgrade(…)方法。tryUpgrade(…)方法里面核心的业务逻辑是拿着tcp连接做流拷贝。
// tryUpgrade handles protocol-upgrade requests. It returns false right away
// when req is not an upgrade request. Otherwise it connects to the backend,
// hijacks the client connection, copies bytes in both directions on two
// goroutines until one direction finishes, and returns true.
// Validation and error-handling code has been elided from this excerpt.
func (h *UpgradeAwareHandler) tryUpgrade(w http.ResponseWriter, req *http.Request) bool {
// Return immediately when the request does not ask for a protocol upgrade.
if !httpstream.IsUpgradeRequest(req) {
klog.V(6).Infof("Request was not an upgrade")
return false
}
var (
// backendConn is the connection to the proxied backend
// (author's note: kubelet -> CRI shim when running inside the kubelet).
backendConn net.Conn
rawResponse []byte
err error
)
location := *h.Location
if h.UseRequestLocation {
location = *req.URL
location.Scheme = h.Location.Scheme
location.Host = h.Location.Host
}
// Clone the request to keep the original protocol and headers; the URL is
// replaced further down.
clone := utilnet.CloneRequest(req)
utilnet.AppendForwardedForHeader(clone)
if h.InterceptRedirects {
backendConn, rawResponse, err = utilnet.ConnectWithRedirects(req.Method, &location, clone.Header, req.Body, utilnet.DialerFunc(h.DialForUpgrade), h.RequireSameHostRedirects)
} else {
// Dial the backend directly.
clone.URL = &location
backendConn, err = h.DialForUpgrade(clone)
}
/*
validation code omitted
*/
defer backendConn.Close()
// Read the backend's response, replaying any bytes already captured in
// rawResponse before reading from the connection itself.
// NOTE(review): the elided checks presumably return early when the backend
// rejected the upgrade — confirm against the full source.
backendHTTPResponse, headerBytes, err := getResponse(io.MultiReader(bytes.NewReader(rawResponse), backendConn))
requestHijacker, ok := w.(http.Hijacker)
/*
validation code omitted
*/
// Take over the TCP connection between the client and this proxy.
requestHijackedConn, _, err := requestHijacker.Hijack()
/*
validation code omitted
*/
defer requestHijackedConn.Close()
/*
other code omitted
*/
writerComplete := make(chan struct{})
readerComplete := make(chan struct{})
// Start two goroutines dedicated to stream copying.
go func() {
// Client -> backend; rate-limited when MaxBytesPerSec is positive.
var writer io.WriteCloser
if h.MaxBytesPerSec > 0 {
writer = flowrate.NewWriter(backendConn, h.MaxBytesPerSec)
} else {
writer = backendConn
}
_, err := io.Copy(writer, requestHijackedConn)
if err != nil && !strings.Contains(err.Error(), "use of closed network connection") {
klog.Errorf("Error proxying data from client to backend: %v", err)
}
close(writerComplete)
}()
go func() {
// Backend -> client; rate-limited when MaxBytesPerSec is positive.
var reader io.ReadCloser
if h.MaxBytesPerSec > 0 {
reader = flowrate.NewReader(backendConn, h.MaxBytesPerSec)
} else {
reader = backendConn
}
// io.Copy(...) blocks for the lifetime of the stream.
// Author's note: it only returns after the command running in the
// container has exited.
_, err := io.Copy(requestHijackedConn, reader)
if err != nil && !strings.Contains(err.Error(), "use of closed network connection") {
klog.Errorf("Error proxying data from backend to client: %v", err)
}
close(readerComplete)
}()
// Block until either of the two goroutines above finishes.
select {
case <-writerComplete:
case <-readerComplete:
}
klog.V(6).Infof("Disconnecting from backend proxy %s\n Headers: %v", &location, clone.Header)
return true
}
kubelet访问docker shim,发送的升级协议的请求的格式:
协议是:SPDY/3.1
主机是:127.0.0.1:端口
uri是:/exec/{一个token}
2.4 docker shim侧
docker shim是kubelet的一部分,在创建kubelet对象的时候,就会创建和启动docker shim。
// NewMainKubelet (abridged pseudo-code) creates the Kubelet object. When the
// container runtime is docker it also creates the docker shim service, wraps
// it in a docker server and starts that server.
func NewMainKubelet(...) {
/*
other code omitted
*/
// Create the kubelet object.
klet := &Kubelet{
...
}
switch containerRuntime {
case kubetypes.DockerContainerRuntime: // the container runtime is docker
// ds is the docker shim. Its last argument (startLocalStreamingServer) is
// the negation of RedirectContainerStreaming.
ds, err := dockershim.NewDockerService(kubeDeps.DockerClientConfig, crOptions.PodSandboxImage, streamingConfig,
&pluginSettings, runtimeCgroups, kubeCfg.CgroupDriver, crOptions.DockershimRootDirectory, !crOptions.RedirectContainerStreaming)
if err != nil {
return nil, err
}
if crOptions.RedirectContainerStreaming {
// Expose ds as the kubelet's criHandler.
klet.criHandler = ds
}
// server is a wrapper around ds.
server := dockerremote.NewDockerServer(remoteRuntimeEndpoint, ds)
// Start the docker shim server.
if err := server.Start(); err != nil {
return nil, err
}
/*
other code omitted
*/
}
/*
other code omitted
*/
return klet, nil
}
docker shim中包含了stream server,stream server是它的一个属性。
// NewDockerService builds the docker shim service. When streamingConfig is
// non-nil it also creates a streaming server and stores it on the service.
// Code that defines c, client and checkpointManager has been elided.
func NewDockerService(config *ClientConfig, podSandboxImage string, streamingConfig *streaming.Config, pluginSettings *NetworkPluginSettings,
cgroupsName string, kubeCgroupDriver string, dockershimRootDir string, startLocalStreamingServer bool) (DockerService, error) {
/*
other code omitted
*/
ds := &dockerService{
client: c,
os: kubecontainer.RealOS{},
podSandboxImage: podSandboxImage,
streamingRuntime: &streamingRuntime{
client: client,
execHandler: &NativeExecHandler{},
},
containerManager: cm.NewContainerManager(cgroupsName, client),
checkpointManager: checkpointManager,
startLocalStreamingServer: startLocalStreamingServer,
networkReady: make(map[string]bool),
containerCleanupInfos: make(map[string]*containerCleanupInfo),
}
// Create the streaming server.
if streamingConfig != nil {
var err error
// The streaming server is a field of the docker service.
ds.streamingServer, err = streaming.NewServer(*streamingConfig, ds.streamingRuntime)
if err != nil {
return nil, err
}
}
/*
other code omitted
*/
return ds, nil
}
stream server(一个web server)的构造方法如下,得知/exec接口的处理器是stream server的serveExec(…)方法。
// NewServer builds the streaming server (a web server) around the given
// runtime. If no BaseURL is configured, one is derived from config.Addr, using
// "https" when a TLS config is present and "http" otherwise. The /exec,
// /attach and /portforward endpoints are registered for both GET and POST
// under the directory of BaseURL.Path.
func NewServer(config Config, runtime Runtime) (Server, error) {
	srv := &server{
		config:  config,
		runtime: &criAdapter{runtime},
		cache:   newRequestCache(),
	}

	if srv.config.BaseURL == nil {
		scheme := "http"
		if srv.config.TLSConfig != nil {
			scheme = "https"
		}
		srv.config.BaseURL = &url.URL{
			Scheme: scheme,
			Host:   srv.config.Addr,
		}
	}

	type endpoint struct {
		path    string
		handler restful.RouteFunction
	}
	streamEndpoints := []endpoint{
		{path: "/exec/{token}", handler: srv.serveExec}, // handler of the /exec endpoint
		{path: "/attach/{token}", handler: srv.serveAttach},
		{path: "/portforward/{token}", handler: srv.servePortForward},
	}

	// Register every endpoint for each supported HTTP method.
	ws := &restful.WebService{}
	prefix := path.Dir(srv.config.BaseURL.Path)
	for _, ep := range streamEndpoints {
		routePath := path.Join(prefix, ep.path)
		for _, verb := range []string{"GET", "POST"} {
			ws.Route(ws.Method(verb).Path(routePath).To(ep.handler))
		}
	}

	container := restful.NewContainer()
	container.Add(ws)
	srv.handler = container

	srv.server = &http.Server{
		Addr:      srv.config.Addr,
		Handler:   srv.handler,
		TLSConfig: srv.config.TLSConfig,
	}
	return srv, nil
}
2.5 docker shim中的stream server
stream server的/exec接口的处理器
// serveExec is the handler of the streaming server's /exec/{token} endpoint.
// It consumes the cached request identified by the token, answers 404 when the
// token is unknown or the cached entry is not an exec request, and otherwise
// delegates to remotecommandserver.ServeExec.
func (s *server) serveExec(req *restful.Request, resp *restful.Response) {
	// Look up (and consume) the cached request for this token.
	cached, found := s.cache.Consume(req.PathParameter("token"))
	if !found {
		http.NotFound(resp.ResponseWriter, req.Request)
		return
	}

	execReq, isExec := cached.(*runtimeapi.ExecRequest)
	if !isExec {
		http.NotFound(resp.ResponseWriter, req.Request)
		return
	}

	opts := remotecommandserver.Options{
		Stdin:  execReq.Stdin,
		Stdout: execReq.Stdout,
		Stderr: execReq.Stderr,
		TTY:    execReq.Tty,
	}

	// Author's note: after the command in the container exits, the user's
	// terminal stays blocked until remotecommandserver.ServeExec(...) returns.
	// The pod name and pod UID arguments are passed as empty strings.
	remotecommandserver.ServeExec(
		resp.ResponseWriter,
		req.Request,
		s.runtime,
		"",
		"",
		execReq.ContainerId,
		execReq.Cmd,
		&opts,
		s.config.StreamIdleTimeout,
		s.config.StreamCreationTimeout,
		s.config.SupportedRemoteCommandProtocols)
}
静态方法ServeExec(…)
// ServeExec establishes the streams with the client, runs cmd in the container
// through executor, and writes the outcome to the client:
//   - a success status when the executor returns nil,
//   - a failure status carrying the exit code when the command exited non-zero,
//   - an internal error for any other failure (also reported via HandleError).
//
// The stream connection is closed when the function returns.
func ServeExec(w http.ResponseWriter, req *http.Request, executor Executor, podName string, uid types.UID, container string, cmd []string, streamOpts *Options, idleTimeout, streamCreationTimeout time.Duration, supportedProtocols []string) {
	// ctx bundles the connection together with the stdin/stdout/stderr streams.
	ctx, ok := createStreams(req, w, streamOpts, supportedProtocols, idleTimeout, streamCreationTimeout)
	if !ok {
		// createStreams has already reported the error.
		return
	}
	defer ctx.conn.Close()

	// Author's note: the executor is implemented by criAdapter, which
	// ultimately asks the docker daemon to exec via the docker client.
	err := executor.ExecInContainer(podName, uid, container, cmd, ctx.stdinStream, ctx.stdoutStream, ctx.stderrStream, ctx.tty, ctx.resizeChan, 0)
	if err == nil {
		ctx.writeStatus(&apierrors.StatusError{ErrStatus: metav1.Status{
			Status: metav1.StatusSuccess,
		}})
		return
	}

	exitErr, isExitErr := err.(utilexec.ExitError)
	if isExitErr && exitErr.Exited() {
		code := exitErr.ExitStatus()
		exitCause := metav1.StatusCause{
			Type:    remotecommandconsts.ExitCodeCauseType,
			Message: fmt.Sprintf("%d", code),
		}
		ctx.writeStatus(&apierrors.StatusError{ErrStatus: metav1.Status{
			Status: metav1.StatusFailure,
			Reason: remotecommandconsts.NonZeroExitCodeReason,
			Details: &metav1.StatusDetails{
				Causes: []metav1.StatusCause{exitCause},
			},
			Message: fmt.Sprintf("command terminated with non-zero exit code: %v", exitErr),
		}})
		return
	}

	err = fmt.Errorf("error executing command in container: %v", err)
	runtime.HandleError(err)
	ctx.writeStatus(apierrors.NewInternalError(err))
}
结构体criAdapter的ExecInContainer(…)方法本质是调用结构体streamingRuntime的属性execHandler的方法ExecInContainer(…)
// ExecInContainer forwards the call to a.Runtime.Exec. The podName, podUID and
// timeout arguments are not passed on.
// Author's note: a.Runtime is implemented by streamingRuntime, whose
// execHandler field (a NativeExecHandler) performs the actual exec.
func (a *criAdapter) ExecInContainer(podName string, podUID types.UID, container string, cmd []string, in io.Reader, out, err io.WriteCloser, tty bool, resize <-chan remotecommand.TerminalSize, timeout time.Duration) error {
	execErr := a.Runtime.Exec(container, cmd, in, out, err, tty, resize)
	return execErr
}
结构体streamingRuntime的属性execHandler的实现就是NativeExecHandler
/*
type streamingRuntime struct {
client libdocker.Interface
execHandler ExecHandler // implemented by NativeExecHandler
}
*/
// ExecInContainer runs cmd in the given container through the docker client:
// it calls client.CreateExec(...) and then client.StartExec(...), forwards
// terminal resize events once the exec has started, and finally polls
// client.InspectExec(...) to derive the return value from the exit code.
// The timeout parameter is not used in this body.
func (*NativeExecHandler) ExecInContainer(client libdocker.Interface, container *dockertypes.ContainerJSON, cmd []string, stdin io.Reader, stdout, stderr io.WriteCloser, tty bool, resize <-chan remotecommand.TerminalSize, timeout time.Duration) error {
// done is closed when this function returns, which releases the resize
// goroutine below if the exec never started.
done := make(chan struct{})
defer close(done)
createOpts := dockertypes.ExecConfig{
Cmd: cmd,
AttachStdin: stdin != nil,
AttachStdout: stdout != nil,
AttachStderr: stderr != nil,
Tty: tty,
}
execObj, err := client.CreateExec(container.ID, createOpts)
if err != nil {
return fmt.Errorf("failed to exec in container - Exec setup failed - %v", err)
}
// Send tty resize requests to docker.
execStarted := make(chan struct{})
go func() {
select {
case <-execStarted:
// client.StartExec has started the exec, so we can start resizing
case <-done:
// ExecInContainer has returned, so short-circuit
return
}
kubecontainer.HandleResizing(resize, func(size remotecommand.TerminalSize) {
client.ResizeExecTTY(execObj.ID, uint(size.Height), uint(size.Width))
})
}()
startOpts := dockertypes.ExecStartCheck{Detach: false, Tty: tty}
streamOpts := libdocker.StreamOptions{
InputStream: stdin,
OutputStream: stdout,
ErrorStream: stderr,
RawTerminal: tty,
ExecStarted: execStarted,
}
// While StartExec is running the user can type commands into the container.
// Author's note: client.StartExec(...) only returns once the command has
// finished.
err = client.StartExec(execObj.ID, startOpts, streamOpts)
if err != nil {
return err
}
// Author's note: at this point the user's terminal is still blocked, and it
// stays blocked even after this method returns. It is only released (stdin,
// stdout and stderr detached from the container) once the caller
// remotecommandserver.ServeExec(...) returns.
// The exit code of the command decides whether nil or an error is returned.
ticker := time.NewTicker(2 * time.Second)
defer ticker.Stop()
count := 0
for {
inspect, err2 := client.InspectExec(execObj.ID)
if err2 != nil {
return err2
}
if !inspect.Running {
// A non-zero exit code is reported as a dockerExitError.
if inspect.ExitCode != 0 {
err = &dockerExitError{inspect}
}
break // leave the polling loop
}
// Give up after five inspections that still report a running process.
count++
if count == 5 {
klog.Errorf("Exec session %s in container %s terminated but process still running!", execObj.ID, container.ID)
break
}
<-ticker.C
}
return err
}
当在容器中执行的指令执行完成,client.StartExec(…)进行返回,然后进入for循环探测该命令是否退出、退出码是不是0。
说明:
但用户的xshell终端依然是处于卡住状态,直到上层方法remotecommandserver.ServeExec(…)退出。
3 总结:
kubectl exec经过两个反向代理(先是kube-apiserver,然后是kubelet),到达docker shim(虽然还是属于kubelet进程,kubelet和docker shim通过127.0.0.1通信)的stream server,stream server调用docker client访问docker daemon的/exec接口。