docker原理 Docker概述

Docker原理 11

Linux Namespace 11

AUFS文件系统 17

重新理解Docker的各种命令 18

Docker原理

Linux Namespace

docker是一个容器引擎，容器要求对进程空间、用户空间、网络空间、硬盘空间等做隔离。docker早期版本的底层是使用LXC实现的（后来的版本改用docker自己的libcontainer/runc），而LXC则使用Linux Namespace技术对各种资源做隔离。

Linux Namespace是Linux提供的一种内核级别环境隔离的方法, 隔离的资源包括：Mount、UTS、IPC、PID、Network、User。篇幅限制，本文只介绍UTS、PID和Mount的隔离。

网上找来一段代码：

#define _GNU_SOURCE /* must come before every #include so that <sched.h> declares clone() */

#include <sys/types.h>
#include <sys/wait.h>

#include <errno.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <string.h> /* strerror(), used by main() when clone() fails */
#include <unistd.h>

/* Size of the stack handed to clone(): 1 MiB. */
#define STACK_SIZE (1024 * 1024)

/*
 * Stack memory for the cloned child. main() passes the END of this buffer to
 * clone(), so the buffer is explicitly aligned: with a 16-byte aligned start
 * and a size that is a multiple of 16, the end address is 16-byte aligned as
 * well, which satisfies the stack-alignment rules of the common ABIs.
 */
static char container_stack[STACK_SIZE] __attribute__((aligned(16)));

/* argv of the program exec'd in the child; NULL-terminated as execv() requires. */
char *const container_args[] = {
    "/bin/bash",
    NULL
};


/*
 * Entry function of the cloned child: announces itself and then replaces the
 * process image with the shell named in container_args.
 *
 * arg: the pointer given as the last argument of clone(); not used here.
 *
 * Returns 1, and only when execv() failed (a successful execv() never returns).
 */
int container_main(void *arg)
{
    (void)arg;

    printf("Container - inside the container!\n");

    /* Run a shell directly so we can inspect whether this process's resources are isolated. */
    execv(container_args[0], container_args);

    /*
     * Reached only if execv() failed. Report the errno reason on stderr, which
     * is unbuffered, instead of a generic message on stdout.
     */
    perror("Something's wrong! execv");
    return 1;
}


/*
 * Starts container_main() in a child created with clone() and waits for it.
 *
 * Returns 0 once the child has been reaped, -1 if clone() or waitpid() fails.
 */
int main(void)
{
    printf("Parent - start a container!\n");

    /*
     * clone() takes the child's entry function and a stack for it. The END of
     * the buffer is passed because the stack grows downward. SIGCHLD is the
     * signal delivered to the parent when the child terminates.
     */
    int container_pid = clone(container_main, container_stack + STACK_SIZE,
                              SIGCHLD, NULL);
    if (container_pid < 0) {
        fprintf(stderr, "clone failed WTF!!!! %s\n", strerror(errno));
        return -1;
    }

    /* Wait for the child to finish; do not claim it stopped if waiting failed. */
    if (waitpid(container_pid, NULL, 0) < 0) {
        fprintf(stderr, "waitpid failed: %s\n", strerror(errno));
        return -1;
    }

    printf("Parent - container stopped!\n");
    return 0;
}

代码比较简单，就是用clone系统调用生成一个新的子进程，并运行container_main函数。

运行结果：

$./simple_clone

Parent - start a container!

Container - inside the container!


$ps aux |head

USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND

root 1 0.0 0.0 41376 3520 ? Ss 10:25 0:07 /usr/lib/systemd/systemd --switched-root --system --deserialize 21

root 2 0.0 0.0 0 0 ? S 10:25 0:00 [kthreadd]

root 3 0.0 0.0 0 0 ? S 10:25 0:02 [ksoftirqd/0]

root 6 0.0 0.0 0 0 ? S 10:25 0:16 [kworker/u30:0]

root 7 0.0 0.0 0 0 ? S 10:25 0:01 [migration/0]

root 8 0.0 0.0 0 0 ? S 10:25 0:00 [rcu_bh]

root 9 0.0 0.0 0 0 ? S 10:25 0:00 [rcuob/0]

root 10 0.0 0.0 0 0 ? S 10:25 0:00 [rcuob/1]

root 11 0.0 0.0 0 0 ? S 10:25 0:00 [rcuob/2]

可以看到，在进入了子进程后看到的跟父进程完全一样。

我们往上段代码的clone函数中加入CLONE_NEWUTS flag, 并且在container_main函数中设置主机名：

#define _GNU_SOURCE /* must come before every #include so that <sched.h> declares clone() */

#include <sys/types.h>
#include <sys/wait.h>

#include <errno.h>  /* errno, read by main() when clone() fails */
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <string.h> /* strerror() */
#include <unistd.h> /* sethostname(), execv() */

/* Size of the stack handed to clone(): 1 MiB. */
#define STACK_SIZE (1024 * 1024)

/*
 * Stack memory for the cloned child. main() passes the END of this buffer to
 * clone(), so the buffer is explicitly aligned: with a 16-byte aligned start
 * and a size that is a multiple of 16, the end address is 16-byte aligned as
 * well, which satisfies the stack-alignment rules of the common ABIs.
 */
static char container_stack[STACK_SIZE] __attribute__((aligned(16)));

/* argv of the program exec'd in the child; NULL-terminated as execv() requires. */
char *const container_args[] = {
    "/bin/bash",
    NULL
};


/*
 * Entry function of the cloned child: sets the hostname (main() clones with
 * CLONE_NEWUTS) and then replaces the process image with the shell named in
 * container_args.
 *
 * arg: the pointer given as the last argument of clone(); not used here.
 *
 * Returns 1, and only when execv() failed (a successful execv() never returns).
 */
int container_main(void *arg)
{
    static const char hostname[] = "container";

    (void)arg;

    printf("Container - inside the container!\n");

    /*
     * Set the hostname. The length argument counts the characters of the name
     * only, so the terminating NUL is excluded (the original passed 10 for a
     * 9-character name). A failure is reported but not fatal: the shell is
     * still started, as before.
     */
    if (sethostname(hostname, sizeof hostname - 1) != 0) {
        perror("sethostname");
    }

    /* Run a shell directly so we can inspect whether this process's resources are isolated. */
    execv(container_args[0], container_args);

    /*
     * Reached only if execv() failed. Report the errno reason on stderr, which
     * is unbuffered, instead of a generic message on stdout.
     */
    perror("Something's wrong! execv");
    return 1;
}


/*
 * Starts container_main() in a child that lives in a new UTS namespace and
 * waits for it. The article runs this program via sudo.
 *
 * Returns 0 once the child has been reaped, -1 if clone() or waitpid() fails.
 */
int main(void)
{
    printf("Parent - start a container!\n");

    /*
     * clone() takes the child's entry function and a stack for it. The END of
     * the buffer is passed because the stack grows downward.
     * CLONE_NEWUTS gives the child its own hostname; SIGCHLD is the signal
     * delivered to the parent when the child terminates.
     */
    int container_pid = clone(container_main, container_stack + STACK_SIZE,
                              CLONE_NEWUTS | SIGCHLD, NULL);
    if (container_pid < 0) {
        /* Errors go to stderr; the returned pid is always -1 here, so it is not printed. */
        fprintf(stderr, "clone failed WTF!!!! %s\n", strerror(errno));
        return -1;
    }

    /* Wait for the child to finish; do not claim it stopped if waiting failed. */
    if (waitpid(container_pid, NULL, 0) < 0) {
        fprintf(stderr, "waitpid failed: %s\n", strerror(errno));
        return -1;
    }

    printf("Parent - container stopped!\n");
    return 0;
}

运行：

$sudo ./uts

Parent - start a container!

Container - inside the container!


#hostname

container


#exit

exit

Parent - container stopped!


$

可以看到子进程中的hostname变成了container

我们接着在clone函数中加入CLONE_NEWPID flag，并在父进程和子进程中都打印出pid：

#define _GNU_SOURCE /* must come before every #include so that <sched.h> declares clone() */

#include <sys/types.h>
#include <sys/wait.h>

#include <errno.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <string.h> /* strerror(), used by main() when clone() fails */
#include <unistd.h>

/* Size of the stack handed to clone(): 1 MiB. */
#define STACK_SIZE (1024 * 1024)

/*
 * Stack memory for the cloned child. main() passes the END of this buffer to
 * clone(), so the buffer is explicitly aligned: with a 16-byte aligned start
 * and a size that is a multiple of 16, the end address is 16-byte aligned as
 * well, which satisfies the stack-alignment rules of the common ABIs.
 */
static char container_stack[STACK_SIZE] __attribute__((aligned(16)));

/* argv of the program exec'd in the child; NULL-terminated as execv() requires. */
char *const container_args[] = {
    "/bin/bash",
    NULL
};


/*
 * Entry function of the cloned child: prints its pid, sets the hostname and
 * then replaces the process image with the shell named in container_args.
 *
 * arg: the pointer given as the last argument of clone(); not used here.
 *
 * Returns 1, and only when execv() failed (a successful execv() never returns).
 */
int container_main(void *arg)
{
    static const char hostname[] = "container";

    (void)arg;

    /* getpid() returns pid_t; cast so the argument matches %d exactly. */
    printf("Container [%5d] - inside the container!\n", (int)getpid());

    /*
     * Set the hostname. The length argument counts the characters of the name
     * only, so the terminating NUL is excluded (the original passed 10 for a
     * 9-character name). A failure is reported but not fatal: the shell is
     * still started, as before.
     */
    if (sethostname(hostname, sizeof hostname - 1) != 0) {
        perror("sethostname");
    }

    /* Run a shell directly so we can inspect whether this process's resources are isolated. */
    execv(container_args[0], container_args);

    /*
     * Reached only if execv() failed. Report the errno reason on stderr, which
     * is unbuffered, instead of a generic message on stdout.
     */
    perror("Something's wrong! execv");
    return 1;
}


/*
 * Starts container_main() in a child that lives in new UTS and PID namespaces
 * and waits for it. The article runs this program via sudo.
 *
 * Returns 0 once the child has been reaped, -1 if clone() or waitpid() fails.
 */
int main(void)
{
    /* getpid() returns pid_t; cast so the argument matches %d exactly. */
    printf("Parent [%5d] - start a container!\n", (int)getpid());

    /*
     * clone() takes the child's entry function and a stack for it. The END of
     * the buffer is passed because the stack grows downward.
     * CLONE_NEWUTS isolates the hostname, CLONE_NEWPID gives the child a new
     * PID namespace (the sample run shows it as pid 1); SIGCHLD is the signal
     * delivered to the parent when the child terminates.
     */
    int container_pid = clone(container_main, container_stack + STACK_SIZE,
                              CLONE_NEWUTS | CLONE_NEWPID | SIGCHLD, NULL);
    if (container_pid < 0) {
        /* Errors go to stderr; the returned pid is always -1 here, so it is not printed. */
        fprintf(stderr, "clone failed WTF!!!! %s\n", strerror(errno));
        return -1;
    }

    /* Wait for the child to finish; do not claim it stopped if waiting failed. */
    if (waitpid(container_pid, NULL, 0) < 0) {
        fprintf(stderr, "waitpid failed: %s\n", strerror(errno));
        return -1;
    }

    printf("Parent - container stopped!\n");
    return 0;
}

运行：

$sudo ./pid

Parent [17121] - start a container!

Container [ 1] - inside the container!


#ps aux |head

USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND

root 1 0.0 0.0 41376 3520 ? Ss 10:25 0:07 /usr/lib/systemd/systemd --switched-root --system --deserialize 21

root 2 0.0 0.0 0 0 ? S 10:25 0:00 [kthreadd]

root 3 0.0 0.0 0 0 ? S 10:25 0:02 [ksoftirqd/0]

root 6 0.0 0.0 0 0 ? S 10:25 0:16 [kworker/u30:0]

root 7 0.0 0.0 0 0 ? S 10:25 0:01 [migration/0]

root 8 0.0 0.0 0 0 ? S 10:25 0:00 [rcu_bh]

root 9 0.0 0.0 0 0 ? S 10:25 0:00 [rcuob/0]

root 10 0.0 0.0 0 0 ? S 10:25 0:00 [rcuob/1]

root 11 0.0 0.0 0 0 ? S 10:25 0:00 [rcuob/2]


#

可以看到子进程的pid变成了1。这个变化很重要，意味着子进程后面的所有进程，都是挂在这个PID为1的进程后面。看起来就像是一个新的系统，而该子进程就像是pid为1的init进程。

但是从上面的ps结果也可以看到，子进程仍然可以看到父进程所在系统的所有进程，原因是父子进程中的ps命令都是去读/proc文件系统，我们需要为子进程单独mount一个proc文件系统。

接着改代码，给clone函数加一个CLONE_NEWNS flag，并在子进程中运行 mount -t proc proc /proc 命令：

#define _GNU_SOURCE /* must come before every #include so that <sched.h> declares clone() */

#include <sys/types.h>
#include <sys/wait.h>

#include <errno.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h> /* system(), used by container_main() */
#include <string.h> /* strerror(), used by main() when clone() fails */
#include <unistd.h>

/* Size of the stack handed to clone(): 1 MiB. */
#define STACK_SIZE (1024 * 1024)

/*
 * Stack memory for the cloned child. main() passes the END of this buffer to
 * clone(), so the buffer is explicitly aligned: with a 16-byte aligned start
 * and a size that is a multiple of 16, the end address is 16-byte aligned as
 * well, which satisfies the stack-alignment rules of the common ABIs.
 */
static char container_stack[STACK_SIZE] __attribute__((aligned(16)));

/* argv of the program exec'd in the child; NULL-terminated as execv() requires. */
char *const container_args[] = {
    "/bin/bash",
    NULL
};


/*
 * Entry function of the cloned child: prints its pid, sets the hostname,
 * mounts a fresh proc filesystem on /proc and then replaces the process image
 * with the shell named in container_args.
 *
 * arg: the pointer given as the last argument of clone(); not used here.
 *
 * Returns 1, and only when execv() failed (a successful execv() never returns).
 */
int container_main(void *arg)
{
    static const char hostname[] = "container";

    (void)arg;

    /* getpid() returns pid_t; cast so the argument matches %d exactly. */
    printf("Container [%5d] - inside the container!\n", (int)getpid());

    /*
     * Set the hostname. The length argument counts the characters of the name
     * only, so the terminating NUL is excluded (the original passed 10 for a
     * 9-character name). A failure is reported but not fatal: the shell is
     * still started, as before.
     */
    if (sethostname(hostname, sizeof hostname - 1) != 0) {
        perror("sethostname");
    }

    /*
     * Re-mount proc on /proc so that tools reading /proc only see this PID
     * namespace. A new mount namespace starts as a copy of the parent's,
     * including its propagation settings; where "/" is a shared mount (the
     * default under systemd) the new proc mount would propagate back to the
     * host. Marking the tree private first keeps the mount local to the child.
     * A failure is reported but not fatal, matching the best-effort original.
     */
    if (system("mount --make-rprivate / && mount -t proc proc /proc") != 0) {
        fprintf(stderr, "mounting a private /proc failed\n");
    }

    /* Run a shell directly so we can inspect whether this process's resources are isolated. */
    execv(container_args[0], container_args);

    /*
     * Reached only if execv() failed. Report the errno reason on stderr, which
     * is unbuffered, instead of a generic message on stdout.
     */
    perror("Something's wrong! execv");
    return 1;
}


/*
 * Starts container_main() in a child that lives in new UTS, PID and mount
 * namespaces and waits for it. The article runs this program via sudo.
 *
 * Returns 0 once the child has been reaped, -1 if clone() or waitpid() fails.
 */
int main(void)
{
    /* getpid() returns pid_t; cast so the argument matches %d exactly. */
    printf("Parent [%5d] - start a container!\n", (int)getpid());

    /*
     * clone() takes the child's entry function and a stack for it. The END of
     * the buffer is passed because the stack grows downward.
     * CLONE_NEWUTS isolates the hostname, CLONE_NEWPID the process IDs and
     * CLONE_NEWNS the mount table; SIGCHLD is the signal delivered to the
     * parent when the child terminates.
     */
    int container_pid = clone(container_main, container_stack + STACK_SIZE,
                              CLONE_NEWUTS | CLONE_NEWPID | CLONE_NEWNS | SIGCHLD,
                              NULL);
    if (container_pid < 0) {
        /* Errors go to stderr; the returned pid is always -1 here, so it is not printed. */
        fprintf(stderr, "clone failed WTF!!!! %s\n", strerror(errno));
        return -1;
    }

    /* Wait for the child to finish; do not claim it stopped if waiting failed. */
    if (waitpid(container_pid, NULL, 0) < 0) {
        fprintf(stderr, "waitpid failed: %s\n", strerror(errno));
        return -1;
    }

    printf("Parent - container stopped!\n");
    return 0;
}

运行：

$sudo ./mount

Parent [18594] - start a container!

Container [ 1] - inside the container!


#ps aux

USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND

root 1 1.3 0.0 116704 3276 pts/0 S 22:55 0:00 /bin/bash

root 30 0.0 0.0 139492 1632 pts/0 R+ 22:56 0:00 ps aux

可以看到，ps也只能看到子进程中的进程了。
这个时候还可以看一下，父进程实际上可以看到子进程的进程，并且pid不为1，而子进程就像一只井底之蛙，只能看到自己被隔离出来的进程。

$ps aux |grep /bin/bash

root 20323 0.0 0.0 116704 3284 pts/0 S+ 23:00 0:00 /bin/bash

lijun.s+ 20789 0.0 0.0 112644 956 pts/3 S+ 23:02 0:00 grep --color=auto /bin/bash

AUFS文件系统

docker使用的文件系统有aufs、devicemapper等，aufs是docker的首选文件系统，但是可惜没有合到Linux主干代码中，不过主流的系统像ubuntu都是支持的。而像centos这种系统不支持aufs, 就只能使用devicemapper了。
由于aufs对理解docker的layer(层)的概念更容易一些，这里介绍下aufs文件系统。

闲话不多说，找个ubuntu 12.04版本的系统做如下测试：
建两个目录d1,d2 d1中有文件a和b, d2中有文件b和c：

root@vultr:~/test_aufs/test1# ls -R

d1 d2


./d1:

a b


./d2:

b c

其中每个文件的值为： d1/a -> 'a', d1/b -> 'b1', d2/b -> 'b2', d2/c -> 'c'
用d1, d2 mount一个aufs文件系统的目录：

root@vultr:~/test_aufs/test1# mount -t aufs -o dirs=./d1:./d2 none ./mnt


root@vultr:~/test_aufs/test1# ls mnt/

a b c


root@vultr:~/test_aufs/test1# cat mnt/b

b1

可以看到mnt/b中的值为d1/b中的值，而d2/b被遮盖（丢掉）了。可见mount多个目录时，对于同名的文件，只保留按顺序第一次出现的那个。

再尝试着更改文件内容：

root@vultr:~/test_aufs/test1/mnt# echo 'new_a' > a

root@vultr:~/test_aufs/test1/mnt# cat ../d1/a

new_a

root@vultr:~/test_aufs/test1/mnt# echo 'new_b' > b

root@vultr:~/test_aufs/test1/mnt# cat ../d1/b

new_b

root@vultr:~/test_aufs/test1/mnt# cat ../d2/b

b2

root@vultr:~/test_aufs/test1/mnt# echo 'new_c' > c

root@vultr:~/test_aufs/test1/mnt# cat ../d2/c

c

root@vultr:~/test_aufs/test1/mnt# cat ../d1/c

new_c

root@vultr:~/test_aufs/test1/mnt#

前几个都好理解，注意往mnt/c中写一段内容后，d2/c的内容并没有改变，反而在d1目录下面出现了一个c,内容为mnt/c的内容，好诡异，这是什么逻辑呢。

原来mount aufs文件系统的目录时，最前面的目录是可写的，而后面的都是只读的。往mnt下面的文件写内容时，会先找到第一个可写的目录，然后更新其中的文件，如果文件不存在则会新建一个。d2被mount成只读的，所以d2/c的内容不会改变；而d1目录是可写的，所以会在d1下面新生成一个c文件。

我们还可以试着mount aufs时在目录后面加上:rw, :ro来表示读写和只读，会有不同的结果，但是原理与上段描述的一样，可以猜猜结果会是怎样。

不知道大家有没有用过Ubuntu或Fedora的live系统盘，只要插上光盘就可以运行系统，而且还可以写数据，只不过系统退出后变更的文件就找不到了。当时觉得很神奇，现在想想也正是使用了aufs这种文件系统的特性，只要将光盘和硬盘mount在一起，就可以看上去在光盘上读写数据了。

回到docker，docker的镜像其实就是一些只读层，而容器是在docker镜像上加了一层读写层，这样就可以在不更改镜像的基础上还能像普通vm一样读写数据。网上有张图比较好：

重新理解Docker的各种命令

我们了解了docker的文件系统及namespace功能后，再试图重新理解一下docker的几个命令：

docker images : 列出所有顶层的只读镜像

docker run : 先是利用只读的镜像外加一层可读写的层，并且加了一个被隔离的进程空间来创建了一个容器，然后运行指定的程序。

docker stop : 保留可读写层，收回隔离的进程空间。

docker ps -a : 列出所有包含读写层的容器，包含stop(Exit)状态的。

docker commit : 将当前容器的只读层加可读写层一起产生一个新的只读层，作为镜像。

参考：

1. Docker概述

2. docker的使用及原理知乎