Linux 内核设计与实现(linux kernel design and realization)

Author: Robert Love

command:

  • ps -el:查看系统中的进程列表
  • ps -eo state,uid,pid,ppid,rtprio,time,comm

kernel introduction

  • processor state:

    1. user mode, process state
    2. kernel mode, process state
    3. kernel mode, interrupt state
  • kernel type:

    1. monolithic kernel (单内核): the whole kernel runs in one shared address space
    2. micro kernel: separate address space, IPC
  • linux & Unix difference

    1. supports dynamic loading of kernel modules
    2. symmetric multiprocessing (SMP) support
    3. preemptive kernel
    4. threads are treated the same as ordinary processes
    5. object-oriented device model, etc.
    6. drops outdated Unix design features
    7. freedom (open source)
  • where can the source code be found? Usually under /usr/src/linux

  • kernel configuration

  • kernel installation

内核开发

内核开发的特点

  1. 不能使用 C lib or 其他standard header file
  2. 必须使用GNU C
  3. 没有类似用户空间的内存保护机制
  4. 无浮点运算
  5. 每个进程只有很短的固定长度的堆栈
  6. 同步与并发(synchronization and concurrency)在内核开发中尤为重要,因为内核支持异步中断、抢占和SMP

内核编译

切换到内核根目录,进行内核配置后,进行编译:
make -jn
拆分编译任务到多个task,提高编译效率

make -j32 > /dev/null
在16核CPU 上,多task作业,为了避免无关消息过多打印,可以将其写入/dev/null 文件,同时警告信息和错误信息仍然可以输出。

GNU C

  • inline function
    inline functions are preferred over macros in kernel development
    inline functions are usually defined in header files as static functions,
    ? with “static” before inline, the compiler does not need to create a standalone function entity for the inline function --> page 17

  • likely( ) & unlikely( )
    optimize the condition branch

  • No memory protection
    kernel memory is not pageable (it is never swapped out)

process management

In Linux, a thread is just a special kind of process

进程描述符

  • 双向队列存储进程描述符

  • 进程描述符存放至内核栈的尾端(x86系统),power pc下有专门寄存器保存进程描述符

  • process state:

    • TASK_RUNNING
    • TASK_INTERRUPTIBLE
    • TASK_UNINTERRUPTIBLE
    • __TASK_TRACED
    • __TASK_STOPPED
  • 进程家族树

    • init 进程:内核启动的最后阶段启动init进程

    • 访问父进程:
      struct task_struct *myparent = current->parent;

    • 遍历访问子进程:
      struct task_struct *task;
      struct list_head *list;
      list_for_each(list, &current->children)
      {
      task = list_entry(list, struct task_struct, sibling); //??what meaning: list_entry() 由 sibling 链表节点取回包含它的 task_struct,此时 task 指向当前进程的一个子进程
      }

    • 写时拷贝:
      进程创建被拆解为fork过程和exec过程
      fork调用时推迟甚至免除拷贝数据的技术。实际开销为复制父进程的页表以及为子进程创建唯一进程描述符

    • fork, vfork
      根据各自参数标志调用clone(),由clone调用do_fork(),在其函数体内调用copy_process(),成功返回后调用子进程执行。

      内核让子进程优先执行,一般子进程都会马上调用exec() function,可以避免写时拷贝的额外开销,如果父进程先执行,可能会开始向地址空间写入
      vfork 与fork的区别:
      不拷贝父进程页表项

      clone(SIGCHLD,0)//fork
      clone(CLONE_VFORK|CLONE_VM|SIGCHLD,0)//vfork

    • 线程实现
      在linux中,线程被视为特殊的进程,不做单独的结构区分,创建一个线程时,视为与父进程共享特定空间的进程。
      创建线程:
      clone(CLONE_VM|CLONE_FS|CLONE_FILES| CLONE_SIGHAND, 0)
      clone 标志表:page 27

  • 内核线程

    • 内核线程只能由现有内核线程进行创建
    • 创建方法:
      struct task_struct *kthread_create(int (*threadfn)(void *data), void *data, const char namefmt[], ...) //calls clone to create the new process, which stays not runnable until it is woken up
    • 运行:
      wake_up_process()
  • 进程终结:
    do_exit()

    • call exit_notify() 向父进程发送信号,为子进程寻找“养父”,养父为进程组中的其他进程或init进程。
    • call schedule, 置当前进程为EXIT_ZOMBIE退出状态,此时系统还保留有进程的内存(内核栈,thread_info, task_struct),
      最终由父进程检索并向内核上报此无用信息后,此部分内存才被释放。
  • 孤儿进程:
    在进程终结前,必须为其子进程找好养父,否则其子进程在退出时处于ZOMBIE状态,但不能释放内存,成为孤儿进程。

进程调度

System Call

#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

Appendix:

  • Task struct :
    struct task_struct {
    volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
    void *stack;
    atomic_t usage;
    unsigned int flags; /* per process flags, defined below */
    unsigned int ptrace;

#ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
unsigned int wakee_flips;
unsigned long wakee_flip_decay_ts;
struct task_struct *last_wakee;

 int wake_cpu;

#endif
int on_rq;

 int prio, static_prio, normal_prio;
 unsigned int rt_priority;
 const struct sched_class *sched_class;
 struct sched_entity se;
 struct sched_rt_entity rt;

#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
struct sched_dl_entity dl;

#ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
#endif

#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif

 unsigned int policy;
 int nr_cpus_allowed;
 cpumask_t cpus_allowed;

#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
union rcu_special rcu_read_unlock_special;
struct list_head rcu_node_entry;
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
unsigned long rcu_tasks_nvcsw;
bool rcu_tasks_holdout;
struct list_head rcu_tasks_holdout_list;
int rcu_tasks_idle_cpu;
#endif /* #ifdef CONFIG_TASKS_RCU */

#ifdef CONFIG_SCHED_INFO
struct sched_info sched_info;
#endif

 struct list_head tasks;

#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
struct rb_node pushable_dl_tasks;
#endif

 struct mm_struct *mm, *active_mm;
 /* per-thread vma caching */
 u32 vmacache_seqnum;
 struct vm_area_struct *vmacache[VMACACHE_SIZE];

#if defined(SPLIT_RSS_COUNTING)
struct task_rss_stat rss_stat;
#endif
/* task state */
int exit_state;
int exit_code, exit_signal;
int pdeath_signal; /* The signal sent when the parent dies */
unsigned long jobctl; /* JOBCTL_*, siglock protected */

 /* Used for emulating ABI behavior of previous Linux versions */
 unsigned int personality;

 unsigned in_execve:1;     /* Tell the LSMs that the process is doing an
                * execve */
 unsigned in_iowait:1;

 /* Revert to default priority/policy when forking */
 unsigned sched_reset_on_fork:1;
 unsigned sched_contributes_to_load:1;
 unsigned sched_migrated:1;

#ifdef CONFIG_MEMCG_KMEM
unsigned memcg_kmem_skip_account:1;
#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif

 unsigned long atomic_flags; /* Flags needing atomic access. */

 struct restart_block restart_block;

 pid_t pid;
 pid_t tgid;

#ifdef CONFIG_CC_STACKPROTECTOR
/* Canary value for the -fstack-protector gcc feature */
unsigned long stack_canary;
#endif
/*
 * pointers to (original) parent process, youngest child, younger sibling,
 * older sibling, respectively. (p->father can be replaced with
 * p->real_parent->pid)
 */
struct task_struct __rcu *real_parent; /* real parent process */
struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
/*
 * children/sibling forms the list of my natural children
 */
struct list_head children; /* list of my children */
struct list_head sibling; /* linkage in my parent's children list */
struct task_struct *group_leader; /* threadgroup leader */

 /*
 * ptraced is the list of tasks this task is using ptrace on.
 * This includes both natural children and PTRACE_ATTACH targets.
 * p->ptrace_entry is p's link on the p->parent->ptraced list.
 */
 struct list_head ptraced;
 struct list_head ptrace_entry;

 /* PID/PID hash table linkage. */
 struct pid_link pids[PIDTYPE_MAX];
 struct list_head thread_group;
 struct list_head thread_node;

 struct completion *vfork_done;          /* for vfork() */
 int __user *set_child_tid;          /* CLONE_CHILD_SETTID */
 int __user *clear_child_tid;          /* CLONE_CHILD_CLEARTID */

 cputime_t utime, stime, utimescaled, stimescaled;
 cputime_t gtime;
 struct prev_cputime prev_cputime;

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqlock_t vtime_seqlock;
unsigned long long vtime_snap;
enum {
VTIME_SLEEPING = 0,
VTIME_USER,
VTIME_SYS,
} vtime_snap_whence;
#endif
unsigned long nvcsw, nivcsw; /* context switch counts */
u64 start_time; /* monotonic time in nsec */
u64 real_start_time; /* boot based time in nsec */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt;

 struct task_cputime cputime_expires;
 struct list_head cpu_timers[3];

/* process credentials */
const struct cred __rcu *real_cred; /* objective and real subjective task
                                     * credentials (COW) */
const struct cred __rcu *cred; /* effective (overridable) subjective task
                                * credentials (COW) */
char comm[TASK_COMM_LEN]; /* executable name excluding path
                           * - access with [gs]et_task_comm (which lock
                           *   it with task_lock())
                           * - initialized normally by setup_new_exec */
/* file system info */
struct nameidata *nameidata;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
struct sysv_sem sysvsem;
struct sysv_shm sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
unsigned long last_switch_count;
#endif
/* filesystem information */
struct fs_struct *fs;
/* open file information */
struct files_struct *files;
/* namespaces */
struct nsproxy *nsproxy;
/* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand;

 sigset_t blocked, real_blocked;
 sigset_t saved_sigmask;     /* restored if set_restore_sigmask() was used */
 struct sigpending pending;

 unsigned long sas_ss_sp;
 size_t sas_ss_size;
 int (*notifier)(void *priv);
 void *notifier_data;
 sigset_t *notifier_mask;
 struct callback_head *task_works;

 struct audit_context *audit_context;

#ifdef CONFIG_AUDITSYSCALL
kuid_t loginuid;
unsigned int sessionid;
#endif
struct seccomp seccomp;

/* Thread group tracking */
u32 parent_exec_id;
u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
 * mempolicy */
    spinlock_t alloc_lock;

    /* Protection of the PI data structures: */
    raw_spinlock_t pi_lock;

    struct wake_q_node wake_q;

#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task */
struct rb_root pi_waiters;
struct rb_node *pi_waiters_leftmost;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
/* mutex deadlock detection */
struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
unsigned int irq_events;
unsigned long hardirq_enable_ip;
unsigned long hardirq_disable_ip;
unsigned int hardirq_enable_event;
unsigned int hardirq_disable_event;
int hardirqs_enabled;
int hardirq_context;
unsigned long softirq_disable_ip;
unsigned long softirq_enable_ip;
unsigned int softirq_disable_event;
unsigned int softirq_enable_event;
int softirqs_enabled;
int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP

# define MAX_LOCK_DEPTH 48UL

 u64 curr_chain_key;
 int lockdep_depth;
 unsigned int lockdep_recursion;
 struct held_lock held_locks[MAX_LOCK_DEPTH];
 gfp_t lockdep_reclaim_gfp;

#endif

/* journalling filesystem info */
void *journal_info;

/* stacked block device info */
struct bio_list *bio_list;

#ifdef CONFIG_BLOCK
/* stack plugging */
struct blk_plug *plug;
#endif

/* VM state */
struct reclaim_state *reclaim_state;

 struct backing_dev_info *backing_dev_info;

 struct io_context *io_context;

 unsigned long ptrace_message;
 siginfo_t *last_siginfo; /* For ptrace use.  */
 struct task_io_accounting ioac;

#if defined(CONFIG_TASK_XACCT)
u64 acct_rss_mem1; /* accumulated rss usage */
u64 acct_vm_mem1; /* accumulated virtual memory usage */
cputime_t acct_timexpd; /* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
nodemask_t mems_allowed; /* Protected by alloc_lock */
seqcount_t mems_allowed_seq; /* Sequence no to catch updates */
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock */
struct css_set __rcu *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock */
struct list_head cg_list;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
#endif
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
unsigned long preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy; /* Protected by alloc_lock */
short il_next;
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
int numa_scan_seq;
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
int numa_preferred_nid;
unsigned long numa_migrate_retry;
u64 node_stamp; /* migration stamp */
u64 last_task_numa_placement;
u64 last_sum_exec_runtime;
struct callback_head numa_work;

 struct list_head numa_entry;
 struct numa_group *numa_group;

 /*
 * numa_faults is an array split into four regions:
 * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
 * in this precise order.
 *
 * faults_memory: Exponential decaying average of faults on a per-node
 * basis. Scheduling placement decisions are made based on these
 * counts. The values remain static for the duration of a PTE scan.
 * faults_cpu: Track the nodes the process was running on when a NUMA
 * hinting fault was incurred.
 * faults_memory_buffer and faults_cpu_buffer: Record faults per node
 * during the current scan window. When the scan completes, the counts
 * in faults_memory and faults_cpu decay and these values are copied.
 */
 unsigned long *numa_faults;
 unsigned long total_numa_faults;

 /*
 * numa_faults_locality tracks if faults recorded during the last
 * scan window were remote/local or failed to migrate. The task scan
 * period is adapted based on the locality of the faults with different
 * weights depending on whether they were shared or private faults
 */
 unsigned long numa_faults_locality[3];

 unsigned long numa_pages_migrated;

#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
struct tlbflush_unmap_batch tlb_ubc;
#endif

 struct rcu_head rcu;

 /*
 * cache last used pipe for splice
 */
 struct pipe_inode_info *splice_pipe;

 struct page_frag task_frag;

#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
#endif
/*
 * when (nr_dirtied >= nr_dirtied_pause), it's time to call
 * balance_dirty_pages() for some dirty throttling pause
 */
int nr_dirtied;
int nr_dirtied_pause;
unsigned long dirty_paused_when; /* start of a write-and-pause period */

#ifdef CONFIG_LATENCYTOP
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
#endif
/*
* time slack values; these are used to round up poll() and
* select() etc timeout values. These are in nanoseconds.
*/
unsigned long timer_slack_ns;
unsigned long default_timer_slack_ns;

#ifdef CONFIG_KASAN
unsigned int kasan_depth;
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* Index of current stored address in ret_stack */
int curr_ret_stack;
/* Stack of return addresses for return function tracing */
struct ftrace_ret_stack *ret_stack;
/* time stamp for last schedule */
unsigned long long ftrace_timestamp;
/*
 * Number of functions that haven't been traced
 * because of depth overrun.
 */
atomic_t trace_overrun;
/* Pause for the tracing */
atomic_t tracing_graph_pause;
#endif
#ifdef CONFIG_TRACING
/* state flags for use by tracers */
unsigned long trace;
/* bitmask and counter of trace recursion */
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_MEMCG
struct memcg_oom_info {
struct mem_cgroup *memcg;
gfp_t gfp_mask;
int order;
unsigned int may_oom:1;
} memcg_oom;
#endif
#ifdef CONFIG_UPROBES
struct uprobe_task *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
unsigned int sequential_io;
unsigned int sequential_io_avg;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
unsigned long task_state_change;
#endif
int pagefault_disabled;
/* CPU-specific state of this task */
struct thread_struct thread;
/*
 * WARNING: on x86, 'thread_struct' contains a variable-sized
 * structure. It MUST be at the end of 'task_struct'.
 *
 * Do not put anything below here!
 */
    };

猜你喜欢

转载自blog.csdn.net/huntershuai/article/details/82893391