xfocus logo xfocus title
首页 焦点原创 安全文摘 安全工具 安全漏洞 焦点项目 焦点论坛 关于我们
添加文章 Xcon English Version

Linux2.6内核进程创建过程分析


创建时间:2007-10-18
文章属性:原创
文章提交:pr0cess (pr0cess_at_cnbct.org)

Linux2.6内核进程创建过程分析
/*Kernel version: linux-2.6.22.9
*作者:旋木木
*日期:2007/10/17
*E-mail:xuanmumu@gmail.com
*/
Fork的系统调用代码在linux/arch/i386/kernel/process.c中:
asmlinkage int sys_fork(struct pt_regs regs)
{
    return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
}
Sys_fork系统调用通过 do_fork()函数实现,通过对do_fork()函数传递不同的clone_flags来实现fork,clone,vfork。
Syn_clone和syn_vfork的系统调用代码如下:
asmlinkage int sys_clone(struct pt_regs regs)
{
    unsigned long clone_flags;
    unsigned long newsp;
    int __user *parent_tidptr, *child_tidptr;

    clone_flags = regs.ebx;
    newsp = regs.ecx;
    parent_tidptr = (int __user *)regs.edx;
    child_tidptr = (int __user *)regs.edi;
    if (!newsp)
        newsp = regs.esp;
    return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
}
asmlinkage int sys_vfork(struct pt_regs regs)
{
    return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
}
其中clone_flas在include\linux\sched.h中定义
/*
* cloning flags:
*/
#define CSIGNAL        0x000000ff    /* 进程退出时需要传递的信号*/
#define CLONE_VM    0x00000100    /* 父子进程共享地址空间 */
#define CLONE_FS    0x00000200    /* 父子进程共享文件系统信息 */
#define CLONE_FILES    0x00000400    /* 父子进程共享已打开的文件 */
#define CLONE_SIGHAND    0x00000800    /* 父子进程共享信号处理 */
#define CLONE_PTRACE    0x00002000    /* 继续调试子进程 */
#define CLONE_VFORK    0x00004000    /* 调用vfork(),父进程休眠*/
#define CLONE_PARENT    0x00008000    /* 设置一个共有的父进程 */
#define CLONE_THREAD    0x00010000    /* 父子进程在同一个线程组 */
#define CLONE_NEWNS    0x00020000    /* 为子进程创建一个新的命名空间 */
#define CLONE_SYSVSEM    0x00040000    /* 父子进程共享system V SEM_UNDO */
#define CLONE_SETTLS    0x00080000    /* 为子进程创建新的TLS */
#define CLONE_PARENT_SETTID    0x00100000    /* 设置父进程TID */
#define CLONE_CHILD_CLEARTID    0x00200000    /* 清除子进程TID */
#define CLONE_DETACHED        0x00400000    /* Unused, ignored */
#define CLONE_UNTRACED        0x00800000    /* 不允许调试子进程 */
#define CLONE_CHILD_SETTID    0x01000000    /* 设置子进程TID */
#define CLONE_STOPPED        0x02000000    /* 设置进程停止状态 */
#define CLONE_NEWUTS        0x04000000    /* 创建新的utsname组 */
#define CLONE_NEWIPC        0x08000000    /* 创建新的IPC */
Do_fork()在kernel/fork.c中定义,代码如下:

/*
*  Ok, this is the main fork-routine.
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*/
long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          struct pt_regs *regs,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    struct task_struct *p;
    int trace = 0;
    struct pid *pid = alloc_pid();
    long nr;
    if (!pid)
        return -EAGAIN;
    nr = pid->nr;
    if (unlikely(current->ptrace)) {
        trace = fork_traceflag (clone_flags);
        if (trace)
            clone_flags |= CLONE_PTRACE;
    }
    p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    if (!IS_ERR(p)) {
        struct completion vfork;

        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
        }

        if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
            /*
             * We'll start up with an immediate SIGSTOP.
             */
            sigaddset(&p->pending.signal, SIGSTOP);
            set_tsk_thread_flag(p, TIF_SIGPENDING);
        }

        if (!(clone_flags & CLONE_STOPPED))
            wake_up_new_task(p, clone_flags);
        else
            p->state = TASK_STOPPED;

        if (unlikely (trace)) {
            current->ptrace_message = nr;
            ptrace_notify ((trace << 8) | SIGTRAP);
        }

        if (clone_flags & CLONE_VFORK) {
            freezer_do_not_count();
            wait_for_completion(&vfork);
            freezer_count();
            if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
                current->ptrace_message = nr;
                ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
            }
        }
    } else {
        free_pid(pid);
        nr = PTR_ERR(p);
    }
    return nr;
}
Do_fork()函数的核心是copy_process()函数,该函数完成了进程创建的绝大部分工作并且也在fork.c定义,copy_process函数较长,逐段往下看:
static struct task_struct *copy_process(unsigned long clone_flags,
                    unsigned long stack_start,
                    struct pt_regs *regs,
                    unsigned long stack_size,
                    int __user *parent_tidptr,
                    int __user *child_tidptr,
                    struct pid *pid)
{
    int retval;
    struct task_struct *p = NULL;

    if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
        return ERR_PTR(-EINVAL);

    /*
     * Thread groups must share signals as well, and detached threads
     * can only be started up within the thread group.
     */
    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
        return ERR_PTR(-EINVAL);

    /*
     * Shared signal handlers imply shared VM. By way of the above,
     * thread groups also imply shared VM. Blocking this case allows
     * for various simplifications in other code.
     */
    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
        return ERR_PTR(-EINVAL);

    retval = security_task_create(clone_flags);
    if (retval)
        goto fork_out;

    retval = -ENOMEM;
    p = dup_task_struct(current);
    if (!p)
        goto fork_out;

    rt_mutex_init_task(p);

#ifdef CONFIG_TRACE_IRQFLAGS
    DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
    DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
这段代码首先对传入的clone_flag进行检查,接着调用了dup_task_struct()函数,该函数的主要作用是:为子进程创建一个新的内核栈,复制task_struct结构和thread_info结构,这里只是对结构完整的复制,所以子进程的进程描述符跟父进程完全一样。跟进dup_task_struct()函数看代码:
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
    struct task_struct *tsk;
    struct thread_info *ti;

    prepare_to_copy(orig);

    tsk = alloc_task_struct();
    if (!tsk)
        return NULL;

    ti = alloc_thread_info(tsk);
    if (!ti) {
        free_task_struct(tsk);
        return NULL;
    }

    *tsk = *orig;
    tsk->stack = ti;
    setup_thread_stack(tsk, orig);

#ifdef CONFIG_CC_STACKPROTECTOR
    tsk->stack_canary = get_random_int();
#endif

    /* One for us, one for whoever does the "release_task()" (usually parent) */
    atomic_set(&tsk->usage,2);
    atomic_set(&tsk->fs_excl, 0);
#ifdef CONFIG_BLK_DEV_IO_TRACE
    tsk->btrace_seq = 0;
#endif
    tsk->splice_pipe = NULL;
    return tsk;
}
通过alloc_task_struct()函数创建内核栈和task_struct结构空间,alloc_task_struct()函数定义为
# define alloc_task_struct()    kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
接着分配thread_info结构空间
ti = alloc_thread_info(tsk);
thread_info结构定义在asm/thread_info.h中
struct thread_info {
        struct task_struct    *task;
        struct exec_domain    *exec_domain;
        unsigned long         flags;
        unsigned long         status;
        __u32                 cpu;
        __s32                 preempt_count;
        mm_segment_t          addr_limit;
        struct restart_block  restart_block;
        unsigned long         previous_esp;
        __u8                  supervisor_stack[0];
};
继续
*tsk = *orig;
为整个task_struct结构复制
再调用setup_thread_stack()函数为thread_info结构复制
static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
{
    *task_thread_info(p) = *task_thread_info(org);
    task_thread_info(p)->task = p;
}
其中
task_thread_info(p)->task = p;
thread_info结构中的task成员中存放的是指向当前进程task_struct结构的指针。
回到copy_process()函数,继续看:
    if (atomic_read(&p->user->processes) >=
            p->signal-> rlim[RLIMIT_NPROC].rlim_cur) {
        if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
                p->user != &root_user)
            goto bad_fork_free;
    }

    atomic_inc(&p->user->__count);
    atomic_inc(&p->user->processes);
    get_group_info(p->group_info);
首先看前面的两个if,第一个if里面的rlim数组包含在task_sturct数组中。对进程占用的资源数做出限制,rlim[RLIMIT_NPROC]限制了改进程用户可以拥有的总进程数量,如果当前用户所拥有的进程数量超过了规定的最大拥有进程数量,在2.4内核中就直接goto bad_fork_free了。第2个if使用了capable()函数来对权限做出检查,检查是否有权对指定的资源进行操作,该函数返回0则代表无权操作。该函数的定义在linux/capability.h中,其中包含了与之相对应的权限列表。
在task_struct结构中有一个指针user,该指针指向一个user_struct结构,一个用户的多个进程可以通过user指针共享该用户的资源信息,该结构定义在include/linux/sched.h中:
struct user_struct {
    atomic_t __count;    /*统计用户拥有进程数量的计数器 */
    atomic_t processes;    /*统计用户拥有进程数 */
    atomic_t files;        /* 统计用户打开的文件数 */
    atomic_t sigpending;    /* 统计用户拥有的信号 */
#ifdef CONFIG_INOTIFY_USER
    atomic_t inotify_watches; /* How many inotify watches does this user have? */
    atomic_t inotify_devs;    /* How many inotify devs does this user have opened? */
#endif
    /* protected by mq_lock    */
    unsigned long mq_bytes;    /* How many bytes can be allocated to mqueue? */
    unsigned long locked_shm; /* How many pages of mlocked shm ? */

#ifdef CONFIG_KEYS
    struct key *uid_keyring;    /* UID specific keyring */
    struct key *session_keyring;    /* UID's default session keyring */
#endif

    /* Hash table maintenance information */
    struct list_head uidhash_list;
    uid_t uid;
};
既然新创建了一个进程,自然要更新该用户的user_struct结构,累加相应的计数器,这个工作就由atomic_inc()函数完成,atomic_inc函数定义在include/asm-blackfin/atomic.h中:
static __inline__ void atomic_inc(volatile atomic_t * v)
{
    long flags;

    local_irq_save(flags);
    v->counter++;
    local_irq_restore(flags);
}
函数保存当前各成员的标记,然后进行累加,最后更新各成员,完成累加计数器的操作。
继续看copy_process函数的代码:
if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;

    if (!try_module_get(task_thread_info(p)->exec_domain->module))
        goto bad_fork_cleanup_count;

    if (p->binfmt && !try_module_get(p->binfmt->module))
        goto bad_fork_cleanup_put_domain;

    p->did_exec = 0;
    delayacct_tsk_init(p);    /* Must remain after dup_task_struct() */
    copy_flags(clone_flags, p);
    p->pid = pid_nr(pid);
    retval = -EFAULT;
    if (clone_flags & CLONE_PARENT_SETTID)
        if (put_user(p->pid, parent_tidptr))
            goto bad_fork_cleanup_delays_binfmt;

    INIT_LIST_HEAD(&p->children);
    INIT_LIST_HEAD(&p->sibling);
    p->vfork_done = NULL;
    spin_lock_init(&p->alloc_lock);

    clear_tsk_thread_flag(p, TIF_SIGPENDING);
    init_sigpending(&p->pending);
代码段
if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;
检查创建的进程是否超过了系统进程总量
if (!try_module_get(task_thread_info(p)->exec_domain->module))
        goto bad_fork_cleanup_count;
获得进程执行域
if (p->binfmt && !try_module_get(p->binfmt->module))
        goto bad_fork_cleanup_put_domain;
不同进程所执行的程序的格式也不一样,系统对不同格式的支持通过动态安装驱动模块实现,task_struct结构中有一个指向linux_binfmt结构的指针,获得进程执行程序映象。
copy_flags(clone_flags, p);
调用copy_flags函数更新task_struct结构中flags成员。表明进程是否拥有超级用户权限的PF_SUPERPPRIV标志被清除,表明进程还没有exec()的PF_FORKNOEXEC被设置,相关实现代码也在fork..c中:
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
    unsigned long new_flags = p->flags;

    new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
    new_flags |= PF_FORKNOEXEC;
    if (!(clone_flags & CLONE_PTRACE))
        p->ptrace = 0;
    p->flags = new_flags;
}
接着p->pid = pid_nr(pid);获取一个PID
p->vfork_done = NULL;
vfork()在调用copy_process()时,task_struct结构的vfork_done成员被设置为NULL,在回到do_fork()执行时vfork_done会指向一个特殊的地址,这在do_fork中可以清楚的看到。
继续走下去:
p->utime = cputime_zero;
    p->stime = cputime_zero;
    p->sched_time = 0;
#ifdef CONFIG_TASK_XACCT
    p->rchar = 0;        /* I/O counter: bytes read */
    p->wchar = 0;        /* I/O counter: bytes written */
    p->syscr = 0;        /* I/O counter: read syscalls */
    p->syscw = 0;
…………………..
开始漫长的对子进程task_struct结构的初始化
…………………..
…………………..
…………………..
…………………..
继续
p->tgid = p->pid;
    if (clone_flags & CLONE_THREAD)
        p->tgid = current->tgid;
如果设置了同在一个线程组则继承TGID。对于普通进程来说TGID和PID相等,对于线程来说,同一线程组内的所有线程的TGID都相等,这使得这些多线程可以通过调用getpid()获得相同的PID。
又该继续了-_-….
if ((retval = security_task_alloc(p)))
        goto bad_fork_cleanup_policy;
    if ((retval = audit_alloc(p)))
        goto bad_fork_cleanup_security;
    /* copy all the process information */
    if ((retval = copy_semundo(clone_flags, p)))
        goto bad_fork_cleanup_audit;
    if ((retval = copy_files(clone_flags, p)))
        goto bad_fork_cleanup_semundo;
    if ((retval = copy_fs(clone_flags, p)))
        goto bad_fork_cleanup_files;
    if ((retval = copy_sighand(clone_flags, p)))
        goto bad_fork_cleanup_fs;
    if ((retval = copy_signal(clone_flags, p)))
        goto bad_fork_cleanup_sighand;
    if ((retval = copy_mm(clone_flags, p)))
        goto bad_fork_cleanup_signal;
    if ((retval = copy_keys(clone_flags, p)))
        goto bad_fork_cleanup_mm;
    if ((retval = copy_namespaces(clone_flags, p)))
        goto bad_fork_cleanup_keys;
    retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
    if (retval)
        goto bad_fork_cleanup_namespaces;
对task_struct结构的初始化完了就该继续copy其他的资源了,这部分调用的函数较多,基本都是在fork.c中定义的,比如copy_files():
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
    struct files_struct *oldf, *newf;
    int error = 0;

    /*
     * A background process may not have any files ...
     */
    oldf = current->files;
    if (!oldf)
        goto out;

    if (clone_flags & CLONE_FILES) {
        atomic_inc(&oldf->count);
        goto out;
    }

    /*
     * Note: we may be using current for both targets (See exec.c)
     * This works because we cache current->files (old) as oldf. Don't
     * break this.
     */
    tsk->files = NULL;
    newf = dup_fd(oldf, &error);
    if (!newf)
        goto out;

    tsk->files = newf;
    error = 0;
out:
    return error;
}
task_struct结构中有一个指针flies指向一个file_struct结构,因为是从当前进程复制到子进程,所以oldf = current->files;
然后
if (clone_flags & CLONE_FILES) {
        atomic_inc(&oldf->count);
        goto out;
    }
如果设置了CLONE_FILES,也就是CLONE_FILES=1,就只是共享,通过调用atomic_inc(这个函数之前说过了)增加共享计数,之前复制整个task_struct结构时,把flies指针也复制给子进程了,所以子进程可以通过指针共享file_sturct结构,不要忘记fork()函数调用传递的clone_flags都为0,既不是简单共享而是全部复制。
接着调用dup_fd函数来进行复制。庆幸的是该函数定义也在fork.c中,不幸的是该函数又是疯狂调用其他函数…
由于代码长不全部列举了,进入dup_fd函数中去:
newf = alloc_files();
调用了alloc_files(),跟进alloc_files()函数:
static struct files_struct *alloc_files(void)
{
    struct files_struct *newf;
    struct fdtable *fdt;

    newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
    if (!newf)
        goto out;

    atomic_set(&newf->count, 1);

    spin_lock_init(&newf->file_lock);
    newf->next_fd = 0;
    fdt = &newf->fdtab;
    fdt->max_fds = NR_OPEN_DEFAULT;
    fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
    fdt->open_fds = (fd_set *)&newf->open_fds_init;
    fdt->fd = &newf->fd_array[0];
    INIT_RCU_HEAD(&fdt->rcu);
    fdt->next = NULL;
    rcu_assign_pointer(newf->fdt, fdt);
out:
    return newf;
}
调用kmem_cache_alloc函数来为子进程分配一个file_struct结构,接着设置这个新的file_struct结构的count成员
fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
    fdt->open_fds = (fd_set *)&newf->open_fds_init;
    fdt->fd = &newf->fd_array[0];
这3个指针分别指向:位图close_on_exec_init,位图open_fds_init,数组fd_array[],这3个成员大小都是固定的。
出来后就开始进行copy,把oldf的内容copy到新创建的newf中。
中间继续复制其他资源,只有当clone_flags为0时才是真正的复制
Do_fork()之前调用了dup_task_struct函数分配了2个连续页面,低端存放task_strust结构,高端作为系统空间堆栈,由copy_thread来完成。该函数复制父进程的系统空间堆栈,堆栈中有完整路线指明父进程通过系统调用进入内核空间的过程,子进程退出时需要按照完整路线返回。
struct pt_regs * regs结构存放着进入内核空间前各寄存器的内容。如果完全复制父进程的系统空间堆栈则无法区分子进程和父进程,所以要对子进程的相关内容进行调整。
struct pt_regs * childregs;
    struct task_struct *tsk;
    int err;

    childregs = task_pt_regs(p);
    *childregs = *regs;
    childregs->eax = 0;
    childregs->esp = esp;
首先将eax设0,作为系统调用结束时的返回值
hildregs->esp = esp;
指出进程在用户态的堆栈地址,该值在fork()中为传递进去的regs.esp
p->thread.esp = (unsigned long) childregs;
    p->thread.esp0 = (unsigned long) (childregs+1);

    p->thread.eip = (unsigned long) ret_from_fork;

    savesegment(gs,p->thread.gs);
P指向的task_struct结构中有一个thread指针,指向一个thread_struct结构,里面记录着进程切换时的堆栈指针,在子进程中也需要进行调整
p->thread.esp = (unsigned long) childregs;
指向子进程的pt_regs结构起始地址
p->thread.esp0 = (unsigned long) (childregs+1);
指向子进程的系统空间栈顶,当进程被调度运行时,内核会将这个值写入esp0字段,标志该进程在ring0运行时的堆栈地址。
p->thread.eip = (unsigned long) ret_from_fork;
指向当进程下一次被切换运行时的入口处
savesegment(gs,p->thread.gs);
把当前段寄存器的gs的值保存在thread.gs中
p->parent_exec_id = p->self_exec_id;

    /* ok, now we should be set up.. */
    p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
    p->pdeath_signal = 0;
    p->exit_state = 0;
p->parent_exec_id = p->self_exec_id;
设置子进程的执行域
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
设置子进程退出时要象父进程发送的信号
最后将子进程连入进程队列等待被唤醒,再处理其他的一些收尾工作然后返回一个指向子进程的指针。
/*
*晕死了…
*/
回到do_fork()函数中
if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
        }
调用vfork
if (!(clone_flags & CLONE_STOPPED))
            wake_up_new_task(p, clone_flags);
        else
            p->state = TASK_STOPPED;
唤醒子进程并开始运行。
至此,一个进程创建就完成了。