fork

fork

sys.h - tools/include/nolibc/sys.h - Linux source code v6.8 - Bootlin Elixir Cross Referencer

1
2
3
4
// tools/include/nolibc/sys.h
/*
 * fork() - user-space wrapper for the fork system call.
 *
 * Calls sys_fork() and converts its result with __sysret().
 *
 * NOTE(review): sys_fork() here is presumably nolibc's own raw-syscall
 * helper in user space, not the kernel-side SYSCALL_DEFINE0(fork) handler;
 * confirm against tools/include/nolibc/sys.h.
 */
pid_t fork(void){
return __sysret(sys_fork());
}

__sysret

该宏先把arg的值保存到一个同类型的局部变量中(保证arg只被求值一次),然后判断这个值:如果小于零,则通过SET_ERRNO尝试将errno设置为-arg,并返回-1;否则直接返回arg。
sys.h - tools/include/nolibc/sys.h - Linux source code v6.8 - Bootlin Elixir Cross Referencer

1
2
3
4
5
6
7
/*
 * __sysret(arg) - map a raw syscall result onto the "-1 and errno" convention.
 *
 * arg is evaluated exactly once into a local variable of the same type.
 * If that value is negative, SET_ERRNO() is invoked with its negation and
 * the whole expression yields -1; otherwise it yields the value unchanged.
 *
 * Implemented with a statement expression and __typeof__, both GNU C
 * extensions.
 */
#define __sysret(arg)							\
({ \
__typeof__(arg) __sysret_arg = (arg); \
(__sysret_arg < 0) /* error ? */ \
? (({ SET_ERRNO(-__sysret_arg); }), -1) /* ret -1 with errno = -arg */ \
: __sysret_arg; /* return original value */ \
})

SET_ERRNO

errno.h - tools/include/nolibc/errno.h - Linux source code v6.8 - Bootlin Elixir Cross Referencer

1
2
3
4
5
6
7
/*
 * errno support.
 *
 * Unless NOLIBC_IGNORE_ERRNO is defined, SET_ERRNO(v) assigns v to the
 * global errno, which is declared here as a weak symbol.
 * When NOLIBC_IGNORE_ERRNO is defined, SET_ERRNO() expands to an empty
 * statement (its argument is not evaluated) and errno is not declared.
 */
#ifndef NOLIBC_IGNORE_ERRNO
#define SET_ERRNO(v) do { errno = (v); } while (0)
int errno __attribute__((weak));
#else
#define SET_ERRNO(v) do { } while (0)
#endif

关于errno,参见:Linux errno详解 - Jimmy_Nie - 博客园

sys_fork

fork.c - kernel/fork.c - Linux source code v6.8 - Bootlin Elixir Cross Referencer

1
2
3
4
5
6
7
8
9
10
11
12
13
/*
 * fork system call handler.
 *
 * With CONFIG_MMU: builds a kernel_clone_args in which only exit_signal is
 * set explicitly (to SIGCHLD); all other members are zero-initialized by the
 * designated initializer. Returns whatever kernel_clone() returns.
 *
 * Without CONFIG_MMU: always returns -EINVAL.
 */
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
struct kernel_clone_args args = {
.exit_signal = SIGCHLD,
};

return kernel_clone(&args);
#else
/* can not support in nommu mode */
return -EINVAL;
#endif
}

struct kernel_clone_args

task.h - include/linux/sched/task.h - Linux source code v6.8 - Bootlin Elixir Cross Referencer

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*
 * Argument bundle describing a fork/clone request. A pointer to it is handed
 * to kernel_clone(), which forwards it to copy_process().
 *
 * Only the members whose use is visible in kernel_clone() are annotated
 * below; the remaining members are consumed elsewhere.
 */
struct kernel_clone_args {
u64 flags;			/* CLONE_* bits; kernel_clone() tests CLONE_PIDFD, CLONE_PARENT_SETTID, CLONE_UNTRACED, CLONE_VFORK, CLONE_VM */
int __user *pidfd;		/* user pointer; must differ from parent_tid when CLONE_PIDFD and CLONE_PARENT_SETTID are both set */
int __user *child_tid;
int __user *parent_tid;		/* user pointer; receives the new pid number when CLONE_PARENT_SETTID is set */
const char *name;
int exit_signal;		/* SIGCHLD for fork(); any other value makes kernel_clone() pick PTRACE_EVENT_CLONE */
u32 kthread:1;
u32 io_thread:1;
u32 user_worker:1;
u32 no_files:1;
unsigned long stack;
unsigned long stack_size;
unsigned long tls;
pid_t *set_tid;
/* Number of elements in *set_tid */
size_t set_tid_size;
int cgroup;
int idle;
int (*fn)(void *);
void *fn_arg;
struct cgroup *cgrp;
struct css_set *cset;
};

kernel_clone

fork.c - kernel/fork.c - Linux source code v6.8 - Bootlin Elixir Cross Referencer

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
/*
 * Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * args->exit_signal is expected to be checked for sanity by the caller.
 *
 * Return: the new task's pid number as given by pid_vnr() on success;
 * -EINVAL if CLONE_PIDFD and CLONE_PARENT_SETTID are both set with pidfd and
 * parent_tid pointing at the same location; otherwise the error encoded in
 * the pointer returned by copy_process().
 */
pid_t kernel_clone(struct kernel_clone_args *args)
{
u64 clone_flags = args->flags;
struct completion vfork;
struct pid *pid;
struct task_struct *p;
int trace = 0;
pid_t nr;

/*
* For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
* to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
* mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
* field in struct clone_args and it still doesn't make sense to have
* them both point at the same memory location. Performing this check
* here has the advantage that we don't need to have a separate helper
* to check for legacy clone().
*/
if ((args->flags & CLONE_PIDFD) &&
(args->flags & CLONE_PARENT_SETTID) &&
(args->pidfd == args->parent_tid))
return -EINVAL;

/*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
* requested, no event is reported; otherwise, report if the event
* for the type of forking is enabled.
*/
if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if (args->exit_signal != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;

/* Drop the event again unless it is enabled for the current task. */
if (likely(!ptrace_event_enabled(current, trace)))
trace = 0;
}

/* Create the new task; failure is reported as an error-encoded pointer. */
p = copy_process(NULL, trace, NUMA_NO_NODE, args);
add_latent_entropy();

if (IS_ERR(p))
return PTR_ERR(p);

/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
*/
trace_sched_process_fork(current, p);

pid = get_task_pid(p, PIDTYPE_PID);
nr = pid_vnr(pid);

/* Report the new pid number to user space if requested. */
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, args->parent_tid);

/* For vfork, arm the on-stack completion that is waited on below. */
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
get_task_struct(p);
}

if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
/* lock the task to synchronize with memcg migration */
task_lock(p);
lru_gen_add_mm(p->mm);
task_unlock(p);
}

wake_up_new_task(p);

/* forking complete and child started to run, tell ptracer */
if (unlikely(trace))
ptrace_event_pid(trace, pid);

if (clone_flags & CLONE_VFORK) {
if (!wait_for_vfork_done(p, &vfork))
ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}

/* NOTE(review): presumably balances the get_task_pid() above - confirm. */
put_pid(pid);
return nr;
}

copy_process

fork.c - kernel/fork.c - Linux source code v6.8 - Bootlin Elixir Cross Referencer

1
2
3
4
5
6
__latent_entropy struct task_struct *copy_process(
struct pid *pid,
int trace,
int node,
struct kernel_clone_args *args)
{

这个函数非常长,自己去浏览器看。主要的函数是

1
p = dup_task_struct(current, node);

current :
current.h - include/asm-generic/current.h - Linux source code v6.8 - Bootlin Elixir Cross Referencer

1
2
/*
 * get_current() yields the task member of the thread_info returned by
 * current_thread_info(); `current` is shorthand for get_current().
 */
#define get_current() (current_thread_info()->task)
#define current get_current()

再往下就没办法深究了:
current_thread_info()的定义
只能知道它返回了thread_info指针:

1
2
3
4
5
/*
 * Return the thread_info pointer derived from the current stack pointer.
 *
 * Reads the "sp" register and clears its low bits with ~(THREAD_SIZE - 1),
 * then returns the resulting address cast to struct thread_info *.
 *
 * NOTE(review): this relies on THREAD_SIZE being a power of two and on
 * thread_info living at the THREAD_SIZE-aligned base of the stack; this is
 * an architecture-specific variant - confirm for the target architecture.
 */
static inline __attribute_const__ struct thread_info *current_thread_info(void)
{
register unsigned long sp asm("sp");
return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
}

thread_info.h - arch/arc/include/asm/thread_info.h - Linux source code v6.8 - Bootlin Elixir Cross Referencer

dup_task_struct

fork.c - kernel/fork.c - Linux source code v6.8 - Bootlin Elixir Cross Referencer
该函数为新进程分配task_struct,并从父进程(orig)复制thread_info等信息,下面是节选(...表示省略的代码):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
/*
 * dup_task_struct() - abridged excerpt; a line containing only "..." marks
 * code that has been omitted.
 *
 * Visible steps: allocate a task_struct on @node, account its kernel stack,
 * set up its thread stack from @orig, then reset a series of per-task fields
 * (many of them only under the corresponding CONFIG_ option).
 *
 * Return: the new task_struct, or NULL after the free_stack/free_tsk unwind
 * path. The jumps to those labels are presumably in the omitted code.
 */
static struct task_struct *dup_task_struct(struct task_struct *orig, int node){
...
tsk = alloc_task_struct_node(node);
...
/* account = 1: add this task's kernel stack to the stack statistics. */
account_kernel_stack(tsk, 1);
...
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
set_task_stack_end_magic(tsk);
clear_syscall_work_syscall_user_dispatch(tsk);
#ifdef CONFIG_STACKPROTECTOR
tsk->stack_canary = get_random_canary();
#endif
/*
 * If the original's cpus_ptr refers to its own embedded cpus_mask, make the
 * new task's cpus_ptr refer to the new task's own mask as well.
 */
if (orig->cpus_ptr == &orig->cpus_mask)
tsk->cpus_ptr = &tsk->cpus_mask;
dup_user_cpus_ptr(tsk, orig, node);

/*
* One for the user space visible state that goes away when reaped.
* One for the scheduler.
*/
refcount_set(&tsk->rcu_users, 2);
/* One for the rcu users */
refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
tsk->worker_private = NULL;

kcov_task_init(tsk);
kmsan_task_create(tsk);
kmap_local_fork(tsk);

#ifdef CONFIG_FAULT_INJECTION
tsk->fail_nth = 0;
#endif

#ifdef CONFIG_BLK_CGROUP
tsk->throttle_disk = NULL;
tsk->use_memdelay = 0;
#endif

#ifdef CONFIG_ARCH_HAS_CPU_PASID
tsk->pasid_activated = 0;
#endif

#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
#endif

#ifdef CONFIG_CPU_SUP_INTEL
tsk->reported_split_lock = 0;
#endif

#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid = -1;
tsk->last_mm_cid = -1;
tsk->mm_cid_active = 0;
tsk->migrate_from_cpu = -1;
#endif
return tsk;

/* Error unwind: undo stack accounting and free the stack, then the task. */
free_stack:
exit_task_stack_account(tsk);
free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
return NULL;
}

当然里面还是嵌套了很多层:

account_kernel_stack

fork.c - kernel/fork.c - Linux source code v6.13.5 - Bootlin Elixir Cross Referencer
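这个函数分别处理了开启和未开启CONFIG_VMAP_STACK的两种情况。开启时,该函数通过task_stack_vm_area(tsk)取得tsk内核栈对应的vm_struct(并不是复制父进程的),然后对栈的每一个页调用mod_lruvec_page_state;未开启时,则对整个栈调用一次mod_lruvec_kmem_state。其中mod_lruvec_page_state其实只是给参数二指定的统计项(这里是NR_KERNEL_STACK_KB)加上一个增量,增量为参数三。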

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/*
 * Adjust the NR_KERNEL_STACK_KB statistic for @tsk's kernel stack.
 *
 * @account is a multiplier applied to the stack size in KiB; the caller
 * dup_task_struct() passes 1.
 *
 * With CONFIG_VMAP_STACK the stack's vm area is looked up and each of its
 * THREAD_SIZE / PAGE_SIZE pages is accounted separately, PAGE_SIZE / 1024
 * KiB at a time. Otherwise the whole stack (THREAD_SIZE / 1024 KiB) is
 * accounted in a single call.
 */
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
struct vm_struct *vm = task_stack_vm_area(tsk);
int i;

for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
account * (PAGE_SIZE / 1024));
} else {
void *stack = task_stack_page(tsk);

/* All stack pages are in the same node. */
mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
account * (THREAD_SIZE / 1024));
}
}

参数二指定要更新的统计项。从源代码来看,统计项只分为两类:以字节为单位的,和不以字节为单位的,因为vmstat_item_in_bytes返回的就是该统计项是否以字节为单位。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * Add @delta to the node statistic @item of @pgdat.
 *
 * For items that vmstat_item_in_bytes() reports as byte-based, @delta is
 * expected to be a multiple of PAGE_SIZE (a one-time warning fires
 * otherwise) and is converted to pages by shifting right by PAGE_SHIFT
 * before being added.
 */
static inline void __mod_node_page_state(struct pglist_data *pgdat,
enum node_stat_item item, int delta)
{
if (vmstat_item_in_bytes(item)) {
/*
* Only cgroups use subpage accounting right now; at
* the global level, these items still change in
* multiples of whole pages. Store them as pages
* internally to keep the per-cpu counters compact.
*/
VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
delta >>= PAGE_SHIFT;
}

node_page_state_add(delta, pgdat, item);
}
setup_thread_stack

该函数复制了thread_info
task_stack.h - include/linux/sched/task_stack.h - Linux source code v6.13.5 - Bootlin Elixir Cross Referencer

1
2
3
4
5
/*
 * Initialize @p's thread_info as a copy of @org's.
 *
 * The whole structure is copied by assignment, then the task member of the
 * copy is redirected to @p so it no longer refers to @org.
 */
static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
{
*task_thread_info(p) = *task_thread_info(org);
task_thread_info(p)->task = p;
}

__latent_entropy

Fetching Title#73pr
latent_entropy是由GCC插件提供的一个属性,它只能作用于函数和变量。如果它作用于一个函数,那么插件将对该函数进行插桩(instrument);如果它作用于一个变量,那么插件将用一个随机值初始化它。变量必须是整型、整型数组类型或具有整型字段的结构体。

SYSCALL_DEFINE0

syscalls.h - include/linux/syscalls.h - Linux source code v6.8 - Bootlin Elixir Cross Referencer

1
2
3
4
5
/*
 * SYSCALL_DEFINE0(sname) - open the definition of a zero-argument syscall.
 *
 * Expands to: a SYSCALL_METADATA() entry for _<sname> with 0 arguments, a
 * prototype for `asmlinkage long sys_<sname>(void)`, an
 * ALLOW_ERROR_INJECTION() annotation for that function, and finally the
 * function header itself with no trailing semicolon, so that the braces
 * following the macro invocation become the body of sys_<sname>().
 */
#define SYSCALL_DEFINE0(sname)					\
SYSCALL_METADATA(_##sname, 0); \
asmlinkage long sys_##sname(void); \
ALLOW_ERROR_INJECTION(sys_##sname, ERRNO); \
asmlinkage long sys_##sname(void)

fork
http://hexo.zhywyt.me/posts/fe00f6f7dd14/
作者
zhywyt
发布于
2025年3月17日
更新于
2025年4月23日
许可协议