diff options
author | Pavel Emelyanov <xemul@parallels.com> | 2012-01-12 20:20:27 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-01-12 23:13:11 -0500 |
commit | b8f566b04d3cddd192cfd2418ae6d54ac6353792 (patch) | |
tree | 32a5bf86548cd43feff4822d800b6a99e157b5d7 | |
parent | f5138e42211d4e8bfbd6ac5b3816348da1533433 (diff) |
sysctl: add the kernel.ns_last_pid control
The sysctl works on the current task's pid namespace, getting and setting
its last_pid field.
Writing is allowed for CAP_SYS_ADMIN-capable tasks thus making it possible
to create a task with desired pid value. This ability is required badly
for the checkpoint/restore in userspace.
This approach suits all the parties for now.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Documentation/sysctl/kernel.txt | 8 | ||||
-rw-r--r-- | kernel/pid.c | 4 | ||||
-rw-r--r-- | kernel/pid_namespace.c | 31 |
3 files changed, 42 insertions, 1 deletions
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 6d8cd8b2c30d..8c20fbd8b42d 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt | |||
@@ -415,6 +415,14 @@ PIDs of value pid_max or larger are not allocated. | |||
415 | 415 | ||
416 | ============================================================== | 416 | ============================================================== |
417 | 417 | ||
418 | ns_last_pid: | ||
419 | |||
420 | The last pid allocated in the current (the one task using this sysctl | ||
421 | lives in) pid namespace. When selecting a pid for a next task on fork | ||
422 | kernel tries to allocate a number starting from this one. | ||
423 | |||
424 | ============================================================== | ||
425 | |||
418 | powersave-nap: (PPC only) | 426 | powersave-nap: (PPC only) |
419 | 427 | ||
420 | If set, Linux-PPC will use the 'nap' mode of powersaving, | 428 | If set, Linux-PPC will use the 'nap' mode of powersaving, |
diff --git a/kernel/pid.c b/kernel/pid.c index fa5f72227e5f..ce8e00deaccb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b) | |||
137 | } | 137 | } |
138 | 138 | ||
139 | /* | 139 | /* |
140 | * We might be racing with someone else trying to set pid_ns->last_pid. | 140 | * We might be racing with someone else trying to set pid_ns->last_pid |
141 | * at the pid allocation time (there's also a sysctl for this, but racing | ||
142 | * with this one is OK, see comment in kernel/pid_namespace.c about it). | ||
141 | * We want the winner to have the "later" value, because if the | 143 | * We want the winner to have the "later" value, because if the |
142 | * "earlier" value prevails, then a pid may get reused immediately. | 144 | * "earlier" value prevails, then a pid may get reused immediately. |
143 | * | 145 | * |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index e9c9adc84ca6..a8968396046d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
191 | return; | 191 | return; |
192 | } | 192 | } |
193 | 193 | ||
194 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, | ||
195 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
196 | { | ||
197 | struct ctl_table tmp = *table; | ||
198 | |||
199 | if (write && !capable(CAP_SYS_ADMIN)) | ||
200 | return -EPERM; | ||
201 | |||
202 | /* | ||
203 | * Writing directly to ns' last_pid field is OK, since this field | ||
204 | * is volatile in a living namespace anyway and a code writing to | ||
205 | * it should synchronize its usage with external means. | ||
206 | */ | ||
207 | |||
208 | tmp.data = ¤t->nsproxy->pid_ns->last_pid; | ||
209 | return proc_dointvec(&tmp, write, buffer, lenp, ppos); | ||
210 | } | ||
211 | |||
212 | static struct ctl_table pid_ns_ctl_table[] = { | ||
213 | { | ||
214 | .procname = "ns_last_pid", | ||
215 | .maxlen = sizeof(int), | ||
216 | .mode = 0666, /* permissions are checked in the handler */ | ||
217 | .proc_handler = pid_ns_ctl_handler, | ||
218 | }, | ||
219 | { } | ||
220 | }; | ||
221 | |||
222 | static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; | ||
223 | |||
194 | static __init int pid_namespaces_init(void) | 224 | static __init int pid_namespaces_init(void) |
195 | { | 225 | { |
196 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); | 226 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); |
227 | register_sysctl_paths(kern_path, pid_ns_ctl_table); | ||
197 | return 0; | 228 | return 0; |
198 | } | 229 | } |
199 | 230 | ||