aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorCyrill Gorcunov <gorcunov@openvz.org>2012-05-31 19:26:44 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2012-05-31 20:49:32 -0400
commitd97b46a64674a267bc41c9e16132ee2a98c3347d (patch)
tree316f77d212c84aef226684eb05d5d33f40743ac9
parent818411616baf46ceba0cff6f05af3a9b294734f7 (diff)
syscalls, x86: add __NR_kcmp syscall
While doing the checkpoint-restore in the user space one need to determine whether various kernel objects (like mm_struct-s of file_struct-s) are shared between tasks and restore this state. The 2nd step can be solved by using appropriate CLONE_ flags and the unshare syscall, while there's currently no ways for solving the 1st one. One of the ways for checking whether two tasks share e.g. mm_struct is to provide some mm_struct ID of a task to its proc file, but showing such info considered to be not that good for security reasons. Thus after some debates we end up in conclusion that using that named 'comparison' syscall might be the best candidate. So here is it -- __NR_kcmp. It takes up to 5 arguments - the pids of the two tasks (which characteristics should be compared), the comparison type and (in case of comparison of files) two file descriptors. Lookups for pids are done in the caller's PID namespace only. At moment only x86 is supported and tested. [akpm@linux-foundation.org: fix up selftests, warnings] [akpm@linux-foundation.org: include errno.h] [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org> Acked-by: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Pavel Emelyanov <xemul@parallels.com> Cc: Andrey Vagin <avagin@openvz.org> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Glauber Costa <glommer@parallels.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Tejun Heo <tj@kernel.org> Cc: Matt Helsley <matthltc@us.ibm.com> Cc: Pekka Enberg <penberg@kernel.org> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Vasiliy Kulikov <segoon@openwall.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Valdis.Kletnieks@vt.edu Cc: Michal Marek <mmarek@suse.cz> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--arch/x86/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/syscalls/syscall_64.tbl2
-rw-r--r--include/linux/kcmp.h17
-rw-r--r--include/linux/syscalls.h2
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/kcmp.c196
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--tools/testing/selftests/Makefile2
-rw-r--r--tools/testing/selftests/kcmp/Makefile29
-rw-r--r--tools/testing/selftests/kcmp/kcmp_test.c94
10 files changed, 348 insertions, 1 deletions
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 29f9f0554f7d..7a35a6e71d44 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -355,3 +355,4 @@
355346 i386 setns sys_setns 355346 i386 setns sys_setns
356347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv 356347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv
357348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev 357348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
358349 i386 kcmp sys_kcmp
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index dd29a9ea27c5..51171aeff0dc 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -318,6 +318,8 @@
318309 common getcpu sys_getcpu 318309 common getcpu sys_getcpu
319310 64 process_vm_readv sys_process_vm_readv 319310 64 process_vm_readv sys_process_vm_readv
320311 64 process_vm_writev sys_process_vm_writev 320311 64 process_vm_writev sys_process_vm_writev
321312 64 kcmp sys_kcmp
322
321# 323#
322# x32-specific system call numbers start at 512 to avoid cache impact 324# x32-specific system call numbers start at 512 to avoid cache impact
323# for native 64-bit operation. 325# for native 64-bit operation.
diff --git a/include/linux/kcmp.h b/include/linux/kcmp.h
new file mode 100644
index 000000000000..2dcd1b3aafc8
--- /dev/null
+++ b/include/linux/kcmp.h
@@ -0,0 +1,17 @@
1#ifndef _LINUX_KCMP_H
2#define _LINUX_KCMP_H
3
4/* Comparison type */
5enum kcmp_type {
6 KCMP_FILE,
7 KCMP_VM,
8 KCMP_FILES,
9 KCMP_FS,
10 KCMP_SIGHAND,
11 KCMP_IO,
12 KCMP_SYSVSEM,
13
14 KCMP_TYPES,
15};
16
17#endif /* _LINUX_KCMP_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3de3acb84a95..19439c75c5b2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -858,4 +858,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
858 unsigned long riovcnt, 858 unsigned long riovcnt,
859 unsigned long flags); 859 unsigned long flags);
860 860
861asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
862 unsigned long idx1, unsigned long idx2);
861#endif 863#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c07f30fa9b7..80be6ca0cc75 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -25,6 +25,9 @@ endif
25obj-y += sched/ 25obj-y += sched/
26obj-y += power/ 26obj-y += power/
27 27
28ifeq ($(CONFIG_CHECKPOINT_RESTORE),y)
29obj-$(CONFIG_X86) += kcmp.o
30endif
28obj-$(CONFIG_FREEZER) += freezer.o 31obj-$(CONFIG_FREEZER) += freezer.o
29obj-$(CONFIG_PROFILING) += profile.o 32obj-$(CONFIG_PROFILING) += profile.o
30obj-$(CONFIG_STACKTRACE) += stacktrace.o 33obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
new file mode 100644
index 000000000000..30b7b225306c
--- /dev/null
+++ b/kernel/kcmp.c
@@ -0,0 +1,196 @@
1#include <linux/kernel.h>
2#include <linux/syscalls.h>
3#include <linux/fdtable.h>
4#include <linux/string.h>
5#include <linux/random.h>
6#include <linux/module.h>
7#include <linux/init.h>
8#include <linux/errno.h>
9#include <linux/cache.h>
10#include <linux/bug.h>
11#include <linux/err.h>
12#include <linux/kcmp.h>
13
14#include <asm/unistd.h>
15
16/*
17 * We don't expose the real in-memory order of objects for security reasons.
18 * But still the comparison results should be suitable for sorting. So we
19 * obfuscate kernel pointers values and compare the production instead.
20 *
21 * The obfuscation is done in two steps. First we xor the kernel pointer with
22 * a random value, which puts pointer into a new position in a reordered space.
23 * Secondly we multiply the xor production with a large odd random number to
24 * permute its bits even more (the odd multiplier guarantees that the product
25 * is unique ever after the high bits are truncated, since any odd number is
26 * relative prime to 2^n).
27 *
28 * Note also that the obfuscation itself is invisible to userspace and if needed
29 * it can be changed to an alternate scheme.
30 */
31static unsigned long cookies[KCMP_TYPES][2] __read_mostly;
32
33static long kptr_obfuscate(long v, int type)
34{
35 return (v ^ cookies[type][0]) * cookies[type][1];
36}
37
38/*
39 * 0 - equal, i.e. v1 = v2
40 * 1 - less than, i.e. v1 < v2
41 * 2 - greater than, i.e. v1 > v2
42 * 3 - not equal but ordering unavailable (reserved for future)
43 */
44static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
45{
46 long ret;
47
48 ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
49
50 return (ret < 0) | ((ret > 0) << 1);
51}
52
53/* The caller must have pinned the task */
54static struct file *
55get_file_raw_ptr(struct task_struct *task, unsigned int idx)
56{
57 struct file *file = NULL;
58
59 task_lock(task);
60 rcu_read_lock();
61
62 if (task->files)
63 file = fcheck_files(task->files, idx);
64
65 rcu_read_unlock();
66 task_unlock(task);
67
68 return file;
69}
70
71static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
72{
73 if (likely(m2 != m1))
74 mutex_unlock(m2);
75 mutex_unlock(m1);
76}
77
78static int kcmp_lock(struct mutex *m1, struct mutex *m2)
79{
80 int err;
81
82 if (m2 > m1)
83 swap(m1, m2);
84
85 err = mutex_lock_killable(m1);
86 if (!err && likely(m1 != m2)) {
87 err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
88 if (err)
89 mutex_unlock(m1);
90 }
91
92 return err;
93}
94
95SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
96 unsigned long, idx1, unsigned long, idx2)
97{
98 struct task_struct *task1, *task2;
99 int ret;
100
101 rcu_read_lock();
102
103 /*
104 * Tasks are looked up in caller's PID namespace only.
105 */
106 task1 = find_task_by_vpid(pid1);
107 task2 = find_task_by_vpid(pid2);
108 if (!task1 || !task2)
109 goto err_no_task;
110
111 get_task_struct(task1);
112 get_task_struct(task2);
113
114 rcu_read_unlock();
115
116 /*
117 * One should have enough rights to inspect task details.
118 */
119 ret = kcmp_lock(&task1->signal->cred_guard_mutex,
120 &task2->signal->cred_guard_mutex);
121 if (ret)
122 goto err;
123 if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
124 !ptrace_may_access(task2, PTRACE_MODE_READ)) {
125 ret = -EPERM;
126 goto err_unlock;
127 }
128
129 switch (type) {
130 case KCMP_FILE: {
131 struct file *filp1, *filp2;
132
133 filp1 = get_file_raw_ptr(task1, idx1);
134 filp2 = get_file_raw_ptr(task2, idx2);
135
136 if (filp1 && filp2)
137 ret = kcmp_ptr(filp1, filp2, KCMP_FILE);
138 else
139 ret = -EBADF;
140 break;
141 }
142 case KCMP_VM:
143 ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM);
144 break;
145 case KCMP_FILES:
146 ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES);
147 break;
148 case KCMP_FS:
149 ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS);
150 break;
151 case KCMP_SIGHAND:
152 ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND);
153 break;
154 case KCMP_IO:
155 ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO);
156 break;
157 case KCMP_SYSVSEM:
158#ifdef CONFIG_SYSVIPC
159 ret = kcmp_ptr(task1->sysvsem.undo_list,
160 task2->sysvsem.undo_list,
161 KCMP_SYSVSEM);
162#else
163 ret = -EOPNOTSUPP;
164#endif
165 break;
166 default:
167 ret = -EINVAL;
168 break;
169 }
170
171err_unlock:
172 kcmp_unlock(&task1->signal->cred_guard_mutex,
173 &task2->signal->cred_guard_mutex);
174err:
175 put_task_struct(task1);
176 put_task_struct(task2);
177
178 return ret;
179
180err_no_task:
181 rcu_read_unlock();
182 return -ESRCH;
183}
184
185static __init int kcmp_cookies_init(void)
186{
187 int i;
188
189 get_random_bytes(cookies, sizeof(cookies));
190
191 for (i = 0; i < KCMP_TYPES; i++)
192 cookies[i][1] |= (~(~0UL >> 1) | 1);
193
194 return 0;
195}
196arch_initcall(kcmp_cookies_init);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 47bfa16430d7..dbff751e4086 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark);
203cond_syscall(sys_name_to_handle_at); 203cond_syscall(sys_name_to_handle_at);
204cond_syscall(sys_open_by_handle_at); 204cond_syscall(sys_open_by_handle_at);
205cond_syscall(compat_sys_open_by_handle_at); 205cond_syscall(compat_sys_open_by_handle_at);
206
207/* compare kernel pointers */
208cond_syscall(sys_kcmp);
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 14972017a43e..a4162e15c25f 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -1,4 +1,4 @@
1TARGETS = breakpoints mqueue vm 1TARGETS = breakpoints kcmp mqueue vm
2 2
3all: 3all:
4 for TARGET in $(TARGETS); do \ 4 for TARGET in $(TARGETS); do \
diff --git a/tools/testing/selftests/kcmp/Makefile b/tools/testing/selftests/kcmp/Makefile
new file mode 100644
index 000000000000..dc79b86ea65c
--- /dev/null
+++ b/tools/testing/selftests/kcmp/Makefile
@@ -0,0 +1,29 @@
1uname_M := $(shell uname -m 2>/dev/null || echo not)
2ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
3ifeq ($(ARCH),i386)
4 ARCH := X86
5 CFLAGS := -DCONFIG_X86_32 -D__i386__
6endif
7ifeq ($(ARCH),x86_64)
8 ARCH := X86
9 CFLAGS := -DCONFIG_X86_64 -D__x86_64__
10endif
11
12CFLAGS += -I../../../../arch/x86/include/generated/
13CFLAGS += -I../../../../include/
14CFLAGS += -I../../../../usr/include/
15CFLAGS += -I../../../../arch/x86/include/
16
17all:
18ifeq ($(ARCH),X86)
19 gcc $(CFLAGS) kcmp_test.c -o run_test
20else
21 echo "Not an x86 target, can't build kcmp selftest"
22endif
23
24run-tests: all
25 ./kcmp_test
26
27clean:
28 rm -fr ./run_test
29 rm -fr ./test-file
diff --git a/tools/testing/selftests/kcmp/kcmp_test.c b/tools/testing/selftests/kcmp/kcmp_test.c
new file mode 100644
index 000000000000..358cc6bfa35d
--- /dev/null
+++ b/tools/testing/selftests/kcmp/kcmp_test.c
@@ -0,0 +1,94 @@
1#define _GNU_SOURCE
2
3#include <stdio.h>
4#include <stdlib.h>
5#include <signal.h>
6#include <limits.h>
7#include <unistd.h>
8#include <errno.h>
9#include <string.h>
10#include <fcntl.h>
11
12#include <linux/unistd.h>
13#include <linux/kcmp.h>
14
15#include <sys/syscall.h>
16#include <sys/types.h>
17#include <sys/stat.h>
18#include <sys/wait.h>
19
20static long sys_kcmp(int pid1, int pid2, int type, int fd1, int fd2)
21{
22 return syscall(__NR_kcmp, pid1, pid2, type, fd1, fd2);
23}
24
25int main(int argc, char **argv)
26{
27 const char kpath[] = "kcmp-test-file";
28 int pid1, pid2;
29 int fd1, fd2;
30 int status;
31
32 fd1 = open(kpath, O_RDWR | O_CREAT | O_TRUNC, 0644);
33 pid1 = getpid();
34
35 if (fd1 < 0) {
36 perror("Can't create file");
37 exit(1);
38 }
39
40 pid2 = fork();
41 if (pid2 < 0) {
42 perror("fork failed");
43 exit(1);
44 }
45
46 if (!pid2) {
47 int pid2 = getpid();
48 int ret;
49
50 fd2 = open(kpath, O_RDWR, 0644);
51 if (fd2 < 0) {
52 perror("Can't open file");
53 exit(1);
54 }
55
56 /* An example of output and arguments */
57 printf("pid1: %6d pid2: %6d FD: %2ld FILES: %2ld VM: %2ld "
58 "FS: %2ld SIGHAND: %2ld IO: %2ld SYSVSEM: %2ld "
59 "INV: %2ld\n",
60 pid1, pid2,
61 sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd2),
62 sys_kcmp(pid1, pid2, KCMP_FILES, 0, 0),
63 sys_kcmp(pid1, pid2, KCMP_VM, 0, 0),
64 sys_kcmp(pid1, pid2, KCMP_FS, 0, 0),
65 sys_kcmp(pid1, pid2, KCMP_SIGHAND, 0, 0),
66 sys_kcmp(pid1, pid2, KCMP_IO, 0, 0),
67 sys_kcmp(pid1, pid2, KCMP_SYSVSEM, 0, 0),
68
69 /* This one should fail */
70 sys_kcmp(pid1, pid2, KCMP_TYPES + 1, 0, 0));
71
72 /* This one should return same fd */
73 ret = sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd1);
74 if (ret) {
75 printf("FAIL: 0 expected but %d returned\n", ret);
76 ret = -1;
77 } else
78 printf("PASS: 0 returned as expected\n");
79
80 /* Compare with self */
81 ret = sys_kcmp(pid1, pid1, KCMP_VM, 0, 0);
82 if (ret) {
83 printf("FAIL: 0 expected but %li returned\n", ret);
84 ret = -1;
85 } else
86 printf("PASS: 0 returned as expected\n");
87
88 exit(ret);
89 }
90
91 waitpid(pid2, &status, P_ALL);
92
93 return 0;
94}