diff options
author | Andy Lutomirski <luto@amacapital.net> | 2012-10-01 14:40:45 -0400 |
---|---|---|
committer | James Morris <james.l.morris@oracle.com> | 2012-10-02 07:14:29 -0400 |
commit | 87b526d349b04c31d7b3a40b434eb3f825d22305 (patch) | |
tree | 2aeec0465901c9623ef7f5b3eb451ea6ccce6ecc /arch/x86/kernel | |
parent | bf5308344527d015ac9a6d2bda4ad4d40fd7d943 (diff) |
seccomp: Make syscall skipping and nr changes more consistent
This fixes two issues that could cause incompatibility between
kernel versions:

- If a tracer uses SECCOMP_RET_TRACE to select a syscall number
  higher than the largest known syscall, emulate the unknown
  vsyscall by returning -ENOSYS. (This is unlikely to make a
  noticeable difference on x86-64 due to the way the system call
  entry works.)
- On x86-64 with vsyscall=emulate, skipped vsyscalls were buggy.

This updates the documentation accordingly.
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Acked-by: Will Drewry <wad@chromium.org>
Signed-off-by: James Morris <james.l.morris@oracle.com>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/vsyscall_64.c | 110 |
1 files changed, 59 insertions, 51 deletions
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8d141b309046..b2e58a248b3b 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -136,19 +136,6 @@ static int addr_to_vsyscall_nr(unsigned long addr) | |||
136 | return nr; | 136 | return nr; |
137 | } | 137 | } |
138 | 138 | ||
139 | #ifdef CONFIG_SECCOMP | ||
140 | static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr) | ||
141 | { | ||
142 | if (!seccomp_mode(&tsk->seccomp)) | ||
143 | return 0; | ||
144 | task_pt_regs(tsk)->orig_ax = syscall_nr; | ||
145 | task_pt_regs(tsk)->ax = syscall_nr; | ||
146 | return __secure_computing(syscall_nr); | ||
147 | } | ||
148 | #else | ||
149 | #define vsyscall_seccomp(_tsk, _nr) 0 | ||
150 | #endif | ||
151 | |||
152 | static bool write_ok_or_segv(unsigned long ptr, size_t size) | 139 | static bool write_ok_or_segv(unsigned long ptr, size_t size) |
153 | { | 140 | { |
154 | /* | 141 | /* |
@@ -181,10 +168,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) | |||
181 | { | 168 | { |
182 | struct task_struct *tsk; | 169 | struct task_struct *tsk; |
183 | unsigned long caller; | 170 | unsigned long caller; |
184 | int vsyscall_nr; | 171 | int vsyscall_nr, syscall_nr, tmp; |
185 | int prev_sig_on_uaccess_error; | 172 | int prev_sig_on_uaccess_error; |
186 | long ret; | 173 | long ret; |
187 | int skip; | ||
188 | 174 | ||
189 | /* | 175 | /* |
190 | * No point in checking CS -- the only way to get here is a user mode | 176 | * No point in checking CS -- the only way to get here is a user mode |
@@ -216,56 +202,84 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) | |||
216 | } | 202 | } |
217 | 203 | ||
218 | tsk = current; | 204 | tsk = current; |
219 | /* | ||
220 | * With a real vsyscall, page faults cause SIGSEGV. We want to | ||
221 | * preserve that behavior to make writing exploits harder. | ||
222 | */ | ||
223 | prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; | ||
224 | current_thread_info()->sig_on_uaccess_error = 1; | ||
225 | 205 | ||
226 | /* | 206 | /* |
207 | * Check for access_ok violations and find the syscall nr. | ||
208 | * | ||
227 | * NULL is a valid user pointer (in the access_ok sense) on 32-bit and | 209 | * NULL is a valid user pointer (in the access_ok sense) on 32-bit and |
228 | * 64-bit, so we don't need to special-case it here. For all the | 210 | * 64-bit, so we don't need to special-case it here. For all the |
229 | * vsyscalls, NULL means "don't write anything" not "write it at | 211 | * vsyscalls, NULL means "don't write anything" not "write it at |
230 | * address 0". | 212 | * address 0". |
231 | */ | 213 | */ |
232 | ret = -EFAULT; | ||
233 | skip = 0; | ||
234 | switch (vsyscall_nr) { | 214 | switch (vsyscall_nr) { |
235 | case 0: | 215 | case 0: |
236 | skip = vsyscall_seccomp(tsk, __NR_gettimeofday); | ||
237 | if (skip) | ||
238 | break; | ||
239 | |||
240 | if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || | 216 | if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || |
241 | !write_ok_or_segv(regs->si, sizeof(struct timezone))) | 217 | !write_ok_or_segv(regs->si, sizeof(struct timezone))) { |
242 | break; | 218 | ret = -EFAULT; |
219 | goto check_fault; | ||
220 | } | ||
221 | |||
222 | syscall_nr = __NR_gettimeofday; | ||
223 | break; | ||
224 | |||
225 | case 1: | ||
226 | if (!write_ok_or_segv(regs->di, sizeof(time_t))) { | ||
227 | ret = -EFAULT; | ||
228 | goto check_fault; | ||
229 | } | ||
230 | |||
231 | syscall_nr = __NR_time; | ||
232 | break; | ||
233 | |||
234 | case 2: | ||
235 | if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || | ||
236 | !write_ok_or_segv(regs->si, sizeof(unsigned))) { | ||
237 | ret = -EFAULT; | ||
238 | goto check_fault; | ||
239 | } | ||
240 | |||
241 | syscall_nr = __NR_getcpu; | ||
242 | break; | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Handle seccomp. regs->ip must be the original value. | ||
247 | * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. | ||
248 | * | ||
249 | * We could optimize the seccomp disabled case, but performance | ||
250 | * here doesn't matter. | ||
251 | */ | ||
252 | regs->orig_ax = syscall_nr; | ||
253 | regs->ax = -ENOSYS; | ||
254 | tmp = secure_computing(syscall_nr); | ||
255 | if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { | ||
256 | warn_bad_vsyscall(KERN_DEBUG, regs, | ||
257 | "seccomp tried to change syscall nr or ip"); | ||
258 | do_exit(SIGSYS); | ||
259 | } | ||
260 | if (tmp) | ||
261 | goto do_ret; /* skip requested */ | ||
243 | 262 | ||
263 | /* | ||
264 | * With a real vsyscall, page faults cause SIGSEGV. We want to | ||
265 | * preserve that behavior to make writing exploits harder. | ||
266 | */ | ||
267 | prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; | ||
268 | current_thread_info()->sig_on_uaccess_error = 1; | ||
269 | |||
270 | ret = -EFAULT; | ||
271 | switch (vsyscall_nr) { | ||
272 | case 0: | ||
244 | ret = sys_gettimeofday( | 273 | ret = sys_gettimeofday( |
245 | (struct timeval __user *)regs->di, | 274 | (struct timeval __user *)regs->di, |
246 | (struct timezone __user *)regs->si); | 275 | (struct timezone __user *)regs->si); |
247 | break; | 276 | break; |
248 | 277 | ||
249 | case 1: | 278 | case 1: |
250 | skip = vsyscall_seccomp(tsk, __NR_time); | ||
251 | if (skip) | ||
252 | break; | ||
253 | |||
254 | if (!write_ok_or_segv(regs->di, sizeof(time_t))) | ||
255 | break; | ||
256 | |||
257 | ret = sys_time((time_t __user *)regs->di); | 279 | ret = sys_time((time_t __user *)regs->di); |
258 | break; | 280 | break; |
259 | 281 | ||
260 | case 2: | 282 | case 2: |
261 | skip = vsyscall_seccomp(tsk, __NR_getcpu); | ||
262 | if (skip) | ||
263 | break; | ||
264 | |||
265 | if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || | ||
266 | !write_ok_or_segv(regs->si, sizeof(unsigned))) | ||
267 | break; | ||
268 | |||
269 | ret = sys_getcpu((unsigned __user *)regs->di, | 283 | ret = sys_getcpu((unsigned __user *)regs->di, |
270 | (unsigned __user *)regs->si, | 284 | (unsigned __user *)regs->si, |
271 | NULL); | 285 | NULL); |
@@ -274,12 +288,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) | |||
274 | 288 | ||
275 | current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; | 289 | current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; |
276 | 290 | ||
277 | if (skip) { | 291 | check_fault: |
278 | if ((long)regs->ax <= 0L) /* seccomp errno emulation */ | ||
279 | goto do_ret; | ||
280 | goto done; /* seccomp trace/trap */ | ||
281 | } | ||
282 | |||
283 | if (ret == -EFAULT) { | 292 | if (ret == -EFAULT) { |
284 | /* Bad news -- userspace fed a bad pointer to a vsyscall. */ | 293 | /* Bad news -- userspace fed a bad pointer to a vsyscall. */ |
285 | warn_bad_vsyscall(KERN_INFO, regs, | 294 | warn_bad_vsyscall(KERN_INFO, regs, |
@@ -302,7 +311,6 @@ do_ret: | |||
302 | /* Emulate a ret instruction. */ | 311 | /* Emulate a ret instruction. */ |
303 | regs->ip = caller; | 312 | regs->ip = caller; |
304 | regs->sp += 8; | 313 | regs->sp += 8; |
305 | done: | ||
306 | return true; | 314 | return true; |
307 | 315 | ||
308 | sigsegv: | 316 | sigsegv: |