aboutsummaryrefslogtreecommitdiffstats
path: root/fs/select.c
diff options
context:
space:
mode:
authorDavid Woodhouse <dwmw2@infradead.org>2006-01-18 20:44:05 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-01-18 22:20:30 -0500
commit9f72949f679df06021c9e43886c9191494fdb007 (patch)
treef4d76ed281b34e195db7741b69a7d095e168a864 /fs/select.c
parent36a7878a224c18aa4a5e098dc93d19cf5601462b (diff)
[PATCH] Add pselect/ppoll system call implementation
The following implementation of ppoll() and pselect() system calls depends on the architecture providing a TIF_RESTORE_SIGMASK flag in the thread_info. These system calls have to change the signal mask during their operation, and signal handlers must be invoked using the new, temporary signal mask. The old signal mask must be restored either upon successful exit from the system call, or upon returning from the invoked signal handler if the system call is interrupted. We can't simply restore the original signal mask and return to userspace, since the restored signal mask may actually block the signal which interrupted the system call. The TIF_RESTORE_SIGMASK flag deals with this by causing the syscall exit path to trap into do_signal() just as TIF_SIGPENDING does, and by causing do_signal() to use the saved signal mask instead of the current signal mask when setting up the stack frame for the signal handler -- or by causing do_signal() to simply restore the saved signal mask in the case where there is no handler to be invoked. The first patch implements the sys_pselect() and sys_ppoll() system calls, which are present only if TIF_RESTORE_SIGMASK is defined. That #ifdef should go away in time when all architectures have implemented it. The second patch implements TIF_RESTORE_SIGMASK for the PowerPC kernel (in the -mm tree), and the third patch then removes the arch-specific implementations of sys_rt_sigsuspend() and replaces them with generic versions using the same trick. The fourth and fifth patches, provided by David Howells, implement TIF_RESTORE_SIGMASK for FR-V and i386 respectively, and the sixth patch adds the syscalls to the i386 syscall table. This patch: Add the pselect() and ppoll() system calls, providing core routines usable by the original select() and poll() system calls and also the new calls (with their semantics w.r.t timeouts). Signed-off-by: David Woodhouse <dwmw2@infradead.org> Cc: Michael Kerrisk <mtk-manpages@gmx.net> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'fs/select.c')
-rw-r--r--fs/select.c348
1 files changed, 291 insertions, 57 deletions
diff --git a/fs/select.c b/fs/select.c
index f10a10317d54..c0f02d36c60e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -179,12 +179,11 @@ get_max:
179#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR) 179#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
180#define POLLEX_SET (POLLPRI) 180#define POLLEX_SET (POLLPRI)
181 181
182int do_select(int n, fd_set_bits *fds, long *timeout) 182int do_select(int n, fd_set_bits *fds, s64 *timeout)
183{ 183{
184 struct poll_wqueues table; 184 struct poll_wqueues table;
185 poll_table *wait; 185 poll_table *wait;
186 int retval, i; 186 int retval, i;
187 long __timeout = *timeout;
188 187
189 rcu_read_lock(); 188 rcu_read_lock();
190 retval = max_select_fd(n, fds); 189 retval = max_select_fd(n, fds);
@@ -196,11 +195,12 @@ int do_select(int n, fd_set_bits *fds, long *timeout)
196 195
197 poll_initwait(&table); 196 poll_initwait(&table);
198 wait = &table.pt; 197 wait = &table.pt;
199 if (!__timeout) 198 if (!*timeout)
200 wait = NULL; 199 wait = NULL;
201 retval = 0; 200 retval = 0;
202 for (;;) { 201 for (;;) {
203 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; 202 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
203 long __timeout;
204 204
205 set_current_state(TASK_INTERRUPTIBLE); 205 set_current_state(TASK_INTERRUPTIBLE);
206 206
@@ -255,22 +255,32 @@ int do_select(int n, fd_set_bits *fds, long *timeout)
255 *rexp = res_ex; 255 *rexp = res_ex;
256 } 256 }
257 wait = NULL; 257 wait = NULL;
258 if (retval || !__timeout || signal_pending(current)) 258 if (retval || !*timeout || signal_pending(current))
259 break; 259 break;
260 if(table.error) { 260 if(table.error) {
261 retval = table.error; 261 retval = table.error;
262 break; 262 break;
263 } 263 }
264
265 if (*timeout < 0) {
266 /* Wait indefinitely */
267 __timeout = MAX_SCHEDULE_TIMEOUT;
268 } else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {
269 /* Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in a loop */
270 __timeout = MAX_SCHEDULE_TIMEOUT - 1;
271 *timeout -= __timeout;
272 } else {
273 __timeout = *timeout;
274 *timeout = 0;
275 }
264 __timeout = schedule_timeout(__timeout); 276 __timeout = schedule_timeout(__timeout);
277 if (*timeout >= 0)
278 *timeout += __timeout;
265 } 279 }
266 __set_current_state(TASK_RUNNING); 280 __set_current_state(TASK_RUNNING);
267 281
268 poll_freewait(&table); 282 poll_freewait(&table);
269 283
270 /*
271 * Up-to-date the caller timeout.
272 */
273 *timeout = __timeout;
274 return retval; 284 return retval;
275} 285}
276 286
@@ -295,36 +305,14 @@ static void select_bits_free(void *bits, int size)
295#define MAX_SELECT_SECONDS \ 305#define MAX_SELECT_SECONDS \
296 ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) 306 ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
297 307
298asmlinkage long 308static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
299sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp) 309 fd_set __user *exp, s64 *timeout)
300{ 310{
301 fd_set_bits fds; 311 fd_set_bits fds;
302 char *bits; 312 char *bits;
303 long timeout;
304 int ret, size, max_fdset; 313 int ret, size, max_fdset;
305 struct fdtable *fdt; 314 struct fdtable *fdt;
306 315
307 timeout = MAX_SCHEDULE_TIMEOUT;
308 if (tvp) {
309 time_t sec, usec;
310
311 if (!access_ok(VERIFY_READ, tvp, sizeof(*tvp))
312 || __get_user(sec, &tvp->tv_sec)
313 || __get_user(usec, &tvp->tv_usec)) {
314 ret = -EFAULT;
315 goto out_nofds;
316 }
317
318 ret = -EINVAL;
319 if (sec < 0 || usec < 0)
320 goto out_nofds;
321
322 if ((unsigned long) sec < MAX_SELECT_SECONDS) {
323 timeout = ROUND_UP(usec, 1000000/HZ);
324 timeout += sec * (unsigned long) HZ;
325 }
326 }
327
328 ret = -EINVAL; 316 ret = -EINVAL;
329 if (n < 0) 317 if (n < 0)
330 goto out_nofds; 318 goto out_nofds;
@@ -362,18 +350,7 @@ sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s
362 zero_fd_set(n, fds.res_out); 350 zero_fd_set(n, fds.res_out);
363 zero_fd_set(n, fds.res_ex); 351 zero_fd_set(n, fds.res_ex);
364 352
365 ret = do_select(n, &fds, &timeout); 353 ret = do_select(n, &fds, timeout);
366
367 if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
368 time_t sec = 0, usec = 0;
369 if (timeout) {
370 sec = timeout / HZ;
371 usec = timeout % HZ;
372 usec *= (1000000/HZ);
373 }
374 put_user(sec, &tvp->tv_sec);
375 put_user(usec, &tvp->tv_usec);
376 }
377 354
378 if (ret < 0) 355 if (ret < 0)
379 goto out; 356 goto out;
@@ -395,6 +372,154 @@ out_nofds:
395 return ret; 372 return ret;
396} 373}
397 374
375asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
376 fd_set __user *exp, struct timeval __user *tvp)
377{
378 s64 timeout = -1;
379 struct timeval tv;
380 int ret;
381
382 if (tvp) {
383 if (copy_from_user(&tv, tvp, sizeof(tv)))
384 return -EFAULT;
385
386 if (tv.tv_sec < 0 || tv.tv_usec < 0)
387 return -EINVAL;
388
389 /* Cast to u64 to make GCC stop complaining */
390 if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
391 timeout = -1; /* infinite */
392 else {
393 timeout = ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
394 timeout += tv.tv_sec * HZ;
395 }
396 }
397
398 ret = core_sys_select(n, inp, outp, exp, &timeout);
399
400 if (tvp) {
401 if (current->personality & STICKY_TIMEOUTS)
402 goto sticky;
403 tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
404 tv.tv_sec = timeout;
405 if (copy_to_user(tvp, &tv, sizeof(tv))) {
406sticky:
407 /*
408 * If an application puts its timeval in read-only
409 * memory, we don't want the Linux-specific update to
410 * the timeval to cause a fault after the select has
411 * completed successfully. However, because we're not
412 * updating the timeval, we can't restart the system
413 * call.
414 */
415 if (ret == -ERESTARTNOHAND)
416 ret = -EINTR;
417 }
418 }
419
420 return ret;
421}
422
423#ifdef TIF_RESTORE_SIGMASK
424asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
425 fd_set __user *exp, struct timespec __user *tsp,
426 const sigset_t __user *sigmask, size_t sigsetsize)
427{
428 s64 timeout = MAX_SCHEDULE_TIMEOUT;
429 sigset_t ksigmask, sigsaved;
430 struct timespec ts;
431 int ret;
432
433 if (tsp) {
434 if (copy_from_user(&ts, tsp, sizeof(ts)))
435 return -EFAULT;
436
437 if (ts.tv_sec < 0 || ts.tv_nsec < 0)
438 return -EINVAL;
439
440 /* Cast to u64 to make GCC stop complaining */
441 if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
442 timeout = -1; /* infinite */
443 else {
444 timeout = ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
445 timeout += ts.tv_sec * HZ;
446 }
447 }
448
449 if (sigmask) {
450 /* XXX: Don't preclude handling different sized sigset_t's. */
451 if (sigsetsize != sizeof(sigset_t))
452 return -EINVAL;
453 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
454 return -EFAULT;
455
456 sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
457 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
458 }
459
460 ret = core_sys_select(n, inp, outp, exp, &timeout);
461
462 if (tsp) {
463 if (current->personality & STICKY_TIMEOUTS)
464 goto sticky;
465 ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000;
466 ts.tv_sec = timeout;
467 if (copy_to_user(tsp, &ts, sizeof(ts))) {
468sticky:
469 /*
470 * If an application puts its timeval in read-only
471 * memory, we don't want the Linux-specific update to
472 * the timeval to cause a fault after the select has
473 * completed successfully. However, because we're not
474 * updating the timeval, we can't restart the system
475 * call.
476 */
477 if (ret == -ERESTARTNOHAND)
478 ret = -EINTR;
479 }
480 }
481
482 if (ret == -ERESTARTNOHAND) {
483 /*
484 * Don't restore the signal mask yet. Let do_signal() deliver
485 * the signal on the way back to userspace, before the signal
486 * mask is restored.
487 */
488 if (sigmask) {
489 memcpy(&current->saved_sigmask, &sigsaved,
490 sizeof(sigsaved));
491 set_thread_flag(TIF_RESTORE_SIGMASK);
492 }
493 } else if (sigmask)
494 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
495
496 return ret;
497}
498
499/*
500 * Most architectures can't handle 7-argument syscalls. So we provide a
501 * 6-argument version where the sixth argument is a pointer to a structure
502 * which has a pointer to the sigset_t itself followed by a size_t containing
503 * the sigset size.
504 */
505asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
506 fd_set __user *exp, struct timespec __user *tsp, void __user *sig)
507{
508 size_t sigsetsize = 0;
509 sigset_t __user *up = NULL;
510
511 if (sig) {
512 if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
513 || __get_user(up, (sigset_t * __user *)sig)
514 || __get_user(sigsetsize,
515 (size_t * __user)(sig+sizeof(void *))))
516 return -EFAULT;
517 }
518
519 return sys_pselect7(n, inp, outp, exp, tsp, up, sigsetsize);
520}
521#endif /* TIF_RESTORE_SIGMASK */
522
398struct poll_list { 523struct poll_list {
399 struct poll_list *next; 524 struct poll_list *next;
400 int len; 525 int len;
@@ -436,16 +561,19 @@ static void do_pollfd(unsigned int num, struct pollfd * fdpage,
436} 561}
437 562
438static int do_poll(unsigned int nfds, struct poll_list *list, 563static int do_poll(unsigned int nfds, struct poll_list *list,
439 struct poll_wqueues *wait, long timeout) 564 struct poll_wqueues *wait, s64 *timeout)
440{ 565{
441 int count = 0; 566 int count = 0;
442 poll_table* pt = &wait->pt; 567 poll_table* pt = &wait->pt;
443 568
444 if (!timeout) 569 /* Optimise the no-wait case */
570 if (!(*timeout))
445 pt = NULL; 571 pt = NULL;
446 572
447 for (;;) { 573 for (;;) {
448 struct poll_list *walk; 574 struct poll_list *walk;
575 long __timeout;
576
449 set_current_state(TASK_INTERRUPTIBLE); 577 set_current_state(TASK_INTERRUPTIBLE);
450 walk = list; 578 walk = list;
451 while(walk != NULL) { 579 while(walk != NULL) {
@@ -453,18 +581,36 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
453 walk = walk->next; 581 walk = walk->next;
454 } 582 }
455 pt = NULL; 583 pt = NULL;
456 if (count || !timeout || signal_pending(current)) 584 if (count || !*timeout || signal_pending(current))
457 break; 585 break;
458 count = wait->error; 586 count = wait->error;
459 if (count) 587 if (count)
460 break; 588 break;
461 timeout = schedule_timeout(timeout); 589
590 if (*timeout < 0) {
591 /* Wait indefinitely */
592 __timeout = MAX_SCHEDULE_TIMEOUT;
593 } else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT-1)) {
594 /*
595 * Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in
596 * a loop
597 */
598 __timeout = MAX_SCHEDULE_TIMEOUT - 1;
599 *timeout -= __timeout;
600 } else {
601 __timeout = *timeout;
602 *timeout = 0;
603 }
604
605 __timeout = schedule_timeout(__timeout);
606 if (*timeout >= 0)
607 *timeout += __timeout;
462 } 608 }
463 __set_current_state(TASK_RUNNING); 609 __set_current_state(TASK_RUNNING);
464 return count; 610 return count;
465} 611}
466 612
467asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long timeout) 613int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
468{ 614{
469 struct poll_wqueues table; 615 struct poll_wqueues table;
470 int fdcount, err; 616 int fdcount, err;
@@ -482,14 +628,6 @@ asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long ti
482 if (nfds > max_fdset && nfds > OPEN_MAX) 628 if (nfds > max_fdset && nfds > OPEN_MAX)
483 return -EINVAL; 629 return -EINVAL;
484 630
485 if (timeout) {
486 /* Careful about overflow in the intermediate values */
487 if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ)
488 timeout = (unsigned long)(timeout*HZ+999)/1000+1;
489 else /* Negative or overflow */
490 timeout = MAX_SCHEDULE_TIMEOUT;
491 }
492
493 poll_initwait(&table); 631 poll_initwait(&table);
494 632
495 head = NULL; 633 head = NULL;
@@ -519,6 +657,7 @@ asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long ti
519 } 657 }
520 i -= pp->len; 658 i -= pp->len;
521 } 659 }
660
522 fdcount = do_poll(nfds, head, &table, timeout); 661 fdcount = do_poll(nfds, head, &table, timeout);
523 662
524 /* OK, now copy the revents fields back to user space. */ 663 /* OK, now copy the revents fields back to user space. */
@@ -547,3 +686,98 @@ out_fds:
547 poll_freewait(&table); 686 poll_freewait(&table);
548 return err; 687 return err;
549} 688}
689
690asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
691 long timeout_msecs)
692{
693 s64 timeout_jiffies = 0;
694
695 if (timeout_msecs) {
696#if HZ > 1000
697 /* We can only overflow if HZ > 1000 */
698 if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
699 timeout_jiffies = -1;
700 else
701#endif
702 timeout_jiffies = msecs_to_jiffies(timeout_msecs);
703 }
704
705 return do_sys_poll(ufds, nfds, &timeout_jiffies);
706}
707
708#ifdef TIF_RESTORE_SIGMASK
709asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
710 struct timespec __user *tsp, const sigset_t __user *sigmask,
711 size_t sigsetsize)
712{
713 sigset_t ksigmask, sigsaved;
714 struct timespec ts;
715 s64 timeout = -1;
716 int ret;
717
718 if (tsp) {
719 if (copy_from_user(&ts, tsp, sizeof(ts)))
720 return -EFAULT;
721
722 /* Cast to u64 to make GCC stop complaining */
723 if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
724 timeout = -1; /* infinite */
725 else {
726 timeout = ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
727 timeout += ts.tv_sec * HZ;
728 }
729 }
730
731 if (sigmask) {
732 /* XXX: Don't preclude handling different sized sigset_t's. */
733 if (sigsetsize != sizeof(sigset_t))
734 return -EINVAL;
735 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
736 return -EFAULT;
737
738 sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
739 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
740 }
741
742 ret = do_sys_poll(ufds, nfds, &timeout);
743
744 /* We can restart this syscall, usually */
745 if (ret == -EINTR) {
746 /*
747 * Don't restore the signal mask yet. Let do_signal() deliver
748 * the signal on the way back to userspace, before the signal
749 * mask is restored.
750 */
751 if (sigmask) {
752 memcpy(&current->saved_sigmask, &sigsaved,
753 sizeof(sigsaved));
754 set_thread_flag(TIF_RESTORE_SIGMASK);
755 }
756 ret = -ERESTARTNOHAND;
757 } else if (sigmask)
758 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
759
760 if (tsp && timeout >= 0) {
761 if (current->personality & STICKY_TIMEOUTS)
762 goto sticky;
763 /* Yes, we know it's actually an s64, but it's also positive. */
764 ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000;
765 ts.tv_sec = timeout;
766 if (copy_to_user(tsp, &ts, sizeof(ts))) {
767 sticky:
768 /*
769 * If an application puts its timeval in read-only
770 * memory, we don't want the Linux-specific update to
771 * the timeval to cause a fault after the select has
772 * completed successfully. However, because we're not
773 * updating the timeval, we can't restart the system
774 * call.
775 */
776 if (ret == -ERESTARTNOHAND && timeout >= 0)
777 ret = -EINTR;
778 }
779 }
780
781 return ret;
782}
783#endif /* TIF_RESTORE_SIGMASK */