aboutsummaryrefslogtreecommitdiffstats
path: root/fs/pipe.c
diff options
context:
space:
mode:
authorMichael Kerrisk (man-pages) <mtk.manpages@gmail.com>2016-10-11 16:53:31 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-10-11 18:06:32 -0400
commitb0b91d18e2e97b741b294af9333824ecc3fadfd8 (patch)
treec691b991974347b735dc107ad4e3426d2f41c278 /fs/pipe.c
parent3734a13b96ebf039b293d8d37a934fd1bd9e03ab (diff)
pipe: fix limit checking in pipe_set_size()
The limit checking in pipe_set_size() (used by fcntl(F_SETPIPE_SZ)) has the following problems: (1) When increasing the pipe capacity, the checks against the limits in /proc/sys/fs/pipe-user-pages-{soft,hard} are made against existing consumption, and exclude the memory required for the increased pipe capacity. The new increase in pipe capacity can then push the total memory used by the user for pipes (possibly far) over a limit. This can also trigger the problem described next. (2) The limit checks are performed even when the new pipe capacity is less than the existing pipe capacity. This can lead to problems if a user sets a large pipe capacity, and then the limits are lowered, with the result that the user will no longer be able to decrease the pipe capacity. (3) As currently implemented, accounting and checking against the limits is done as follows: (a) Test whether the user has exceeded the limit. (b) Make new pipe buffer allocation. (c) Account new allocation against the limits. This is racey. Multiple processes may pass point (a) simultaneously, and then allocate pipe buffers that are accounted for only in step (c). The race means that the user's pipe buffer allocation could be pushed over the limit (by an arbitrary amount, depending on how unlucky we were in the race). [Thanks to Vegard Nossum for spotting this point, which I had missed.] This patch addresses the above problems as follows: * Perform checks against the limits only when increasing a pipe's capacity; an unprivileged user can always decrease a pipe's capacity. * Alter the checks against limits to include the memory required for the new pipe capacity. * Re-order the accounting step so that it precedes the buffer allocation. If the accounting step determines that a limit has been reached, revert the accounting and cause the operation to fail. The program below can be used to demonstrate problems 1 and 2, and the effect of the fix. The program takes one or more command-line arguments. The first argument specifies the number of pipes that the program should create. The remaining arguments are, alternately, pipe capacities that should be set using fcntl(F_SETPIPE_SZ), and sleep intervals (in seconds) between the fcntl() operations. (The sleep intervals allow the possibility to change the limits between fcntl() operations.) Problem 1 ========= Using the test program on an unpatched kernel, we first set some limits: # echo 0 > /proc/sys/fs/pipe-user-pages-soft # echo 1000000000 > /proc/sys/fs/pipe-max-size # echo 10000 > /proc/sys/fs/pipe-user-pages-hard # 40.96 MB Then show that we can set a pipe with capacity (100MB) that is over the hard limit # sudo -u mtk ./test_F_SETPIPE_SZ 1 100000000 Initial pipe capacity: 65536 Loop 1: set pipe capacity to 100000000 bytes F_SETPIPE_SZ returned 134217728 Now set the capacity to 100MB twice. The second call fails (which is probably surprising to most users, since it seems like a no-op): # sudo -u mtk ./test_F_SETPIPE_SZ 1 100000000 0 100000000 Initial pipe capacity: 65536 Loop 1: set pipe capacity to 100000000 bytes F_SETPIPE_SZ returned 134217728 Loop 2: set pipe capacity to 100000000 bytes Loop 2, pipe 0: F_SETPIPE_SZ failed: fcntl: Operation not permitted With a patched kernel, setting a capacity over the limit fails at the first attempt: # echo 0 > /proc/sys/fs/pipe-user-pages-soft # echo 1000000000 > /proc/sys/fs/pipe-max-size # echo 10000 > /proc/sys/fs/pipe-user-pages-hard # sudo -u mtk ./test_F_SETPIPE_SZ 1 100000000 Initial pipe capacity: 65536 Loop 1: set pipe capacity to 100000000 bytes Loop 1, pipe 0: F_SETPIPE_SZ failed: fcntl: Operation not permitted There is a small chance that the change to fix this problem could break user-space, since there are cases where fcntl(F_SETPIPE_SZ) calls that previously succeeded might fail. However, the chances are small, since (a) the pipe-user-pages-{soft,hard} limits are new (in 4.5), and the default soft/hard limits are high/unlimited. Therefore, it seems warranted to make these limits operate more precisely (and behave more like what users probably expect). Problem 2 ========= Running the test program on an unpatched kernel, we first set some limits: # getconf PAGESIZE 4096 # echo 0 > /proc/sys/fs/pipe-user-pages-soft # echo 1000000000 > /proc/sys/fs/pipe-max-size # echo 10000 > /proc/sys/fs/pipe-user-pages-hard # 40.96 MB Now perform two fcntl(F_SETPIPE_SZ) operations on a single pipe, first setting a pipe capacity (10MB), sleeping for a few seconds, during which time the hard limit is lowered, and then set pipe capacity to a smaller amount (5MB): # sudo -u mtk ./test_F_SETPIPE_SZ 1 10000000 15 5000000 & [1] 748 # Initial pipe capacity: 65536 Loop 1: set pipe capacity to 10000000 bytes F_SETPIPE_SZ returned 16777216 Sleeping 15 seconds # echo 1000 > /proc/sys/fs/pipe-user-pages-hard # 4.096 MB # Loop 2: set pipe capacity to 5000000 bytes Loop 2, pipe 0: F_SETPIPE_SZ failed: fcntl: Operation not permitted In this case, the user should be able to lower the limit. With a kernel that has the patch below, the second fcntl() succeeds: # echo 0 > /proc/sys/fs/pipe-user-pages-soft # echo 1000000000 > /proc/sys/fs/pipe-max-size # echo 10000 > /proc/sys/fs/pipe-user-pages-hard # sudo -u mtk ./test_F_SETPIPE_SZ 1 10000000 15 5000000 & [1] 3215 # Initial pipe capacity: 65536 # Loop 1: set pipe capacity to 10000000 bytes F_SETPIPE_SZ returned 16777216 Sleeping 15 seconds # echo 1000 > /proc/sys/fs/pipe-user-pages-hard # Loop 2: set pipe capacity to 5000000 bytes F_SETPIPE_SZ returned 8388608 8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x--- /* test_F_SETPIPE_SZ.c (C) 2016, Michael Kerrisk; licensed under GNU GPL version 2 or later Test operation of fcntl(F_SETPIPE_SZ) for setting pipe capacity and interactions with limits defined by /proc/sys/fs/pipe-* files. */ #define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <unistd.h> int main(int argc, char *argv[]) { int (*pfd)[2]; int npipes; int pcap, rcap; int j, p, s, stime, loop; if (argc < 2) { fprintf(stderr, "Usage: %s num-pipes " "[pipe-capacity sleep-time]...\n", argv[0]); exit(EXIT_FAILURE); } npipes = atoi(argv[1]); pfd = calloc(npipes, sizeof (int [2])); if (pfd == NULL) { perror("calloc"); exit(EXIT_FAILURE); } for (j = 0; j < npipes; j++) { if (pipe(pfd[j]) == -1) { fprintf(stderr, "Loop %d: pipe() failed: ", j); perror("pipe"); exit(EXIT_FAILURE); } } printf("Initial pipe capacity: %d\n", fcntl(pfd[0][0], F_GETPIPE_SZ)); for (j = 2; j < argc; j += 2 ) { loop = j / 2; pcap = atoi(argv[j]); printf(" Loop %d: set pipe capacity to %d bytes\n", loop, pcap); for (p = 0; p < npipes; p++) { s = fcntl(pfd[p][0], F_SETPIPE_SZ, pcap); if (s == -1) { fprintf(stderr, " Loop %d, pipe %d: F_SETPIPE_SZ " "failed: ", loop, p); perror("fcntl"); exit(EXIT_FAILURE); } if (p == 0) { printf(" F_SETPIPE_SZ returned %d\n", s); rcap = s; } else { if (s != rcap) { fprintf(stderr, " Loop %d, pipe %d: F_SETPIPE_SZ " "unexpected return: %d\n", loop, p, s); exit(EXIT_FAILURE); } } stime = (j + 1 < argc) ? atoi(argv[j + 1]) : 0; if (stime > 0) { printf(" Sleeping %d seconds\n", stime); sleep(stime); } } } exit(EXIT_SUCCESS); } 8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x--- Patch history: v2 * Switch order of test in 'if' statement to avoid function call (to capability()) in normal path. [This is a fix to a preexisting wart in the code. Thanks to Willy Tarreau] * Perform (size > pipe_max_size) check before calling account_pipe_buffers(). [Thanks to Vegard Nossum] Quoting Vegard: The potential problem happens if the user passes a very large number which will overflow pipe->user->pipe_bufs. On 32-bit, sizeof(int) == sizeof(long), so if they pass arg = INT_MAX then round_pipe_size() returns INT_MAX. Although it's true that the accounting is done in terms of pages and not bytes, so you'd need on the order of (1 << 13) = 8192 processes hitting the limit at the same time in order to make it overflow, which seems a bit unlikely. (See https://lkml.org/lkml/2016/8/12/215 for another discussion on the limit checking) Link: http://lkml.kernel.org/r/1e464945-536b-2420-798b-e77b9c7e8593@gmail.com Signed-off-by: Michael Kerrisk <mtk.manpages@gmail.com> Reviewed-by: Vegard Nossum <vegard.nossum@oracle.com> Cc: Willy Tarreau <w@1wt.eu> Cc: <socketpair@gmail.com> Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Cc: Jens Axboe <axboe@fb.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/pipe.c')
-rw-r--r--fs/pipe.c41
1 files changed, 31 insertions, 10 deletions
diff --git a/fs/pipe.c b/fs/pipe.c
index b9422bc17028..dc5f4c040890 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1027,6 +1027,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
1027{ 1027{
1028 struct pipe_buffer *bufs; 1028 struct pipe_buffer *bufs;
1029 unsigned int size, nr_pages; 1029 unsigned int size, nr_pages;
1030 long ret = 0;
1030 1031
1031 size = round_pipe_size(arg); 1032 size = round_pipe_size(arg);
1032 nr_pages = size >> PAGE_SHIFT; 1033 nr_pages = size >> PAGE_SHIFT;
@@ -1034,13 +1035,26 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
1034 if (!nr_pages) 1035 if (!nr_pages)
1035 return -EINVAL; 1036 return -EINVAL;
1036 1037
1037 if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) 1038 /*
1039 * If trying to increase the pipe capacity, check that an
1040 * unprivileged user is not trying to exceed various limits
1041 * (soft limit check here, hard limit check just below).
1042 * Decreasing the pipe capacity is always permitted, even
1043 * if the user is currently over a limit.
1044 */
1045 if (nr_pages > pipe->buffers &&
1046 size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
1038 return -EPERM; 1047 return -EPERM;
1039 1048
1040 if ((too_many_pipe_buffers_hard(pipe->user) || 1049 account_pipe_buffers(pipe->user, pipe->buffers, nr_pages);
1041 too_many_pipe_buffers_soft(pipe->user)) && 1050
1042 !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) 1051 if (nr_pages > pipe->buffers &&
1043 return -EPERM; 1052 (too_many_pipe_buffers_hard(pipe->user) ||
1053 too_many_pipe_buffers_soft(pipe->user)) &&
1054 !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
1055 ret = -EPERM;
1056 goto out_revert_acct;
1057 }
1044 1058
1045 /* 1059 /*
1046 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't 1060 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
@@ -1048,13 +1062,17 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
1048 * again like we would do for growing. If the pipe currently 1062 * again like we would do for growing. If the pipe currently
1049 * contains more buffers than arg, then return busy. 1063 * contains more buffers than arg, then return busy.
1050 */ 1064 */
1051 if (nr_pages < pipe->nrbufs) 1065 if (nr_pages < pipe->nrbufs) {
1052 return -EBUSY; 1066 ret = -EBUSY;
1067 goto out_revert_acct;
1068 }
1053 1069
1054 bufs = kcalloc(nr_pages, sizeof(*bufs), 1070 bufs = kcalloc(nr_pages, sizeof(*bufs),
1055 GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 1071 GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
1056 if (unlikely(!bufs)) 1072 if (unlikely(!bufs)) {
1057 return -ENOMEM; 1073 ret = -ENOMEM;
1074 goto out_revert_acct;
1075 }
1058 1076
1059 /* 1077 /*
1060 * The pipe array wraps around, so just start the new one at zero 1078 * The pipe array wraps around, so just start the new one at zero
@@ -1077,12 +1095,15 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
1077 memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); 1095 memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
1078 } 1096 }
1079 1097
1080 account_pipe_buffers(pipe->user, pipe->buffers, nr_pages);
1081 pipe->curbuf = 0; 1098 pipe->curbuf = 0;
1082 kfree(pipe->bufs); 1099 kfree(pipe->bufs);
1083 pipe->bufs = bufs; 1100 pipe->bufs = bufs;
1084 pipe->buffers = nr_pages; 1101 pipe->buffers = nr_pages;
1085 return nr_pages * PAGE_SIZE; 1102 return nr_pages * PAGE_SIZE;
1103
1104out_revert_acct:
1105 account_pipe_buffers(pipe->user, nr_pages, pipe->buffers);
1106 return ret;
1086} 1107}
1087 1108
1088/* 1109/*