diff options
author | Jens Axboe <axboe@suse.de> | 2006-07-10 05:00:01 -0400 |
---|---|---|
committer | Jens Axboe <axboe@suse.de> | 2006-07-10 05:00:01 -0400 |
commit | aadd06e5c56b9ff5117ec77e59eada43dc46e2fc (patch) | |
tree | 16da42148eab5ebcfc821fcedd0541f35a2c318b | |
parent | b3cf257623fabd8f1ee6700a6d328cc1c5da5a1d (diff) |
[PATCH] splice: fix problems with sys_tee()
Several issues noticed/fixed:
- We cannot reliably block in link_pipe() while holding both input and output
mutexes. So do preparatory checks before locking down both mutexes and doing
the link.
- The ipipe->nrbufs vs i check was bad, because we could have dropped the
ipipe lock between loop iterations. If someone else read from this pipe
while the lock was dropped, ipipe->nrbufs could fall below i, and we would
then potentially look at buffers that are no longer valid.
Signed-off-by: Jens Axboe <axboe@suse.de>
-rw-r--r-- | fs/splice.c | 238 |
1 files changed, 133 insertions, 105 deletions
diff --git a/fs/splice.c b/fs/splice.c
index 05fd2787be98..684bca3d3a10 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1307,6 +1307,85 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, | |||
1307 | } | 1307 | } |
1308 | 1308 | ||
1309 | /* | 1309 | /* |
1310 | * Make sure there's data to read. Wait for input if we can, otherwise | ||
1311 | * return an appropriate error. | ||
1312 | */ | ||
1313 | static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) | ||
1314 | { | ||
1315 | int ret; | ||
1316 | |||
1317 | /* | ||
1318 | * Check ->nrbufs without the inode lock first. This function | ||
1319 | * is speculative anyways, so missing one is ok. | ||
1320 | */ | ||
1321 | if (pipe->nrbufs) | ||
1322 | return 0; | ||
1323 | |||
1324 | ret = 0; | ||
1325 | mutex_lock(&pipe->inode->i_mutex); | ||
1326 | |||
1327 | while (!pipe->nrbufs) { | ||
1328 | if (signal_pending(current)) { | ||
1329 | ret = -ERESTARTSYS; | ||
1330 | break; | ||
1331 | } | ||
1332 | if (!pipe->writers) | ||
1333 | break; | ||
1334 | if (!pipe->waiting_writers) { | ||
1335 | if (flags & SPLICE_F_NONBLOCK) { | ||
1336 | ret = -EAGAIN; | ||
1337 | break; | ||
1338 | } | ||
1339 | } | ||
1340 | pipe_wait(pipe); | ||
1341 | } | ||
1342 | |||
1343 | mutex_unlock(&pipe->inode->i_mutex); | ||
1344 | return ret; | ||
1345 | } | ||
1346 | |||
1347 | /* | ||
1348 | * Make sure there's writeable room. Wait for room if we can, otherwise | ||
1349 | * return an appropriate error. | ||
1350 | */ | ||
1351 | static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) | ||
1352 | { | ||
1353 | int ret; | ||
1354 | |||
1355 | /* | ||
1356 | * Check ->nrbufs without the inode lock first. This function | ||
1357 | * is speculative anyways, so missing one is ok. | ||
1358 | */ | ||
1359 | if (pipe->nrbufs < PIPE_BUFFERS) | ||
1360 | return 0; | ||
1361 | |||
1362 | ret = 0; | ||
1363 | mutex_lock(&pipe->inode->i_mutex); | ||
1364 | |||
1365 | while (pipe->nrbufs >= PIPE_BUFFERS) { | ||
1366 | if (!pipe->readers) { | ||
1367 | send_sig(SIGPIPE, current, 0); | ||
1368 | ret = -EPIPE; | ||
1369 | break; | ||
1370 | } | ||
1371 | if (flags & SPLICE_F_NONBLOCK) { | ||
1372 | ret = -EAGAIN; | ||
1373 | break; | ||
1374 | } | ||
1375 | if (signal_pending(current)) { | ||
1376 | ret = -ERESTARTSYS; | ||
1377 | break; | ||
1378 | } | ||
1379 | pipe->waiting_writers++; | ||
1380 | pipe_wait(pipe); | ||
1381 | pipe->waiting_writers--; | ||
1382 | } | ||
1383 | |||
1384 | mutex_unlock(&pipe->inode->i_mutex); | ||
1385 | return ret; | ||
1386 | } | ||
1387 | |||
1388 | /* | ||
1310 | * Link contents of ipipe to opipe. | 1389 | * Link contents of ipipe to opipe. |
1311 | */ | 1390 | */ |
1312 | static int link_pipe(struct pipe_inode_info *ipipe, | 1391 | static int link_pipe(struct pipe_inode_info *ipipe, |
@@ -1314,9 +1393,7 @@ static int link_pipe(struct pipe_inode_info *ipipe, | |||
1314 | size_t len, unsigned int flags) | 1393 | size_t len, unsigned int flags) |
1315 | { | 1394 | { |
1316 | struct pipe_buffer *ibuf, *obuf; | 1395 | struct pipe_buffer *ibuf, *obuf; |
1317 | int ret, do_wakeup, i, ipipe_first; | 1396 | int ret = 0, i = 0, nbuf; |
1318 | |||
1319 | ret = do_wakeup = ipipe_first = 0; | ||
1320 | 1397 | ||
1321 | /* | 1398 | /* |
1322 | * Potential ABBA deadlock, work around it by ordering lock | 1399 | * Potential ABBA deadlock, work around it by ordering lock |
@@ -1324,126 +1401,62 @@ static int link_pipe(struct pipe_inode_info *ipipe, | |||
1324 | * could deadlock (one doing tee from A -> B, the other from B -> A). | 1401 | * could deadlock (one doing tee from A -> B, the other from B -> A). |
1325 | */ | 1402 | */ |
1326 | if (ipipe->inode < opipe->inode) { | 1403 | if (ipipe->inode < opipe->inode) { |
1327 | ipipe_first = 1; | 1404 | mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_PARENT); |
1328 | mutex_lock(&ipipe->inode->i_mutex); | 1405 | mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_CHILD); |
1329 | mutex_lock(&opipe->inode->i_mutex); | ||
1330 | } else { | 1406 | } else { |
1331 | mutex_lock(&opipe->inode->i_mutex); | 1407 | mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_PARENT); |
1332 | mutex_lock(&ipipe->inode->i_mutex); | 1408 | mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_CHILD); |
1333 | } | 1409 | } |
1334 | 1410 | ||
1335 | for (i = 0;; i++) { | 1411 | do { |
1336 | if (!opipe->readers) { | 1412 | if (!opipe->readers) { |
1337 | send_sig(SIGPIPE, current, 0); | 1413 | send_sig(SIGPIPE, current, 0); |
1338 | if (!ret) | 1414 | if (!ret) |
1339 | ret = -EPIPE; | 1415 | ret = -EPIPE; |
1340 | break; | 1416 | break; |
1341 | } | 1417 | } |
1342 | if (ipipe->nrbufs - i) { | ||
1343 | ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); | ||
1344 | 1418 | ||
1345 | /* | 1419 | /* |
1346 | * If we have room, fill this buffer | 1420 | * If we have iterated all input buffers or ran out of |
1347 | */ | 1421 | * output room, break. |
1348 | if (opipe->nrbufs < PIPE_BUFFERS) { | 1422 | */ |
1349 | int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); | 1423 | if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) |
1350 | 1424 | break; | |
1351 | /* | ||
1352 | * Get a reference to this pipe buffer, | ||
1353 | * so we can copy the contents over. | ||
1354 | */ | ||
1355 | ibuf->ops->get(ipipe, ibuf); | ||
1356 | |||
1357 | obuf = opipe->bufs + nbuf; | ||
1358 | *obuf = *ibuf; | ||
1359 | |||
1360 | /* | ||
1361 | * Don't inherit the gift flag, we need to | ||
1362 | * prevent multiple steals of this page. | ||
1363 | */ | ||
1364 | obuf->flags &= ~PIPE_BUF_FLAG_GIFT; | ||
1365 | |||
1366 | if (obuf->len > len) | ||
1367 | obuf->len = len; | ||
1368 | |||
1369 | opipe->nrbufs++; | ||
1370 | do_wakeup = 1; | ||
1371 | ret += obuf->len; | ||
1372 | len -= obuf->len; | ||
1373 | |||
1374 | if (!len) | ||
1375 | break; | ||
1376 | if (opipe->nrbufs < PIPE_BUFFERS) | ||
1377 | continue; | ||
1378 | } | ||
1379 | |||
1380 | /* | ||
1381 | * We have input available, but no output room. | ||
1382 | * If we already copied data, return that. If we | ||
1383 | * need to drop the opipe lock, it must be ordered | ||
1384 | * last to avoid deadlocks. | ||
1385 | */ | ||
1386 | if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) { | ||
1387 | if (!ret) | ||
1388 | ret = -EAGAIN; | ||
1389 | break; | ||
1390 | } | ||
1391 | if (signal_pending(current)) { | ||
1392 | if (!ret) | ||
1393 | ret = -ERESTARTSYS; | ||
1394 | break; | ||
1395 | } | ||
1396 | if (do_wakeup) { | ||
1397 | smp_mb(); | ||
1398 | if (waitqueue_active(&opipe->wait)) | ||
1399 | wake_up_interruptible(&opipe->wait); | ||
1400 | kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); | ||
1401 | do_wakeup = 0; | ||
1402 | } | ||
1403 | 1425 | ||
1404 | opipe->waiting_writers++; | 1426 | ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); |
1405 | pipe_wait(opipe); | 1427 | nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); |
1406 | opipe->waiting_writers--; | ||
1407 | continue; | ||
1408 | } | ||
1409 | 1428 | ||
1410 | /* | 1429 | /* |
1411 | * No input buffers, do the usual checks for available | 1430 | * Get a reference to this pipe buffer, |
1412 | * writers and blocking and wait if necessary | 1431 | * so we can copy the contents over. |
1413 | */ | 1432 | */ |
1414 | if (!ipipe->writers) | 1433 | ibuf->ops->get(ipipe, ibuf); |
1415 | break; | 1434 | |
1416 | if (!ipipe->waiting_writers) { | 1435 | obuf = opipe->bufs + nbuf; |
1417 | if (ret) | 1436 | *obuf = *ibuf; |
1418 | break; | 1437 | |
1419 | } | ||
1420 | /* | 1438 | /* |
1421 | * pipe_wait() drops the ipipe mutex. To avoid deadlocks | 1439 | * Don't inherit the gift flag, we need to |
1422 | * with another process, we can only safely do that if | 1440 | * prevent multiple steals of this page. |
1423 | * the ipipe lock is ordered last. | ||
1424 | */ | 1441 | */ |
1425 | if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { | 1442 | obuf->flags &= ~PIPE_BUF_FLAG_GIFT; |
1426 | if (!ret) | ||
1427 | ret = -EAGAIN; | ||
1428 | break; | ||
1429 | } | ||
1430 | if (signal_pending(current)) { | ||
1431 | if (!ret) | ||
1432 | ret = -ERESTARTSYS; | ||
1433 | break; | ||
1434 | } | ||
1435 | 1443 | ||
1436 | if (waitqueue_active(&ipipe->wait)) | 1444 | if (obuf->len > len) |
1437 | wake_up_interruptible_sync(&ipipe->wait); | 1445 | obuf->len = len; |
1438 | kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT); | ||
1439 | 1446 | ||
1440 | pipe_wait(ipipe); | 1447 | opipe->nrbufs++; |
1441 | } | 1448 | ret += obuf->len; |
1449 | len -= obuf->len; | ||
1450 | i++; | ||
1451 | } while (len); | ||
1442 | 1452 | ||
1443 | mutex_unlock(&ipipe->inode->i_mutex); | 1453 | mutex_unlock(&ipipe->inode->i_mutex); |
1444 | mutex_unlock(&opipe->inode->i_mutex); | 1454 | mutex_unlock(&opipe->inode->i_mutex); |
1445 | 1455 | ||
1446 | if (do_wakeup) { | 1456 | /* |
1457 | * If we put data in the output pipe, wakeup any potential readers. | ||
1458 | */ | ||
1459 | if (ret > 0) { | ||
1447 | smp_mb(); | 1460 | smp_mb(); |
1448 | if (waitqueue_active(&opipe->wait)) | 1461 | if (waitqueue_active(&opipe->wait)) |
1449 | wake_up_interruptible(&opipe->wait); | 1462 | wake_up_interruptible(&opipe->wait); |
@@ -1464,14 +1477,29 @@ static long do_tee(struct file *in, struct file *out, size_t len, | |||
1464 | { | 1477 | { |
1465 | struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; | 1478 | struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; |
1466 | struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; | 1479 | struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; |
1480 | int ret = -EINVAL; | ||
1467 | 1481 | ||
1468 | /* | 1482 | /* |
1469 | * Link ipipe to the two output pipes, consuming as we go along. | 1483 | * Duplicate the contents of ipipe to opipe without actually |
1484 | * copying the data. | ||
1470 | */ | 1485 | */ |
1471 | if (ipipe && opipe) | 1486 | if (ipipe && opipe && ipipe != opipe) { |
1472 | return link_pipe(ipipe, opipe, len, flags); | 1487 | /* |
1488 | * Keep going, unless we encounter an error. The ipipe/opipe | ||
1489 | * ordering doesn't really matter. | ||
1490 | */ | ||
1491 | ret = link_ipipe_prep(ipipe, flags); | ||
1492 | if (!ret) { | ||
1493 | ret = link_opipe_prep(opipe, flags); | ||
1494 | if (!ret) { | ||
1495 | ret = link_pipe(ipipe, opipe, len, flags); | ||
1496 | if (!ret && (flags & SPLICE_F_NONBLOCK)) | ||
1497 | ret = -EAGAIN; | ||
1498 | } | ||
1499 | } | ||
1500 | } | ||
1473 | 1501 | ||
1474 | return -EINVAL; | 1502 | return ret; |
1475 | } | 1503 | } |
1476 | 1504 | ||
1477 | asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) | 1505 | asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) |