aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Badari Pulavarty <pbadari@us.ibm.com> 2006-10-01 02:28:49 -0400
committer Linus Torvalds <torvalds@g5.osdl.org> 2006-10-01 03:39:29 -0400
commit eed4e51fb60c3863c134a5e9f6006b29805ead97 (patch)
tree edb0a80d75c454ad77001f3bd1a87933cbcff53f
parent 543ade1fc901db4c3dbe9fb27241fb977f1f3eea (diff)
[PATCH] Add vector AIO support
This work was initially done by Zach Brown to add support for vectored aio.
These are the core changes for AIO to support IOCB_CMD_PREADV/IOCB_CMD_PWRITEV.

[akpm@osdl.org: huge build fix]
Signed-off-by: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Acked-by: Benjamin LaHaise <bcrl@kvack.org>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- fs/aio.c 171
-rw-r--r-- fs/read_write.c 129
-rw-r--r-- include/linux/aio.h 4
-rw-r--r-- include/linux/aio_abi.h 2
-rw-r--r-- include/linux/fs.h 5
5 files changed, 203 insertions, 108 deletions
diff --git a/fs/aio.c b/fs/aio.c
index 27ff56540c73..2e0d1505ee36 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -415,6 +415,7 @@ static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx)
415 req->ki_retry = NULL; 415 req->ki_retry = NULL;
416 req->ki_dtor = NULL; 416 req->ki_dtor = NULL;
417 req->private = NULL; 417 req->private = NULL;
418 req->ki_iovec = NULL;
418 INIT_LIST_HEAD(&req->ki_run_list); 419 INIT_LIST_HEAD(&req->ki_run_list);
419 420
420 /* Check if the completion queue has enough free space to 421 /* Check if the completion queue has enough free space to
@@ -460,6 +461,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
460 461
461 if (req->ki_dtor) 462 if (req->ki_dtor)
462 req->ki_dtor(req); 463 req->ki_dtor(req);
464 if (req->ki_iovec != &req->ki_inline_vec)
465 kfree(req->ki_iovec);
463 kmem_cache_free(kiocb_cachep, req); 466 kmem_cache_free(kiocb_cachep, req);
464 ctx->reqs_active--; 467 ctx->reqs_active--;
465 468
@@ -1301,69 +1304,63 @@ asmlinkage long sys_io_destroy(aio_context_t ctx)
1301 return -EINVAL; 1304 return -EINVAL;
1302} 1305}
1303 1306
1304/* 1307static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
1305 * aio_p{read,write} are the default ki_retry methods for
1306 * IO_CMD_P{READ,WRITE}. They maintains kiocb retry state around potentially
1307 * multiple calls to f_op->aio_read(). They loop around partial progress
1308 * instead of returning -EIOCBRETRY because they don't have the means to call
1309 * kick_iocb().
1310 */
1311static ssize_t aio_pread(struct kiocb *iocb)
1312{ 1308{
1313 struct file *file = iocb->ki_filp; 1309 struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg];
1314 struct address_space *mapping = file->f_mapping; 1310
1315 struct inode *inode = mapping->host; 1311 BUG_ON(ret <= 0);
1316 ssize_t ret = 0; 1312
1317 1313 while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) {
1318 do { 1314 ssize_t this = min((ssize_t)iov->iov_len, ret);
1319 iocb->ki_inline_vec.iov_base = iocb->ki_buf; 1315 iov->iov_base += this;
1320 iocb->ki_inline_vec.iov_len = iocb->ki_left; 1316 iov->iov_len -= this;
1321 1317 iocb->ki_left -= this;
1322 ret = file->f_op->aio_read(iocb, &iocb->ki_inline_vec, 1318 ret -= this;
1323 1, iocb->ki_pos); 1319 if (iov->iov_len == 0) {
1324 /* 1320 iocb->ki_cur_seg++;
1325 * Can't just depend on iocb->ki_left to determine 1321 iov++;
1326 * whether we are done. This may have been a short read.
1327 */
1328 if (ret > 0) {
1329 iocb->ki_buf += ret;
1330 iocb->ki_left -= ret;
1331 } 1322 }
1323 }
1332 1324
1333 /* 1325 /* the caller should not have done more io than what fit in
1334 * For pipes and sockets we return once we have some data; for 1326 * the remaining iovecs */
1335 * regular files we retry till we complete the entire read or 1327 BUG_ON(ret > 0 && iocb->ki_left == 0);
1336 * find that we can't read any more data (e.g short reads).
1337 */
1338 } while (ret > 0 && iocb->ki_left > 0 &&
1339 !S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode));
1340
1341 /* This means we must have transferred all that we could */
1342 /* No need to retry anymore */
1343 if ((ret == 0) || (iocb->ki_left == 0))
1344 ret = iocb->ki_nbytes - iocb->ki_left;
1345
1346 return ret;
1347} 1328}
1348 1329
1349/* see aio_pread() */ 1330static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
1350static ssize_t aio_pwrite(struct kiocb *iocb)
1351{ 1331{
1352 struct file *file = iocb->ki_filp; 1332 struct file *file = iocb->ki_filp;
1333 struct address_space *mapping = file->f_mapping;
1334 struct inode *inode = mapping->host;
1335 ssize_t (*rw_op)(struct kiocb *, const struct iovec *,
1336 unsigned long, loff_t);
1353 ssize_t ret = 0; 1337 ssize_t ret = 0;
1338 unsigned short opcode;
1339
1340 if ((iocb->ki_opcode == IOCB_CMD_PREADV) ||
1341 (iocb->ki_opcode == IOCB_CMD_PREAD)) {
1342 rw_op = file->f_op->aio_read;
1343 opcode = IOCB_CMD_PREADV;
1344 } else {
1345 rw_op = file->f_op->aio_write;
1346 opcode = IOCB_CMD_PWRITEV;
1347 }
1354 1348
1355 do { 1349 do {
1356 iocb->ki_inline_vec.iov_base = iocb->ki_buf; 1350 ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
1357 iocb->ki_inline_vec.iov_len = iocb->ki_left; 1351 iocb->ki_nr_segs - iocb->ki_cur_seg,
1358 1352 iocb->ki_pos);
1359 ret = file->f_op->aio_write(iocb, &iocb->ki_inline_vec, 1353 if (ret > 0)
1360 1, iocb->ki_pos); 1354 aio_advance_iovec(iocb, ret);
1361 if (ret > 0) { 1355
1362 iocb->ki_buf += ret; 1356 /* retry all partial writes. retry partial reads as long as its a
1363 iocb->ki_left -= ret; 1357 * regular file. */
1364 } 1358 } while (ret > 0 && iocb->ki_left > 0 &&
1365 } while (ret > 0 && iocb->ki_left > 0); 1359 (opcode == IOCB_CMD_PWRITEV ||
1360 (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
1366 1361
1362 /* This means we must have transferred all that we could */
1363 /* No need to retry anymore */
1367 if ((ret == 0) || (iocb->ki_left == 0)) 1364 if ((ret == 0) || (iocb->ki_left == 0))
1368 ret = iocb->ki_nbytes - iocb->ki_left; 1365 ret = iocb->ki_nbytes - iocb->ki_left;
1369 1366
@@ -1390,6 +1387,38 @@ static ssize_t aio_fsync(struct kiocb *iocb)
1390 return ret; 1387 return ret;
1391} 1388}
1392 1389
1390static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb)
1391{
1392 ssize_t ret;
1393
1394 ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf,
1395 kiocb->ki_nbytes, 1,
1396 &kiocb->ki_inline_vec, &kiocb->ki_iovec);
1397 if (ret < 0)
1398 goto out;
1399
1400 kiocb->ki_nr_segs = kiocb->ki_nbytes;
1401 kiocb->ki_cur_seg = 0;
1402 /* ki_nbytes/left now reflect bytes instead of segs */
1403 kiocb->ki_nbytes = ret;
1404 kiocb->ki_left = ret;
1405
1406 ret = 0;
1407out:
1408 return ret;
1409}
1410
1411static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
1412{
1413 kiocb->ki_iovec = &kiocb->ki_inline_vec;
1414 kiocb->ki_iovec->iov_base = kiocb->ki_buf;
1415 kiocb->ki_iovec->iov_len = kiocb->ki_left;
1416 kiocb->ki_nr_segs = 1;
1417 kiocb->ki_cur_seg = 0;
1418 kiocb->ki_nbytes = kiocb->ki_left;
1419 return 0;
1420}
1421
1393/* 1422/*
1394 * aio_setup_iocb: 1423 * aio_setup_iocb:
1395 * Performs the initial checks and aio retry method 1424 * Performs the initial checks and aio retry method
@@ -1412,9 +1441,12 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1412 ret = security_file_permission(file, MAY_READ); 1441 ret = security_file_permission(file, MAY_READ);
1413 if (unlikely(ret)) 1442 if (unlikely(ret))
1414 break; 1443 break;
1444 ret = aio_setup_single_vector(kiocb);
1445 if (ret)
1446 break;
1415 ret = -EINVAL; 1447 ret = -EINVAL;
1416 if (file->f_op->aio_read) 1448 if (file->f_op->aio_read)
1417 kiocb->ki_retry = aio_pread; 1449 kiocb->ki_retry = aio_rw_vect_retry;
1418 break; 1450 break;
1419 case IOCB_CMD_PWRITE: 1451 case IOCB_CMD_PWRITE:
1420 ret = -EBADF; 1452 ret = -EBADF;
@@ -1427,9 +1459,40 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1427 ret = security_file_permission(file, MAY_WRITE); 1459 ret = security_file_permission(file, MAY_WRITE);
1428 if (unlikely(ret)) 1460 if (unlikely(ret))
1429 break; 1461 break;
1462 ret = aio_setup_single_vector(kiocb);
1463 if (ret)
1464 break;
1465 ret = -EINVAL;
1466 if (file->f_op->aio_write)
1467 kiocb->ki_retry = aio_rw_vect_retry;
1468 break;
1469 case IOCB_CMD_PREADV:
1470 ret = -EBADF;
1471 if (unlikely(!(file->f_mode & FMODE_READ)))
1472 break;
1473 ret = security_file_permission(file, MAY_READ);
1474 if (unlikely(ret))
1475 break;
1476 ret = aio_setup_vectored_rw(READ, kiocb);
1477 if (ret)
1478 break;
1479 ret = -EINVAL;
1480 if (file->f_op->aio_read)
1481 kiocb->ki_retry = aio_rw_vect_retry;
1482 break;
1483 case IOCB_CMD_PWRITEV:
1484 ret = -EBADF;
1485 if (unlikely(!(file->f_mode & FMODE_WRITE)))
1486 break;
1487 ret = security_file_permission(file, MAY_WRITE);
1488 if (unlikely(ret))
1489 break;
1490 ret = aio_setup_vectored_rw(WRITE, kiocb);
1491 if (ret)
1492 break;
1430 ret = -EINVAL; 1493 ret = -EINVAL;
1431 if (file->f_op->aio_write) 1494 if (file->f_op->aio_write)
1432 kiocb->ki_retry = aio_pwrite; 1495 kiocb->ki_retry = aio_rw_vect_retry;
1433 break; 1496 break;
1434 case IOCB_CMD_FDSYNC: 1497 case IOCB_CMD_FDSYNC:
1435 ret = -EINVAL; 1498 ret = -EINVAL;
diff --git a/fs/read_write.c b/fs/read_write.c
index 4ed839bcb91c..f792000a28e6 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -511,6 +511,74 @@ ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
511/* A write operation does a read from user space and vice versa */ 511/* A write operation does a read from user space and vice versa */
512#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) 512#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
513 513
514ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
515 unsigned long nr_segs, unsigned long fast_segs,
516 struct iovec *fast_pointer,
517 struct iovec **ret_pointer)
518 {
519 unsigned long seg;
520 ssize_t ret;
521 struct iovec *iov = fast_pointer;
522
523 /*
524 * SuS says "The readv() function *may* fail if the iovcnt argument
525 * was less than or equal to 0, or greater than {IOV_MAX}." Linux has
526 * traditionally returned zero for zero segments, so...
527 */
528 if (nr_segs == 0) {
529 ret = 0;
530 goto out;
531 }
532
533 /*
534 * First get the "struct iovec" from user memory and
535 * verify all the pointers
536 */
537 if (nr_segs > UIO_MAXIOV) {
538 ret = -EINVAL;
539 goto out;
540 }
541 if (nr_segs > fast_segs) {
542 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
543 if (iov == NULL) {
544 ret = -ENOMEM;
545 goto out;
546 }
547 }
548 if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
549 ret = -EFAULT;
550 goto out;
551 }
552
553 /*
554 * According to the Single Unix Specification we should return EINVAL
555 * if an element length is < 0 when cast to ssize_t or if the
556 * total length would overflow the ssize_t return value of the
557 * system call.
558 */
559 ret = 0;
560 for (seg = 0; seg < nr_segs; seg++) {
561 void __user *buf = iov[seg].iov_base;
562 ssize_t len = (ssize_t)iov[seg].iov_len;
563
564 /* see if we're about to use an invalid len or if
565 * it's about to overflow ssize_t */
566 if (len < 0 || (ret + len < ret)) {
567 ret = -EINVAL;
568 goto out;
569 }
570 if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
571 ret = -EFAULT;
572 goto out;
573 }
574
575 ret += len;
576 }
577out:
578 *ret_pointer = iov;
579 return ret;
580}
581
514static ssize_t do_readv_writev(int type, struct file *file, 582static ssize_t do_readv_writev(int type, struct file *file,
515 const struct iovec __user * uvector, 583 const struct iovec __user * uvector,
516 unsigned long nr_segs, loff_t *pos) 584 unsigned long nr_segs, loff_t *pos)
@@ -519,64 +587,20 @@ static ssize_t do_readv_writev(int type, struct file *file,
519 struct iovec iovstack[UIO_FASTIOV]; 587 struct iovec iovstack[UIO_FASTIOV];
520 struct iovec *iov = iovstack; 588 struct iovec *iov = iovstack;
521 ssize_t ret; 589 ssize_t ret;
522 int seg;
523 io_fn_t fn; 590 io_fn_t fn;
524 iov_fn_t fnv; 591 iov_fn_t fnv;
525 592
526 /* 593 if (!file->f_op) {
527 * SuS says "The readv() function *may* fail if the iovcnt argument 594 ret = -EINVAL;
528 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
529 * traditionally returned zero for zero segments, so...
530 */
531 ret = 0;
532 if (nr_segs == 0)
533 goto out;
534
535 /*
536 * First get the "struct iovec" from user memory and
537 * verify all the pointers
538 */
539 ret = -EINVAL;
540 if (nr_segs > UIO_MAXIOV)
541 goto out;
542 if (!file->f_op)
543 goto out;
544 if (nr_segs > UIO_FASTIOV) {
545 ret = -ENOMEM;
546 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
547 if (!iov)
548 goto out;
549 }
550 ret = -EFAULT;
551 if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector)))
552 goto out; 595 goto out;
596 }
553 597
554 /* 598 ret = rw_copy_check_uvector(type, uvector, nr_segs,
555 * Single unix specification: 599 ARRAY_SIZE(iovstack), iovstack, &iov);
556 * We should -EINVAL if an element length is not >= 0 and fitting an 600 if (ret <= 0)
557 * ssize_t. The total length is fitting an ssize_t
558 *
559 * Be careful here because iov_len is a size_t not an ssize_t
560 */
561 tot_len = 0;
562 ret = -EINVAL;
563 for (seg = 0; seg < nr_segs; seg++) {
564 void __user *buf = iov[seg].iov_base;
565 ssize_t len = (ssize_t)iov[seg].iov_len;
566
567 if (len < 0) /* size_t not fitting an ssize_t .. */
568 goto out;
569 if (unlikely(!access_ok(vrfy_dir(type), buf, len)))
570 goto Efault;
571 tot_len += len;
572 if ((ssize_t)tot_len < 0) /* maths overflow on the ssize_t */
573 goto out;
574 }
575 if (tot_len == 0) {
576 ret = 0;
577 goto out; 601 goto out;
578 }
579 602
603 tot_len = ret;
580 ret = rw_verify_area(type, file, pos, tot_len); 604 ret = rw_verify_area(type, file, pos, tot_len);
581 if (ret < 0) 605 if (ret < 0)
582 goto out; 606 goto out;
@@ -609,9 +633,6 @@ out:
609 fsnotify_modify(file->f_dentry); 633 fsnotify_modify(file->f_dentry);
610 } 634 }
611 return ret; 635 return ret;
612Efault:
613 ret = -EFAULT;
614 goto out;
615} 636}
616 637
617ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, 638ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 58349e58b749..5722568fc71e 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -7,6 +7,7 @@
7#include <linux/uio.h> 7#include <linux/uio.h>
8 8
9#include <asm/atomic.h> 9#include <asm/atomic.h>
10#include <linux/uio.h>
10 11
11#define AIO_MAXSEGS 4 12#define AIO_MAXSEGS 4
12#define AIO_KIOGRP_NR_ATOMIC 8 13#define AIO_KIOGRP_NR_ATOMIC 8
@@ -114,6 +115,9 @@ struct kiocb {
114 long ki_kicked; /* just for testing */ 115 long ki_kicked; /* just for testing */
115 long ki_queued; /* just for testing */ 116 long ki_queued; /* just for testing */
116 struct iovec ki_inline_vec; /* inline vector */ 117 struct iovec ki_inline_vec; /* inline vector */
118 struct iovec *ki_iovec;
119 unsigned long ki_nr_segs;
120 unsigned long ki_cur_seg;
117 121
118 struct list_head ki_list; /* the aio core uses this 122 struct list_head ki_list; /* the aio core uses this
119 * for cancellation */ 123 * for cancellation */
diff --git a/include/linux/aio_abi.h b/include/linux/aio_abi.h
index 30fdcc89d142..3466b1d0ffd2 100644
--- a/include/linux/aio_abi.h
+++ b/include/linux/aio_abi.h
@@ -41,6 +41,8 @@ enum {
41 * IOCB_CMD_POLL = 5, 41 * IOCB_CMD_POLL = 5,
42 */ 42 */
43 IOCB_CMD_NOOP = 6, 43 IOCB_CMD_NOOP = 6,
44 IOCB_CMD_PREADV = 7,
45 IOCB_CMD_PWRITEV = 8,
44}; 46};
45 47
46/* read() from /dev/aio returns these structures. */ 48/* read() from /dev/aio returns these structures. */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 011129f8803e..4bb70871873f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1150,6 +1150,11 @@ struct inode_operations {
1150 1150
1151struct seq_file; 1151struct seq_file;
1152 1152
1153ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
1154 unsigned long nr_segs, unsigned long fast_segs,
1155 struct iovec *fast_pointer,
1156 struct iovec **ret_pointer);
1157
1153extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); 1158extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
1154extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); 1159extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
1155extern ssize_t vfs_readv(struct file *, const struct iovec __user *, 1160extern ssize_t vfs_readv(struct file *, const struct iovec __user *,