diff options
Diffstat (limited to 'fs/splice.c')
-rw-r--r-- | fs/splice.c | 202 |
1 files changed, 159 insertions, 43 deletions
diff --git a/fs/splice.c b/fs/splice.c index 7c2bbf18d7a7..bfa42a277bb8 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
@@ -22,7 +22,10 @@ | |||
22 | #include <linux/pipe_fs_i.h> | 22 | #include <linux/pipe_fs_i.h> |
23 | #include <linux/mm_inline.h> | 23 | #include <linux/mm_inline.h> |
24 | #include <linux/swap.h> | 24 | #include <linux/swap.h> |
25 | #include <linux/writeback.h> | ||
26 | #include <linux/buffer_head.h> | ||
25 | #include <linux/module.h> | 27 | #include <linux/module.h> |
28 | #include <linux/syscalls.h> | ||
26 | 29 | ||
27 | /* | 30 | /* |
28 | * Passed to the actors | 31 | * Passed to the actors |
@@ -34,28 +37,37 @@ struct splice_desc { | |||
34 | loff_t pos; /* file position */ | 37 | loff_t pos; /* file position */ |
35 | }; | 38 | }; |
36 | 39 | ||
40 | /* | ||
41 | * Attempt to steal a page from a pipe buffer. This should perhaps go into | ||
42 | * a vm helper function, it's already simplified quite a bit by the | ||
43 | * addition of remove_mapping(). If success is returned, the caller may | ||
44 | * attempt to reuse this page for another destination. | ||
45 | */ | ||
37 | static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, | 46 | static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, |
38 | struct pipe_buffer *buf) | 47 | struct pipe_buffer *buf) |
39 | { | 48 | { |
40 | struct page *page = buf->page; | 49 | struct page *page = buf->page; |
50 | struct address_space *mapping = page_mapping(page); | ||
41 | 51 | ||
42 | WARN_ON(!PageLocked(page)); | 52 | WARN_ON(!PageLocked(page)); |
43 | WARN_ON(!PageUptodate(page)); | 53 | WARN_ON(!PageUptodate(page)); |
44 | 54 | ||
45 | if (!remove_mapping(page_mapping(page), page)) | 55 | /* |
46 | return 1; | 56 | * At least for ext2 with nobh option, we need to wait on writeback |
57 | * completing on this page, since we'll remove it from the pagecache. | ||
58 | * Otherwise truncate wont wait on the page, allowing the disk | ||
59 | * blocks to be reused by someone else before we actually wrote our | ||
60 | * data to them. fs corruption ensues. | ||
61 | */ | ||
62 | wait_on_page_writeback(page); | ||
47 | 63 | ||
48 | if (PageLRU(page)) { | 64 | if (PagePrivate(page)) |
49 | struct zone *zone = page_zone(page); | 65 | try_to_release_page(page, mapping_gfp_mask(mapping)); |
50 | 66 | ||
51 | spin_lock_irq(&zone->lru_lock); | 67 | if (!remove_mapping(mapping, page)) |
52 | BUG_ON(!PageLRU(page)); | 68 | return 1; |
53 | __ClearPageLRU(page); | ||
54 | del_page_from_lru(zone, page); | ||
55 | spin_unlock_irq(&zone->lru_lock); | ||
56 | } | ||
57 | 69 | ||
58 | buf->stolen = 1; | 70 | buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU; |
59 | return 0; | 71 | return 0; |
60 | } | 72 | } |
61 | 73 | ||
@@ -64,7 +76,7 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info, | |||
64 | { | 76 | { |
65 | page_cache_release(buf->page); | 77 | page_cache_release(buf->page); |
66 | buf->page = NULL; | 78 | buf->page = NULL; |
67 | buf->stolen = 0; | 79 | buf->flags &= ~(PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU); |
68 | } | 80 | } |
69 | 81 | ||
70 | static void *page_cache_pipe_buf_map(struct file *file, | 82 | static void *page_cache_pipe_buf_map(struct file *file, |
@@ -91,8 +103,7 @@ static void *page_cache_pipe_buf_map(struct file *file, | |||
91 | static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, | 103 | static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, |
92 | struct pipe_buffer *buf) | 104 | struct pipe_buffer *buf) |
93 | { | 105 | { |
94 | if (!buf->stolen) | 106 | unlock_page(buf->page); |
95 | unlock_page(buf->page); | ||
96 | kunmap(buf->page); | 107 | kunmap(buf->page); |
97 | } | 108 | } |
98 | 109 | ||
@@ -104,9 +115,13 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { | |||
104 | .steal = page_cache_pipe_buf_steal, | 115 | .steal = page_cache_pipe_buf_steal, |
105 | }; | 116 | }; |
106 | 117 | ||
118 | /* | ||
119 | * Pipe output worker. This sets up our pipe format with the page cache | ||
120 | * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). | ||
121 | */ | ||
107 | static ssize_t move_to_pipe(struct inode *inode, struct page **pages, | 122 | static ssize_t move_to_pipe(struct inode *inode, struct page **pages, |
108 | int nr_pages, unsigned long offset, | 123 | int nr_pages, unsigned long offset, |
109 | unsigned long len) | 124 | unsigned long len, unsigned int flags) |
110 | { | 125 | { |
111 | struct pipe_inode_info *info; | 126 | struct pipe_inode_info *info; |
112 | int ret, do_wakeup, i; | 127 | int ret, do_wakeup, i; |
@@ -159,6 +174,12 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, | |||
159 | break; | 174 | break; |
160 | } | 175 | } |
161 | 176 | ||
177 | if (flags & SPLICE_F_NONBLOCK) { | ||
178 | if (!ret) | ||
179 | ret = -EAGAIN; | ||
180 | break; | ||
181 | } | ||
182 | |||
162 | if (signal_pending(current)) { | 183 | if (signal_pending(current)) { |
163 | if (!ret) | 184 | if (!ret) |
164 | ret = -ERESTARTSYS; | 185 | ret = -ERESTARTSYS; |
@@ -191,7 +212,7 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, | |||
191 | } | 212 | } |
192 | 213 | ||
193 | static int __generic_file_splice_read(struct file *in, struct inode *pipe, | 214 | static int __generic_file_splice_read(struct file *in, struct inode *pipe, |
194 | size_t len) | 215 | size_t len, unsigned int flags) |
195 | { | 216 | { |
196 | struct address_space *mapping = in->f_mapping; | 217 | struct address_space *mapping = in->f_mapping; |
197 | unsigned int offset, nr_pages; | 218 | unsigned int offset, nr_pages; |
@@ -231,9 +252,9 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, | |||
231 | * fill shadow[] with pages at the right locations, so we only | 252 | * fill shadow[] with pages at the right locations, so we only |
232 | * have to fill holes | 253 | * have to fill holes |
233 | */ | 254 | */ |
234 | memset(shadow, 0, i * sizeof(struct page *)); | 255 | memset(shadow, 0, nr_pages * sizeof(struct page *)); |
235 | for (j = 0, pidx = index; j < i; pidx++, j++) | 256 | for (j = 0; j < i; j++) |
236 | shadow[pages[j]->index - pidx] = pages[j]; | 257 | shadow[pages[j]->index - index] = pages[j]; |
237 | 258 | ||
238 | /* | 259 | /* |
239 | * now fill in the holes | 260 | * now fill in the holes |
@@ -279,9 +300,19 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, | |||
279 | * Now we splice them into the pipe.. | 300 | * Now we splice them into the pipe.. |
280 | */ | 301 | */ |
281 | splice_them: | 302 | splice_them: |
282 | return move_to_pipe(pipe, pages, i, offset, len); | 303 | return move_to_pipe(pipe, pages, i, offset, len, flags); |
283 | } | 304 | } |
284 | 305 | ||
306 | /** | ||
307 | * generic_file_splice_read - splice data from file to a pipe | ||
308 | * @in: file to splice from | ||
309 | * @pipe: pipe to splice to | ||
310 | * @len: number of bytes to splice | ||
311 | * @flags: splice modifier flags | ||
312 | * | ||
313 | * Will read pages from given file and fill them into a pipe. | ||
314 | * | ||
315 | */ | ||
285 | ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, | 316 | ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, |
286 | size_t len, unsigned int flags) | 317 | size_t len, unsigned int flags) |
287 | { | 318 | { |
@@ -291,7 +322,7 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, | |||
291 | ret = 0; | 322 | ret = 0; |
292 | spliced = 0; | 323 | spliced = 0; |
293 | while (len) { | 324 | while (len) { |
294 | ret = __generic_file_splice_read(in, pipe, len); | 325 | ret = __generic_file_splice_read(in, pipe, len, flags); |
295 | 326 | ||
296 | if (ret <= 0) | 327 | if (ret <= 0) |
297 | break; | 328 | break; |
@@ -299,6 +330,11 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, | |||
299 | in->f_pos += ret; | 330 | in->f_pos += ret; |
300 | len -= ret; | 331 | len -= ret; |
301 | spliced += ret; | 332 | spliced += ret; |
333 | |||
334 | if (!(flags & SPLICE_F_NONBLOCK)) | ||
335 | continue; | ||
336 | ret = -EAGAIN; | ||
337 | break; | ||
302 | } | 338 | } |
303 | 339 | ||
304 | if (spliced) | 340 | if (spliced) |
@@ -307,8 +343,11 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, | |||
307 | return ret; | 343 | return ret; |
308 | } | 344 | } |
309 | 345 | ||
346 | EXPORT_SYMBOL(generic_file_splice_read); | ||
347 | |||
310 | /* | 348 | /* |
311 | * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage(). | 349 | * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' |
350 | * using sendpage(). | ||
312 | */ | 351 | */ |
313 | static int pipe_to_sendpage(struct pipe_inode_info *info, | 352 | static int pipe_to_sendpage(struct pipe_inode_info *info, |
314 | struct pipe_buffer *buf, struct splice_desc *sd) | 353 | struct pipe_buffer *buf, struct splice_desc *sd) |
@@ -318,6 +357,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, | |||
318 | unsigned int offset; | 357 | unsigned int offset; |
319 | ssize_t ret; | 358 | ssize_t ret; |
320 | void *ptr; | 359 | void *ptr; |
360 | int more; | ||
321 | 361 | ||
322 | /* | 362 | /* |
323 | * sub-optimal, but we are limited by the pipe ->map. we don't | 363 | * sub-optimal, but we are limited by the pipe ->map. we don't |
@@ -330,9 +370,9 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, | |||
330 | return PTR_ERR(ptr); | 370 | return PTR_ERR(ptr); |
331 | 371 | ||
332 | offset = pos & ~PAGE_CACHE_MASK; | 372 | offset = pos & ~PAGE_CACHE_MASK; |
373 | more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; | ||
333 | 374 | ||
334 | ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos, | 375 | ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more); |
335 | sd->len < sd->total_len); | ||
336 | 376 | ||
337 | buf->ops->unmap(info, buf); | 377 | buf->ops->unmap(info, buf); |
338 | if (ret == sd->len) | 378 | if (ret == sd->len) |
@@ -354,16 +394,19 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, | |||
354 | * - Destination page does not exist, we can add the pipe page to | 394 | * - Destination page does not exist, we can add the pipe page to |
355 | * the page cache and avoid the copy. | 395 | * the page cache and avoid the copy. |
356 | * | 396 | * |
357 | * For now we just do the slower thing and always copy pages over, it's | 397 | * If asked to move pages to the output file (SPLICE_F_MOVE is set in |
358 | * easier than migrating pages from the pipe to the target file. For the | 398 | * sd->flags), we attempt to migrate pages from the pipe to the output |
359 | * case of doing file | file splicing, the migrate approach had some LRU | 399 | * file address space page cache. This is possible if no one else has |
360 | * nastiness... | 400 | * the pipe page referenced outside of the pipe and page cache. If |
401 | * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create | ||
402 | * a new page in the output file page cache and fill/dirty that. | ||
361 | */ | 403 | */ |
362 | static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, | 404 | static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, |
363 | struct splice_desc *sd) | 405 | struct splice_desc *sd) |
364 | { | 406 | { |
365 | struct file *file = sd->file; | 407 | struct file *file = sd->file; |
366 | struct address_space *mapping = file->f_mapping; | 408 | struct address_space *mapping = file->f_mapping; |
409 | gfp_t gfp_mask = mapping_gfp_mask(mapping); | ||
367 | unsigned int offset; | 410 | unsigned int offset; |
368 | struct page *page; | 411 | struct page *page; |
369 | pgoff_t index; | 412 | pgoff_t index; |
@@ -384,18 +427,23 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, | |||
384 | * reuse buf page, if SPLICE_F_MOVE is set | 427 | * reuse buf page, if SPLICE_F_MOVE is set |
385 | */ | 428 | */ |
386 | if (sd->flags & SPLICE_F_MOVE) { | 429 | if (sd->flags & SPLICE_F_MOVE) { |
430 | /* | ||
431 | * If steal succeeds, buf->page is now pruned from the vm | ||
432 | * side (LRU and page cache) and we can reuse it. | ||
433 | */ | ||
387 | if (buf->ops->steal(info, buf)) | 434 | if (buf->ops->steal(info, buf)) |
388 | goto find_page; | 435 | goto find_page; |
389 | 436 | ||
390 | page = buf->page; | 437 | page = buf->page; |
391 | if (add_to_page_cache_lru(page, mapping, index, | 438 | if (add_to_page_cache(page, mapping, index, gfp_mask)) |
392 | mapping_gfp_mask(mapping))) | ||
393 | goto find_page; | 439 | goto find_page; |
440 | |||
441 | if (!(buf->flags & PIPE_BUF_FLAG_LRU)) | ||
442 | lru_cache_add(page); | ||
394 | } else { | 443 | } else { |
395 | find_page: | 444 | find_page: |
396 | ret = -ENOMEM; | 445 | ret = -ENOMEM; |
397 | page = find_or_create_page(mapping, index, | 446 | page = find_or_create_page(mapping, index, gfp_mask); |
398 | mapping_gfp_mask(mapping)); | ||
399 | if (!page) | 447 | if (!page) |
400 | goto out; | 448 | goto out; |
401 | 449 | ||
@@ -432,10 +480,13 @@ find_page: | |||
432 | } | 480 | } |
433 | 481 | ||
434 | ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); | 482 | ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); |
435 | if (ret) | 483 | if (ret == AOP_TRUNCATED_PAGE) { |
484 | page_cache_release(page); | ||
485 | goto find_page; | ||
486 | } else if (ret) | ||
436 | goto out; | 487 | goto out; |
437 | 488 | ||
438 | if (!buf->stolen) { | 489 | if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { |
439 | char *dst = kmap_atomic(page, KM_USER0); | 490 | char *dst = kmap_atomic(page, KM_USER0); |
440 | 491 | ||
441 | memcpy(dst + offset, src + buf->offset, sd->len); | 492 | memcpy(dst + offset, src + buf->offset, sd->len); |
@@ -444,16 +495,18 @@ find_page: | |||
444 | } | 495 | } |
445 | 496 | ||
446 | ret = mapping->a_ops->commit_write(file, page, 0, sd->len); | 497 | ret = mapping->a_ops->commit_write(file, page, 0, sd->len); |
447 | if (ret < 0) | 498 | if (ret == AOP_TRUNCATED_PAGE) { |
499 | page_cache_release(page); | ||
500 | goto find_page; | ||
501 | } else if (ret) | ||
448 | goto out; | 502 | goto out; |
449 | 503 | ||
450 | set_page_dirty(page); | 504 | balance_dirty_pages_ratelimited(mapping); |
451 | ret = write_one_page(page, 0); | ||
452 | out: | 505 | out: |
453 | if (ret < 0) | 506 | if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { |
454 | unlock_page(page); | ||
455 | if (!buf->stolen) | ||
456 | page_cache_release(page); | 507 | page_cache_release(page); |
508 | unlock_page(page); | ||
509 | } | ||
457 | buf->ops->unmap(info, buf); | 510 | buf->ops->unmap(info, buf); |
458 | return ret; | 511 | return ret; |
459 | } | 512 | } |
@@ -461,6 +514,11 @@ out: | |||
461 | typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, | 514 | typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, |
462 | struct splice_desc *); | 515 | struct splice_desc *); |
463 | 516 | ||
517 | /* | ||
518 | * Pipe input worker. Most of this logic works like a regular pipe, the | ||
519 | * key here is the 'actor' worker passed in that actually moves the data | ||
520 | * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. | ||
521 | */ | ||
464 | static ssize_t move_from_pipe(struct inode *inode, struct file *out, | 522 | static ssize_t move_from_pipe(struct inode *inode, struct file *out, |
465 | size_t len, unsigned int flags, | 523 | size_t len, unsigned int flags, |
466 | splice_actor *actor) | 524 | splice_actor *actor) |
@@ -527,6 +585,12 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, | |||
527 | break; | 585 | break; |
528 | } | 586 | } |
529 | 587 | ||
588 | if (flags & SPLICE_F_NONBLOCK) { | ||
589 | if (!ret) | ||
590 | ret = -EAGAIN; | ||
591 | break; | ||
592 | } | ||
593 | |||
530 | if (signal_pending(current)) { | 594 | if (signal_pending(current)) { |
531 | if (!ret) | 595 | if (!ret) |
532 | ret = -ERESTARTSYS; | 596 | ret = -ERESTARTSYS; |
@@ -556,21 +620,67 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, | |||
556 | 620 | ||
557 | } | 621 | } |
558 | 622 | ||
623 | /** | ||
624 | * generic_file_splice_write - splice data from a pipe to a file | ||
625 | * @inode: pipe inode | ||
626 | * @out: file to write to | ||
627 | * @len: number of bytes to splice | ||
628 | * @flags: splice modifier flags | ||
629 | * | ||
630 | * Will either move or copy pages (determined by @flags options) from | ||
631 | * the given pipe inode to the given file. | ||
632 | * | ||
633 | */ | ||
559 | ssize_t generic_file_splice_write(struct inode *inode, struct file *out, | 634 | ssize_t generic_file_splice_write(struct inode *inode, struct file *out, |
560 | size_t len, unsigned int flags) | 635 | size_t len, unsigned int flags) |
561 | { | 636 | { |
562 | return move_from_pipe(inode, out, len, flags, pipe_to_file); | 637 | struct address_space *mapping = out->f_mapping; |
638 | ssize_t ret = move_from_pipe(inode, out, len, flags, pipe_to_file); | ||
639 | |||
640 | /* | ||
641 | * if file or inode is SYNC and we actually wrote some data, sync it | ||
642 | */ | ||
643 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) | ||
644 | && ret > 0) { | ||
645 | struct inode *inode = mapping->host; | ||
646 | int err; | ||
647 | |||
648 | mutex_lock(&inode->i_mutex); | ||
649 | err = generic_osync_inode(mapping->host, mapping, | ||
650 | OSYNC_METADATA|OSYNC_DATA); | ||
651 | mutex_unlock(&inode->i_mutex); | ||
652 | |||
653 | if (err) | ||
654 | ret = err; | ||
655 | } | ||
656 | |||
657 | return ret; | ||
563 | } | 658 | } |
564 | 659 | ||
660 | EXPORT_SYMBOL(generic_file_splice_write); | ||
661 | |||
662 | /** | ||
663 | * generic_splice_sendpage - splice data from a pipe to a socket | ||
664 | * @inode: pipe inode | ||
665 | * @out: socket to write to | ||
666 | * @len: number of bytes to splice | ||
667 | * @flags: splice modifier flags | ||
668 | * | ||
669 | * Will send @len bytes from the pipe to a network socket. No data copying | ||
670 | * is involved. | ||
671 | * | ||
672 | */ | ||
565 | ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, | 673 | ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, |
566 | size_t len, unsigned int flags) | 674 | size_t len, unsigned int flags) |
567 | { | 675 | { |
568 | return move_from_pipe(inode, out, len, flags, pipe_to_sendpage); | 676 | return move_from_pipe(inode, out, len, flags, pipe_to_sendpage); |
569 | } | 677 | } |
570 | 678 | ||
571 | EXPORT_SYMBOL(generic_file_splice_write); | 679 | EXPORT_SYMBOL(generic_splice_sendpage); |
572 | EXPORT_SYMBOL(generic_file_splice_read); | ||
573 | 680 | ||
681 | /* | ||
682 | * Attempt to initiate a splice from pipe to file. | ||
683 | */ | ||
574 | static long do_splice_from(struct inode *pipe, struct file *out, size_t len, | 684 | static long do_splice_from(struct inode *pipe, struct file *out, size_t len, |
575 | unsigned int flags) | 685 | unsigned int flags) |
576 | { | 686 | { |
@@ -591,6 +701,9 @@ static long do_splice_from(struct inode *pipe, struct file *out, size_t len, | |||
591 | return out->f_op->splice_write(pipe, out, len, flags); | 701 | return out->f_op->splice_write(pipe, out, len, flags); |
592 | } | 702 | } |
593 | 703 | ||
704 | /* | ||
705 | * Attempt to initiate a splice from a file to a pipe. | ||
706 | */ | ||
594 | static long do_splice_to(struct file *in, struct inode *pipe, size_t len, | 707 | static long do_splice_to(struct file *in, struct inode *pipe, size_t len, |
595 | unsigned int flags) | 708 | unsigned int flags) |
596 | { | 709 | { |
@@ -619,6 +732,9 @@ static long do_splice_to(struct file *in, struct inode *pipe, size_t len, | |||
619 | return in->f_op->splice_read(in, pipe, len, flags); | 732 | return in->f_op->splice_read(in, pipe, len, flags); |
620 | } | 733 | } |
621 | 734 | ||
735 | /* | ||
736 | * Determine where to splice to/from. | ||
737 | */ | ||
622 | static long do_splice(struct file *in, struct file *out, size_t len, | 738 | static long do_splice(struct file *in, struct file *out, size_t len, |
623 | unsigned int flags) | 739 | unsigned int flags) |
624 | { | 740 | { |