about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@g5.osdl.org> 2006-04-02 17:22:06 -0400
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-04-02 17:22:06 -0400
commit: d69636157ad8fa6bb9fff452cefb34fcace851b5 (patch)
tree: dc9c0cc41150042224a367ffcab77cf7d315e3a7
parent: 1810b6cb162e0c19e0ecbbacbcfd66f578f335ec (diff)
parent: 3e7ee3e7b36fa4e2d88d8fb0a2577be95fc4636d (diff)
Merge branch 'splice' of git://brick.kernel.dk/data/git/linux-2.6-block
* 'splice' of git://brick.kernel.dk/data/git/linux-2.6-block:
  [PATCH] splice: fix page stealing LRU handling.
  [PATCH] splice: page stealing needs to wait_on_page_writeback()
  [PATCH] splice: export generic_splice_sendpage
  [PATCH] splice: add a SPLICE_F_MORE flag
  [PATCH] splice: add comments documenting more of the code
  [PATCH] splice: improve writeback and clean up page stealing
  [PATCH] splice: fix shadow[] filling logic
-rw-r--r-- fs/pipe.c 4
-rw-r--r-- fs/splice.c 177
-rw-r--r-- include/linux/pipe_fs_i.h 6
3 files changed, 146 insertions, 41 deletions
diff --git a/fs/pipe.c b/fs/pipe.c
index 109a102c150d..795df987cd38 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -95,6 +95,8 @@ static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buff
95{ 95{
96 struct page *page = buf->page; 96 struct page *page = buf->page;
97 97
98 buf->flags &= ~PIPE_BUF_FLAG_STOLEN;
99
98 /* 100 /*
99 * If nobody else uses this page, and we don't already have a 101 * If nobody else uses this page, and we don't already have a
100 * temporary page, let's keep track of it as a one-deep 102 * temporary page, let's keep track of it as a one-deep
@@ -124,7 +126,7 @@ static void anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer
124static int anon_pipe_buf_steal(struct pipe_inode_info *info, 126static int anon_pipe_buf_steal(struct pipe_inode_info *info,
125 struct pipe_buffer *buf) 127 struct pipe_buffer *buf)
126{ 128{
127 buf->stolen = 1; 129 buf->flags |= PIPE_BUF_FLAG_STOLEN;
128 return 0; 130 return 0;
129} 131}
130 132
diff --git a/fs/splice.c b/fs/splice.c
index 6081cf7d2d1b..bfa42a277bb8 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -22,7 +22,10 @@
22#include <linux/pipe_fs_i.h> 22#include <linux/pipe_fs_i.h>
23#include <linux/mm_inline.h> 23#include <linux/mm_inline.h>
24#include <linux/swap.h> 24#include <linux/swap.h>
25#include <linux/writeback.h>
26#include <linux/buffer_head.h>
25#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/syscalls.h>
26 29
27/* 30/*
28 * Passed to the actors 31 * Passed to the actors
@@ -34,28 +37,37 @@ struct splice_desc {
34 loff_t pos; /* file position */ 37 loff_t pos; /* file position */
35}; 38};
36 39
40/*
41 * Attempt to steal a page from a pipe buffer. This should perhaps go into
42 * a vm helper function, it's already simplified quite a bit by the
43 * addition of remove_mapping(). If success is returned, the caller may
44 * attempt to reuse this page for another destination.
45 */
37static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, 46static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
38 struct pipe_buffer *buf) 47 struct pipe_buffer *buf)
39{ 48{
40 struct page *page = buf->page; 49 struct page *page = buf->page;
50 struct address_space *mapping = page_mapping(page);
41 51
42 WARN_ON(!PageLocked(page)); 52 WARN_ON(!PageLocked(page));
43 WARN_ON(!PageUptodate(page)); 53 WARN_ON(!PageUptodate(page));
44 54
45 if (!remove_mapping(page_mapping(page), page)) 55 /*
46 return 1; 56 * At least for ext2 with nobh option, we need to wait on writeback
57 * completing on this page, since we'll remove it from the pagecache.
58 * Otherwise truncate wont wait on the page, allowing the disk
59 * blocks to be reused by someone else before we actually wrote our
60 * data to them. fs corruption ensues.
61 */
62 wait_on_page_writeback(page);
47 63
48 if (PageLRU(page)) { 64 if (PagePrivate(page))
49 struct zone *zone = page_zone(page); 65 try_to_release_page(page, mapping_gfp_mask(mapping));
50 66
51 spin_lock_irq(&zone->lru_lock); 67 if (!remove_mapping(mapping, page))
52 BUG_ON(!PageLRU(page)); 68 return 1;
53 __ClearPageLRU(page);
54 del_page_from_lru(zone, page);
55 spin_unlock_irq(&zone->lru_lock);
56 }
57 69
58 buf->stolen = 1; 70 buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU;
59 return 0; 71 return 0;
60} 72}
61 73
@@ -64,7 +76,7 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
64{ 76{
65 page_cache_release(buf->page); 77 page_cache_release(buf->page);
66 buf->page = NULL; 78 buf->page = NULL;
67 buf->stolen = 0; 79 buf->flags &= ~(PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU);
68} 80}
69 81
70static void *page_cache_pipe_buf_map(struct file *file, 82static void *page_cache_pipe_buf_map(struct file *file,
@@ -91,8 +103,7 @@ static void *page_cache_pipe_buf_map(struct file *file,
91static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, 103static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
92 struct pipe_buffer *buf) 104 struct pipe_buffer *buf)
93{ 105{
94 if (!buf->stolen) 106 unlock_page(buf->page);
95 unlock_page(buf->page);
96 kunmap(buf->page); 107 kunmap(buf->page);
97} 108}
98 109
@@ -104,6 +115,10 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = {
104 .steal = page_cache_pipe_buf_steal, 115 .steal = page_cache_pipe_buf_steal,
105}; 116};
106 117
118/*
119 * Pipe output worker. This sets up our pipe format with the page cache
120 * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
121 */
107static ssize_t move_to_pipe(struct inode *inode, struct page **pages, 122static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
108 int nr_pages, unsigned long offset, 123 int nr_pages, unsigned long offset,
109 unsigned long len, unsigned int flags) 124 unsigned long len, unsigned int flags)
@@ -237,9 +252,9 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe,
237 * fill shadow[] with pages at the right locations, so we only 252 * fill shadow[] with pages at the right locations, so we only
238 * have to fill holes 253 * have to fill holes
239 */ 254 */
240 memset(shadow, 0, i * sizeof(struct page *)); 255 memset(shadow, 0, nr_pages * sizeof(struct page *));
241 for (j = 0, pidx = index; j < i; pidx++, j++) 256 for (j = 0; j < i; j++)
242 shadow[pages[j]->index - pidx] = pages[j]; 257 shadow[pages[j]->index - index] = pages[j];
243 258
244 /* 259 /*
245 * now fill in the holes 260 * now fill in the holes
@@ -288,6 +303,16 @@ splice_them:
288 return move_to_pipe(pipe, pages, i, offset, len, flags); 303 return move_to_pipe(pipe, pages, i, offset, len, flags);
289} 304}
290 305
306/**
307 * generic_file_splice_read - splice data from file to a pipe
308 * @in: file to splice from
309 * @pipe: pipe to splice to
310 * @len: number of bytes to splice
311 * @flags: splice modifier flags
312 *
313 * Will read pages from given file and fill them into a pipe.
314 *
315 */
291ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, 316ssize_t generic_file_splice_read(struct file *in, struct inode *pipe,
292 size_t len, unsigned int flags) 317 size_t len, unsigned int flags)
293{ 318{
@@ -318,8 +343,11 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe,
318 return ret; 343 return ret;
319} 344}
320 345
346EXPORT_SYMBOL(generic_file_splice_read);
347
321/* 348/*
322 * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage(). 349 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
350 * using sendpage().
323 */ 351 */
324static int pipe_to_sendpage(struct pipe_inode_info *info, 352static int pipe_to_sendpage(struct pipe_inode_info *info,
325 struct pipe_buffer *buf, struct splice_desc *sd) 353 struct pipe_buffer *buf, struct splice_desc *sd)
@@ -329,6 +357,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *info,
329 unsigned int offset; 357 unsigned int offset;
330 ssize_t ret; 358 ssize_t ret;
331 void *ptr; 359 void *ptr;
360 int more;
332 361
333 /* 362 /*
334 * sub-optimal, but we are limited by the pipe ->map. we don't 363 * sub-optimal, but we are limited by the pipe ->map. we don't
@@ -341,9 +370,9 @@ static int pipe_to_sendpage(struct pipe_inode_info *info,
341 return PTR_ERR(ptr); 370 return PTR_ERR(ptr);
342 371
343 offset = pos & ~PAGE_CACHE_MASK; 372 offset = pos & ~PAGE_CACHE_MASK;
373 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
344 374
345 ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos, 375 ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more);
346 sd->len < sd->total_len);
347 376
348 buf->ops->unmap(info, buf); 377 buf->ops->unmap(info, buf);
349 if (ret == sd->len) 378 if (ret == sd->len)
@@ -365,16 +394,19 @@ static int pipe_to_sendpage(struct pipe_inode_info *info,
365 * - Destination page does not exist, we can add the pipe page to 394 * - Destination page does not exist, we can add the pipe page to
366 * the page cache and avoid the copy. 395 * the page cache and avoid the copy.
367 * 396 *
368 * For now we just do the slower thing and always copy pages over, it's 397 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
369 * easier than migrating pages from the pipe to the target file. For the 398 * sd->flags), we attempt to migrate pages from the pipe to the output
370 * case of doing file | file splicing, the migrate approach had some LRU 399 * file address space page cache. This is possible if no one else has
371 * nastiness... 400 * the pipe page referenced outside of the pipe and page cache. If
401 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
402 * a new page in the output file page cache and fill/dirty that.
372 */ 403 */
373static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, 404static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
374 struct splice_desc *sd) 405 struct splice_desc *sd)
375{ 406{
376 struct file *file = sd->file; 407 struct file *file = sd->file;
377 struct address_space *mapping = file->f_mapping; 408 struct address_space *mapping = file->f_mapping;
409 gfp_t gfp_mask = mapping_gfp_mask(mapping);
378 unsigned int offset; 410 unsigned int offset;
379 struct page *page; 411 struct page *page;
380 pgoff_t index; 412 pgoff_t index;
@@ -395,18 +427,23 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
395 * reuse buf page, if SPLICE_F_MOVE is set 427 * reuse buf page, if SPLICE_F_MOVE is set
396 */ 428 */
397 if (sd->flags & SPLICE_F_MOVE) { 429 if (sd->flags & SPLICE_F_MOVE) {
430 /*
431 * If steal succeeds, buf->page is now pruned from the vm
432 * side (LRU and page cache) and we can reuse it.
433 */
398 if (buf->ops->steal(info, buf)) 434 if (buf->ops->steal(info, buf))
399 goto find_page; 435 goto find_page;
400 436
401 page = buf->page; 437 page = buf->page;
402 if (add_to_page_cache_lru(page, mapping, index, 438 if (add_to_page_cache(page, mapping, index, gfp_mask))
403 mapping_gfp_mask(mapping)))
404 goto find_page; 439 goto find_page;
440
441 if (!(buf->flags & PIPE_BUF_FLAG_LRU))
442 lru_cache_add(page);
405 } else { 443 } else {
406find_page: 444find_page:
407 ret = -ENOMEM; 445 ret = -ENOMEM;
408 page = find_or_create_page(mapping, index, 446 page = find_or_create_page(mapping, index, gfp_mask);
409 mapping_gfp_mask(mapping));
410 if (!page) 447 if (!page)
411 goto out; 448 goto out;
412 449
@@ -443,10 +480,13 @@ find_page:
443 } 480 }
444 481
445 ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); 482 ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
446 if (ret) 483 if (ret == AOP_TRUNCATED_PAGE) {
484 page_cache_release(page);
485 goto find_page;
486 } else if (ret)
447 goto out; 487 goto out;
448 488
449 if (!buf->stolen) { 489 if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) {
450 char *dst = kmap_atomic(page, KM_USER0); 490 char *dst = kmap_atomic(page, KM_USER0);
451 491
452 memcpy(dst + offset, src + buf->offset, sd->len); 492 memcpy(dst + offset, src + buf->offset, sd->len);
@@ -455,16 +495,18 @@ find_page:
455 } 495 }
456 496
457 ret = mapping->a_ops->commit_write(file, page, 0, sd->len); 497 ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
458 if (ret < 0) 498 if (ret == AOP_TRUNCATED_PAGE) {
499 page_cache_release(page);
500 goto find_page;
501 } else if (ret)
459 goto out; 502 goto out;
460 503
461 set_page_dirty(page); 504 balance_dirty_pages_ratelimited(mapping);
462 ret = write_one_page(page, 0);
463out: 505out:
464 if (ret < 0) 506 if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) {
465 unlock_page(page);
466 if (!buf->stolen)
467 page_cache_release(page); 507 page_cache_release(page);
508 unlock_page(page);
509 }
468 buf->ops->unmap(info, buf); 510 buf->ops->unmap(info, buf);
469 return ret; 511 return ret;
470} 512}
@@ -472,6 +514,11 @@ out:
472typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, 514typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
473 struct splice_desc *); 515 struct splice_desc *);
474 516
517/*
518 * Pipe input worker. Most of this logic works like a regular pipe, the
519 * key here is the 'actor' worker passed in that actually moves the data
520 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
521 */
475static ssize_t move_from_pipe(struct inode *inode, struct file *out, 522static ssize_t move_from_pipe(struct inode *inode, struct file *out,
476 size_t len, unsigned int flags, 523 size_t len, unsigned int flags,
477 splice_actor *actor) 524 splice_actor *actor)
@@ -573,21 +620,67 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out,
573 620
574} 621}
575 622
623/**
624 * generic_file_splice_write - splice data from a pipe to a file
625 * @inode: pipe inode
626 * @out: file to write to
627 * @len: number of bytes to splice
628 * @flags: splice modifier flags
629 *
630 * Will either move or copy pages (determined by @flags options) from
631 * the given pipe inode to the given file.
632 *
633 */
576ssize_t generic_file_splice_write(struct inode *inode, struct file *out, 634ssize_t generic_file_splice_write(struct inode *inode, struct file *out,
577 size_t len, unsigned int flags) 635 size_t len, unsigned int flags)
578{ 636{
579 return move_from_pipe(inode, out, len, flags, pipe_to_file); 637 struct address_space *mapping = out->f_mapping;
638 ssize_t ret = move_from_pipe(inode, out, len, flags, pipe_to_file);
639
640 /*
641 * if file or inode is SYNC and we actually wrote some data, sync it
642 */
643 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host))
644 && ret > 0) {
645 struct inode *inode = mapping->host;
646 int err;
647
648 mutex_lock(&inode->i_mutex);
649 err = generic_osync_inode(mapping->host, mapping,
650 OSYNC_METADATA|OSYNC_DATA);
651 mutex_unlock(&inode->i_mutex);
652
653 if (err)
654 ret = err;
655 }
656
657 return ret;
580} 658}
581 659
660EXPORT_SYMBOL(generic_file_splice_write);
661
662/**
663 * generic_splice_sendpage - splice data from a pipe to a socket
664 * @inode: pipe inode
665 * @out: socket to write to
666 * @len: number of bytes to splice
667 * @flags: splice modifier flags
668 *
669 * Will send @len bytes from the pipe to a network socket. No data copying
670 * is involved.
671 *
672 */
582ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, 673ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
583 size_t len, unsigned int flags) 674 size_t len, unsigned int flags)
584{ 675{
585 return move_from_pipe(inode, out, len, flags, pipe_to_sendpage); 676 return move_from_pipe(inode, out, len, flags, pipe_to_sendpage);
586} 677}
587 678
588EXPORT_SYMBOL(generic_file_splice_write); 679EXPORT_SYMBOL(generic_splice_sendpage);
589EXPORT_SYMBOL(generic_file_splice_read);
590 680
681/*
682 * Attempt to initiate a splice from pipe to file.
683 */
591static long do_splice_from(struct inode *pipe, struct file *out, size_t len, 684static long do_splice_from(struct inode *pipe, struct file *out, size_t len,
592 unsigned int flags) 685 unsigned int flags)
593{ 686{
@@ -608,6 +701,9 @@ static long do_splice_from(struct inode *pipe, struct file *out, size_t len,
608 return out->f_op->splice_write(pipe, out, len, flags); 701 return out->f_op->splice_write(pipe, out, len, flags);
609} 702}
610 703
704/*
705 * Attempt to initiate a splice from a file to a pipe.
706 */
611static long do_splice_to(struct file *in, struct inode *pipe, size_t len, 707static long do_splice_to(struct file *in, struct inode *pipe, size_t len,
612 unsigned int flags) 708 unsigned int flags)
613{ 709{
@@ -636,6 +732,9 @@ static long do_splice_to(struct file *in, struct inode *pipe, size_t len,
636 return in->f_op->splice_read(in, pipe, len, flags); 732 return in->f_op->splice_read(in, pipe, len, flags);
637} 733}
638 734
735/*
736 * Determine where to splice to/from.
737 */
639static long do_splice(struct file *in, struct file *out, size_t len, 738static long do_splice(struct file *in, struct file *out, size_t len,
640 unsigned int flags) 739 unsigned int flags)
641{ 740{
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index d218fc729319..ec384958d509 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -5,11 +5,14 @@
5 5
6#define PIPE_BUFFERS (16) 6#define PIPE_BUFFERS (16)
7 7
8#define PIPE_BUF_FLAG_STOLEN 0x01
9#define PIPE_BUF_FLAG_LRU 0x02
10
8struct pipe_buffer { 11struct pipe_buffer {
9 struct page *page; 12 struct page *page;
10 unsigned int offset, len; 13 unsigned int offset, len;
11 struct pipe_buf_operations *ops; 14 struct pipe_buf_operations *ops;
12 unsigned int stolen; 15 unsigned int flags;
13}; 16};
14 17
15struct pipe_buf_operations { 18struct pipe_buf_operations {
@@ -63,5 +66,6 @@ void free_pipe_info(struct inode* inode);
63#define SPLICE_F_NONBLOCK (0x02) /* don't block on the pipe splicing (but */ 66#define SPLICE_F_NONBLOCK (0x02) /* don't block on the pipe splicing (but */
64 /* we may still block on the fd we splice */ 67 /* we may still block on the fd we splice */
65 /* from/to, of course */ 68 /* from/to, of course */
69#define SPLICE_F_MORE (0x04) /* expect more data */
66 70
67#endif 71#endif