Diffstat (limited to 'fs/splice.c')
 fs/splice.c | 529 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 409 insertions(+), 120 deletions(-)
diff --git a/fs/splice.c b/fs/splice.c
index 8d57e89924a6..447ebc0a37f3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -27,15 +27,22 @@
 #include <linux/buffer_head.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/uio.h>
+
+struct partial_page {
+        unsigned int offset;
+        unsigned int len;
+};
 
 /*
- * Passed to the actors
+ * Passed to splice_to_pipe
  */
-struct splice_desc {
-        unsigned int len, total_len;    /* current and remaining length */
+struct splice_pipe_desc {
+        struct page **pages;            /* page map */
+        struct partial_page *partial;   /* pages[] may not be contig */
+        int nr_pages;                   /* number of pages in map */
         unsigned int flags;             /* splice flags */
-        struct file *file;              /* file to read/write */
-        loff_t pos;                     /* file position */
+        struct pipe_buf_operations *ops;/* ops associated with output pipe */
 };
 
 /*
@@ -50,7 +57,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
         struct page *page = buf->page;
         struct address_space *mapping = page_mapping(page);
 
-        WARN_ON(!PageLocked(page));
+        lock_page(page);
+
         WARN_ON(!PageUptodate(page));
 
         /*
@@ -65,8 +73,10 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
         if (PagePrivate(page))
                 try_to_release_page(page, mapping_gfp_mask(mapping));
 
-        if (!remove_mapping(mapping, page))
+        if (!remove_mapping(mapping, page)) {
+                unlock_page(page);
                 return 1;
+        }
 
         buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU;
         return 0;
@@ -125,6 +135,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
         kunmap(buf->page);
 }
 
+static void *user_page_pipe_buf_map(struct file *file,
+                                    struct pipe_inode_info *pipe,
+                                    struct pipe_buffer *buf)
+{
+        return kmap(buf->page);
+}
+
+static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
+                                     struct pipe_buffer *buf)
+{
+        kunmap(buf->page);
+}
+
 static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
                                     struct pipe_buffer *buf)
 {
@@ -140,19 +163,33 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = {
         .get = page_cache_pipe_buf_get,
 };
 
+static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
+                                    struct pipe_buffer *buf)
+{
+        return 1;
+}
+
+static struct pipe_buf_operations user_page_pipe_buf_ops = {
+        .can_merge = 0,
+        .map = user_page_pipe_buf_map,
+        .unmap = user_page_pipe_buf_unmap,
+        .release = page_cache_pipe_buf_release,
+        .steal = user_page_pipe_buf_steal,
+        .get = page_cache_pipe_buf_get,
+};
+
 /*
  * Pipe output worker. This sets up our pipe format with the page cache
  * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
  */
-static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
-                            int nr_pages, unsigned long offset,
-                            unsigned long len, unsigned int flags)
+static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
+                              struct splice_pipe_desc *spd)
 {
-        int ret, do_wakeup, i;
+        int ret, do_wakeup, page_nr;
 
         ret = 0;
         do_wakeup = 0;
-        i = 0;
+        page_nr = 0;
 
         if (pipe->inode)
                 mutex_lock(&pipe->inode->i_mutex);
@@ -168,27 +205,19 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
                 if (pipe->nrbufs < PIPE_BUFFERS) {
                         int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
                         struct pipe_buffer *buf = pipe->bufs + newbuf;
-                        struct page *page = pages[i++];
-                        unsigned long this_len;
-
-                        this_len = PAGE_CACHE_SIZE - offset;
-                        if (this_len > len)
-                                this_len = len;
 
-                        buf->page = page;
-                        buf->offset = offset;
-                        buf->len = this_len;
-                        buf->ops = &page_cache_pipe_buf_ops;
+                        buf->page = spd->pages[page_nr];
+                        buf->offset = spd->partial[page_nr].offset;
+                        buf->len = spd->partial[page_nr].len;
+                        buf->ops = spd->ops;
                         pipe->nrbufs++;
+                        page_nr++;
+                        ret += buf->len;
+
                         if (pipe->inode)
                                 do_wakeup = 1;
 
-                        ret += this_len;
-                        len -= this_len;
-                        offset = 0;
-                        if (!--nr_pages)
-                                break;
-                        if (!len)
+                        if (!--spd->nr_pages)
                                 break;
                         if (pipe->nrbufs < PIPE_BUFFERS)
                                 continue;
@@ -196,7 +225,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
                         break;
                 }
 
-                if (flags & SPLICE_F_NONBLOCK) {
+                if (spd->flags & SPLICE_F_NONBLOCK) {
                         if (!ret)
                                 ret = -EAGAIN;
                         break;
@@ -231,8 +260,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
         }
 
-        while (i < nr_pages)
-                page_cache_release(pages[i++]);
+        while (page_nr < spd->nr_pages)
+                page_cache_release(spd->pages[page_nr++]);
 
         return ret;
 }
@@ -243,15 +272,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
                            unsigned int flags)
 {
         struct address_space *mapping = in->f_mapping;
-        unsigned int offset, nr_pages;
+        unsigned int loff, nr_pages;
         struct page *pages[PIPE_BUFFERS];
+        struct partial_page partial[PIPE_BUFFERS];
         struct page *page;
-        pgoff_t index;
-        int i, error;
+        pgoff_t index, end_index;
+        loff_t isize;
+        size_t total_len;
+        int error;
+        struct splice_pipe_desc spd = {
+                .pages = pages,
+                .partial = partial,
+                .flags = flags,
+                .ops = &page_cache_pipe_buf_ops,
+        };
 
         index = *ppos >> PAGE_CACHE_SHIFT;
-        offset = *ppos & ~PAGE_CACHE_MASK;
-        nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+        loff = *ppos & ~PAGE_CACHE_MASK;
+        nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
         if (nr_pages > PIPE_BUFFERS)
                 nr_pages = PIPE_BUFFERS;
@@ -261,14 +299,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
          * read-ahead if this is a non-zero offset (we are likely doing small
          * chunk splice and the page is already there) for a single page.
          */
-        if (!offset || nr_pages > 1)
-                do_page_cache_readahead(mapping, in, index, nr_pages);
+        if (!loff || spd.nr_pages > 1)
+                do_page_cache_readahead(mapping, in, index, spd.nr_pages);
 
         /*
          * Now fill in the holes:
          */
         error = 0;
-        for (i = 0; i < nr_pages; i++, index++) {
+        total_len = 0;
+        for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) {
+                unsigned int this_len;
+
+                if (!len)
+                        break;
+
+                /*
+                 * this_len is the max we'll use from this page
+                 */
+                this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
 find_page:
                 /*
                  * lookup the page for this index
@@ -276,14 +324,6 @@ find_page:
                 page = find_get_page(mapping, index);
                 if (!page) {
                         /*
-                         * If in nonblock mode then dont block on
-                         * readpage (we've kicked readahead so there
-                         * will be asynchronous progress):
-                         */
-                        if (flags & SPLICE_F_NONBLOCK)
-                                break;
-
-                        /*
                          * page didn't exist, allocate one
                          */
                         page = page_cache_alloc_cold(mapping);
@@ -304,6 +344,13 @@ find_page:
                  * If the page isn't uptodate, we may need to start io on it
                  */
                 if (!PageUptodate(page)) {
+                        /*
+                         * If in nonblock mode then don't block on waiting
+                         * for an in-flight io page
+                         */
+                        if (flags & SPLICE_F_NONBLOCK)
+                                break;
+
                         lock_page(page);
 
                         /*
@@ -336,13 +383,46 @@ readpage:
                                 goto find_page;
                         break;
                 }
+
+                /*
+                 * i_size must be checked after ->readpage().
+                 */
+                isize = i_size_read(mapping->host);
+                end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+                if (unlikely(!isize || index > end_index)) {
+                        page_cache_release(page);
+                        break;
+                }
+
+                /*
+                 * if this is the last page, see if we need to shrink
+                 * the length and stop
+                 */
+                if (end_index == index) {
+                        loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
+                        if (total_len + loff > isize) {
+                                page_cache_release(page);
+                                break;
+                        }
+                        /*
+                         * force quit after adding this page
+                         */
+                        nr_pages = spd.nr_pages;
+                        this_len = min(this_len, loff);
+                        loff = 0;
+                }
                 }
 fill_it:
-                pages[i] = page;
+                pages[spd.nr_pages] = page;
+                partial[spd.nr_pages].offset = loff;
+                partial[spd.nr_pages].len = this_len;
+                len -= this_len;
+                total_len += this_len;
+                loff = 0;
         }
 
-        if (i)
-                return move_to_pipe(pipe, pages, i, offset, len, flags);
+        if (spd.nr_pages)
+                return splice_to_pipe(pipe, &spd);
 
         return error;
 }
@@ -369,17 +449,20 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
         while (len) {
                 ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
 
-                if (ret <= 0)
+                if (ret < 0)
                         break;
+                else if (!ret) {
+                        if (spliced)
+                                break;
+                        if (flags & SPLICE_F_NONBLOCK) {
+                                ret = -EAGAIN;
+                                break;
+                        }
+                }
 
                 *ppos += ret;
                 len -= ret;
                 spliced += ret;
-
-                if (!(flags & SPLICE_F_NONBLOCK))
-                        continue;
-                ret = -EAGAIN;
-                break;
         }
 
         if (spliced)
@@ -392,14 +475,13 @@ EXPORT_SYMBOL(generic_file_splice_read);
 
 /*
  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
- * using sendpage().
+ * using sendpage(). Return the number of bytes sent.
  */
 static int pipe_to_sendpage(struct pipe_inode_info *info,
                             struct pipe_buffer *buf, struct splice_desc *sd)
 {
         struct file *file = sd->file;
         loff_t pos = sd->pos;
-        unsigned int offset;
         ssize_t ret;
         void *ptr;
         int more;
@@ -414,16 +496,13 @@ static int pipe_to_sendpage(struct pipe_inode_info *info,
         if (IS_ERR(ptr))
                 return PTR_ERR(ptr);
 
-        offset = pos & ~PAGE_CACHE_MASK;
         more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
 
-        ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more);
+        ret = file->f_op->sendpage(file, buf->page, buf->offset, sd->len,
+                                   &pos, more);
 
         buf->ops->unmap(info, buf);
-        if (ret == sd->len)
-                return 0;
-
-        return -EIO;
+        return ret;
 }
 
 /*
@@ -452,7 +531,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
         struct file *file = sd->file;
         struct address_space *mapping = file->f_mapping;
         gfp_t gfp_mask = mapping_gfp_mask(mapping);
-        unsigned int offset;
+        unsigned int offset, this_len;
         struct page *page;
         pgoff_t index;
         char *src;
@@ -468,20 +547,22 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
         index = sd->pos >> PAGE_CACHE_SHIFT;
         offset = sd->pos & ~PAGE_CACHE_MASK;
 
+        this_len = sd->len;
+        if (this_len + offset > PAGE_CACHE_SIZE)
+                this_len = PAGE_CACHE_SIZE - offset;
+
         /*
          * Reuse buf page, if SPLICE_F_MOVE is set.
          */
         if (sd->flags & SPLICE_F_MOVE) {
                 /*
                  * If steal succeeds, buf->page is now pruned from the vm
-                 * side (LRU and page cache) and we can reuse it.
+                 * side (LRU and page cache) and we can reuse it. The page
+                 * will also be locked on successful return.
                  */
                 if (buf->ops->steal(info, buf))
                         goto find_page;
 
-                /*
-                 * this will also set the page locked
-                 */
                 page = buf->page;
                 if (add_to_page_cache(page, mapping, index, gfp_mask))
                         goto find_page;
@@ -490,18 +571,30 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
                 lru_cache_add(page);
         } else {
 find_page:
-                ret = -ENOMEM;
-                page = find_or_create_page(mapping, index, gfp_mask);
-                if (!page)
-                        goto out_nomem;
+                page = find_lock_page(mapping, index);
+                if (!page) {
+                        ret = -ENOMEM;
+                        page = page_cache_alloc_cold(mapping);
+                        if (unlikely(!page))
+                                goto out_nomem;
+
+                        /*
+                         * This will also lock the page
+                         */
+                        ret = add_to_page_cache_lru(page, mapping, index,
+                                                    gfp_mask);
+                        if (unlikely(ret))
+                                goto out;
+                }
 
                 /*
-                 * If the page is uptodate, it is also locked. If it isn't
-                 * uptodate, we can mark it uptodate if we are filling the
-                 * full page. Otherwise we need to read it in first...
+                 * We get here with the page locked. If the page is also
+                 * uptodate, we don't need to do more. If it isn't, we
+                 * may need to bring it in if we are not going to overwrite
+                 * the full page.
                  */
                 if (!PageUptodate(page)) {
-                        if (sd->len < PAGE_CACHE_SIZE) {
+                        if (this_len < PAGE_CACHE_SIZE) {
                                 ret = mapping->a_ops->readpage(file, page);
                                 if (unlikely(ret))
                                         goto out;
@@ -520,14 +613,12 @@ find_page:
                                         ret = -EIO;
                                         goto out;
                                 }
-                        } else {
-                                WARN_ON(!PageLocked(page));
+                        } else
                                 SetPageUptodate(page);
-                        }
                 }
         }
 
-        ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
+        ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
         if (ret == AOP_TRUNCATED_PAGE) {
                 page_cache_release(page);
                 goto find_page;
@@ -537,41 +628,42 @@ find_page:
         if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) {
                 char *dst = kmap_atomic(page, KM_USER0);
 
-                memcpy(dst + offset, src + buf->offset, sd->len);
+                memcpy(dst + offset, src + buf->offset, this_len);
                 flush_dcache_page(page);
                 kunmap_atomic(dst, KM_USER0);
         }
 
-        ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
+        ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
         if (ret == AOP_TRUNCATED_PAGE) {
                 page_cache_release(page);
                 goto find_page;
         } else if (ret)
                 goto out;
 
+        /*
+         * Return the number of bytes written.
+         */
+        ret = this_len;
         mark_page_accessed(page);
         balance_dirty_pages_ratelimited(mapping);
 out:
-        if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) {
+        if (!(buf->flags & PIPE_BUF_FLAG_STOLEN))
                 page_cache_release(page);
-                unlock_page(page);
-        }
+
+        unlock_page(page);
 out_nomem:
         buf->ops->unmap(info, buf);
         return ret;
 }
 
-typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
-                           struct splice_desc *);
-
 /*
  * Pipe input worker. Most of this logic works like a regular pipe, the
  * key here is the 'actor' worker passed in that actually moves the data
  * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
  */
-static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
+ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
                          loff_t *ppos, size_t len, unsigned int flags,
                          splice_actor *actor)
 {
         int ret, do_wakeup, err;
         struct splice_desc sd;
@@ -597,16 +689,22 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
                         sd.len = sd.total_len;
 
                         err = actor(pipe, buf, &sd);
-                        if (err) {
+                        if (err <= 0) {
                                 if (!ret && err != -ENODATA)
                                         ret = err;
 
                                 break;
                         }
 
-                        ret += sd.len;
-                        buf->offset += sd.len;
-                        buf->len -= sd.len;
+                        ret += err;
+                        buf->offset += err;
+                        buf->len -= err;
+
+                        sd.len -= err;
+                        sd.pos += err;
+                        sd.total_len -= err;
+                        if (sd.len)
+                                continue;
 
                         if (!buf->len) {
                                 buf->ops = NULL;
@@ -617,8 +715,6 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
                                 do_wakeup = 1;
                         }
 
-                        sd.pos += sd.len;
-                        sd.total_len -= sd.len;
                         if (!sd.total_len)
                                 break;
                 }
@@ -686,23 +782,27 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
         struct address_space *mapping = out->f_mapping;
         ssize_t ret;
 
-        ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
-
-        /*
-         * If file or inode is SYNC and we actually wrote some data, sync it.
-         */
-        if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host))
-            && ret > 0) {
+        ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
+        if (ret > 0) {
                 struct inode *inode = mapping->host;
-                int err;
 
-                mutex_lock(&inode->i_mutex);
-                err = generic_osync_inode(mapping->host, mapping,
-                                          OSYNC_METADATA|OSYNC_DATA);
-                mutex_unlock(&inode->i_mutex);
+                *ppos += ret;
+
+                /*
+                 * If file or inode is SYNC and we actually wrote some data,
+                 * sync it.
+                 */
+                if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
+                        int err;
+
+                        mutex_lock(&inode->i_mutex);
+                        err = generic_osync_inode(inode, mapping,
+                                                  OSYNC_METADATA|OSYNC_DATA);
+                        mutex_unlock(&inode->i_mutex);
 
                         if (err)
                                 ret = err;
+                }
         }
 
         return ret;
@@ -724,7 +824,7 @@ EXPORT_SYMBOL(generic_file_splice_write);
 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
                                 loff_t *ppos, size_t len, unsigned int flags)
 {
-        return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
+        return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
 }
 
 EXPORT_SYMBOL(generic_splice_sendpage);
@@ -811,7 +911,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 
         /*
          * We don't have an immediate reader, but we'll read the stuff
-         * out of the pipe right after the move_to_pipe(). So set
+         * out of the pipe right after the splice_to_pipe(). So set
          * PIPE_READERS appropriately.
          */
         pipe->readers = 1;
@@ -904,6 +1004,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 {
         struct pipe_inode_info *pipe;
         loff_t offset, *off;
+        long ret;
 
         pipe = in->f_dentry->d_inode->i_pipe;
         if (pipe) {
@@ -918,7 +1019,12 @@ static long do_splice(struct file *in, loff_t __user *off_in,
                 } else
                         off = &out->f_pos;
 
-                return do_splice_from(pipe, out, off, len, flags);
+                ret = do_splice_from(pipe, out, off, len, flags);
+
+                if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
+                        ret = -EFAULT;
+
+                return ret;
         }
 
         pipe = out->f_dentry->d_inode->i_pipe;
@@ -934,12 +1040,185 @@ static long do_splice(struct file *in, loff_t __user *off_in,
                 } else
                         off = &in->f_pos;
 
-                return do_splice_to(in, off, pipe, len, flags);
+                ret = do_splice_to(in, off, pipe, len, flags);
+
+                if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
+                        ret = -EFAULT;
+
+                return ret;
         }
 
         return -EINVAL;
 }
 
+/*
+ * Map an iov into an array of pages and offset/length tuples. With the
+ * partial_page structure, we can map several non-contiguous ranges into
+ * our one pages[] map instead of splitting that operation into pieces.
+ * Could easily be exported as a generic helper for other users, in which
+ * case one would probably want to add a 'max_nr_pages' parameter as well.
+ */
+static int get_iovec_page_array(const struct iovec __user *iov,
+                                unsigned int nr_vecs, struct page **pages,
+                                struct partial_page *partial)
+{
+        int buffers = 0, error = 0;
+
+        /*
+         * It's ok to take the mmap_sem for reading, even
+         * across a "get_user()".
+         */
+        down_read(&current->mm->mmap_sem);
+
+        while (nr_vecs) {
+                unsigned long off, npages;
+                void __user *base;
+                size_t len;
+                int i;
+
+                /*
+                 * Get user address base and length for this iovec.
+                 */
+                error = get_user(base, &iov->iov_base);
+                if (unlikely(error))
+                        break;
+                error = get_user(len, &iov->iov_len);
+                if (unlikely(error))
+                        break;
+
+                /*
+                 * Sanity check this iovec. 0 read succeeds.
+                 */
+                if (unlikely(!len))
+                        break;
+                error = -EFAULT;
+                if (unlikely(!base))
+                        break;
+
+                /*
+                 * Get this base offset and number of pages, then map
+                 * in the user pages.
+                 */
+                off = (unsigned long) base & ~PAGE_MASK;
+                npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+                if (npages > PIPE_BUFFERS - buffers)
+                        npages = PIPE_BUFFERS - buffers;
+
+                error = get_user_pages(current, current->mm,
+                                       (unsigned long) base, npages, 0, 0,
+                                       &pages[buffers], NULL);
+
+                if (unlikely(error <= 0))
+                        break;
+
+                /*
+                 * Fill this contiguous range into the partial page map.
+                 */
+                for (i = 0; i < error; i++) {
+                        const int plen = min_t(size_t, len, PAGE_SIZE) - off;
+
+                        partial[buffers].offset = off;
+                        partial[buffers].len = plen;
+
+                        off = 0;
+                        len -= plen;
+                        buffers++;
+                }
+
+                /*
+                 * We didn't complete this iov, stop here since it probably
+                 * means we have to move some of this into a pipe to
+                 * be able to continue.
+                 */
+                if (len)
+                        break;
+
+                /*
+                 * Don't continue if we mapped fewer pages than we asked for,
+                 * or if we mapped the max number of pages that we have
+                 * room for.
+                 */
+                if (error < npages || buffers == PIPE_BUFFERS)
+                        break;
+
+                nr_vecs--;
+                iov++;
+        }
+
+        up_read(&current->mm->mmap_sem);
+
+        if (buffers)
+                return buffers;
+
+        return error;
+}
+
+/*
+ * vmsplice splices a user address range into a pipe. It can be thought of
+ * as splice-from-memory, where the regular splice is splice-from-file (or
+ * to file). In both cases the output is a pipe, naturally.
+ *
+ * Note that vmsplice only supports splicing _from_ user memory to a pipe,
+ * not the other way around. Splicing from user memory is a simple operation
+ * that can be supported without any funky alignment restrictions or nasty
+ * vm tricks. We simply map in the user pages and fill them into a pipe.
+ * The reverse isn't quite as easy, though. There are two possible solutions
+ * for that:
+ *
+ * - memcpy() the data internally, at which point we might as well just
+ *   do a regular read() on the buffer anyway.
+ * - Lots of nasty vm tricks, that are neither fast nor flexible (it
+ *   has restriction limitations on both ends of the pipe).
+ *
+ * Alas, it isn't here.
+ *
+ */
+static long do_vmsplice(struct file *file, const struct iovec __user *iov,
+                        unsigned long nr_segs, unsigned int flags)
+{
+        struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
+        struct page *pages[PIPE_BUFFERS];
+        struct partial_page partial[PIPE_BUFFERS];
+        struct splice_pipe_desc spd = {
+                .pages = pages,
+                .partial = partial,
+                .flags = flags,
+                .ops = &user_page_pipe_buf_ops,
+        };
+
+        if (unlikely(!pipe))
+                return -EBADF;
+        if (unlikely(nr_segs > UIO_MAXIOV))
+                return -EINVAL;
+        else if (unlikely(!nr_segs))
+                return 0;
+
+        spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial);
+        if (spd.nr_pages <= 0)
+                return spd.nr_pages;
+
+        return splice_to_pipe(pipe, &spd);
+}
+
+asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
+                             unsigned long nr_segs, unsigned int flags)
+{
+        struct file *file;
+        long error;
+        int fput;
+
+        error = -EBADF;
+        file = fget_light(fd, &fput);
+        if (file) {
+                if (file->f_mode & FMODE_WRITE)
+                        error = do_vmsplice(file, iov, nr_segs, flags);
+
+                fput_light(file, fput);
+        }
+
+        return error;
+}
+
 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
                            int fd_out, loff_t __user *off_out,
                            size_t len, unsigned int flags)
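
For reference, a minimal userspace sketch of driving the new syscall added above. It assumes a libc that exposes a vmsplice(2) wrapper via <fcntl.h> with _GNU_SOURCE (on systems of this vintage one would invoke syscall() with the raw syscall number instead); the buffers and lengths are illustrative:

/* vmsplice_demo.c - gather two user buffers into a pipe without copying */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
        char a[] = "hello, ", b[] = "vmsplice\n";
        struct iovec iov[2] = {
                { .iov_base = a, .iov_len = strlen(a) },
                { .iov_base = b, .iov_len = strlen(b) },
        };
        int pfd[2];
        char out[64];
        ssize_t n;

        if (pipe(pfd))
                return 1;

        /* both iovecs are mapped in one call, via get_iovec_page_array() */
        if (vmsplice(pfd[1], iov, 2, 0) < 0) {
                perror("vmsplice");
                return 1;
        }

        n = read(pfd[0], out, sizeof(out));
        if (n > 0)
                write(STDOUT_FILENO, out, (size_t) n);

        return 0;
}
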
@@ -979,7 +1258,9 @@ static int link_pipe(struct pipe_inode_info *ipipe,
                      size_t len, unsigned int flags)
 {
         struct pipe_buffer *ibuf, *obuf;
-        int ret = 0, do_wakeup = 0, i;
+        int ret, do_wakeup, i, ipipe_first;
+
+        ret = do_wakeup = ipipe_first = 0;
 
         /*
          * Potential ABBA deadlock, work around it by ordering lock
@@ -987,6 +1268,7 @@ static int link_pipe(struct pipe_inode_info *ipipe,
          * could deadlock (one doing tee from A -> B, the other from B -> A).
          */
         if (ipipe->inode < opipe->inode) {
+                ipipe_first = 1;
                 mutex_lock(&ipipe->inode->i_mutex);
                 mutex_lock(&opipe->inode->i_mutex);
         } else {
@@ -1035,9 +1317,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 
                 /*
                  * We have input available, but no output room.
-                 * If we already copied data, return that.
+                 * If we already copied data, return that. If we
+                 * need to drop the opipe lock, it must be ordered
+                 * last to avoid deadlocks.
                  */
-                if (flags & SPLICE_F_NONBLOCK) {
+                if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) {
                         if (!ret)
                                 ret = -EAGAIN;
                         break;
@@ -1071,7 +1355,12 @@ static int link_pipe(struct pipe_inode_info *ipipe,
                         if (ret)
                                 break;
                 }
-                if (flags & SPLICE_F_NONBLOCK) {
+                /*
+                 * pipe_wait() drops the ipipe mutex. To avoid deadlocks
+                 * with another process, we can only safely do that if
+                 * the ipipe lock is ordered last.
+                 */
+                if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) {
                         if (!ret)
                                 ret = -EAGAIN;
                         break;
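
The do_splice() hunks above also make the syscall copy the updated offset back to userspace when off_in/off_out is supplied. A minimal sketch of that contract from the caller's side (hypothetical input file name; assumes a libc exposing splice(2)):

/* splice_off_demo.c - splice from a file at an explicit offset */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
        int fd = open("in.txt", O_RDONLY);      /* hypothetical input file */
        loff_t off = 0;                         /* in/out: updated by the kernel */
        int pfd[2];
        ssize_t n;

        if (fd < 0 || pipe(pfd))
                return 1;

        n = splice(fd, &off, pfd[1], NULL, 4096, SPLICE_F_MOVE);
        if (n < 0) {
                perror("splice");
                return 1;
        }

        /* off was copied back via copy_to_user(); fd's own f_pos is untouched */
        printf("spliced %zd bytes, offset is now %lld\n", n, (long long) off);
        return 0;
}
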