aboutsummaryrefslogtreecommitdiffstats
path: root/fs/splice.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/splice.c')
-rw-r--r--fs/splice.c434
1 files changed, 339 insertions, 95 deletions
diff --git a/fs/splice.c b/fs/splice.c
index 0559e7577a04..a46ddd28561e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -27,15 +27,22 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/syscalls.h> 29#include <linux/syscalls.h>
30#include <linux/uio.h>
31
/*
 * Describes the slice of a single page that is in use: 'offset' is the
 * byte offset where the data starts inside the page and 'len' is the
 * number of bytes of data from there on. One of these accompanies each
 * entry of a page map, so the map need not hold contiguous data.
 */
struct partial_page {
	unsigned int offset, len;
};
30 36
31/* 37/*
32 * Passed to the actors 38 * Passed to splice_to_pipe
33 */ 39 */
34struct splice_desc { 40struct splice_pipe_desc {
35 unsigned int len, total_len; /* current and remaining length */ 41 struct page **pages; /* page map */
42 struct partial_page *partial; /* pages[] may not be contig */
43 int nr_pages; /* number of pages in map */
36 unsigned int flags; /* splice flags */ 44 unsigned int flags; /* splice flags */
37 struct file *file; /* file to read/write */ 45 struct pipe_buf_operations *ops;/* ops associated with output pipe */
38 loff_t pos; /* file position */
39}; 46};
40 47
41/* 48/*
@@ -128,6 +135,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
128 kunmap(buf->page); 135 kunmap(buf->page);
129} 136}
130 137
138static void *user_page_pipe_buf_map(struct file *file,
139 struct pipe_inode_info *pipe,
140 struct pipe_buffer *buf)
141{
142 return kmap(buf->page);
143}
144
145static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
146 struct pipe_buffer *buf)
147{
148 kunmap(buf->page);
149}
150
131static void page_cache_pipe_buf_get(struct pipe_inode_info *info, 151static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
132 struct pipe_buffer *buf) 152 struct pipe_buffer *buf)
133{ 153{
@@ -143,19 +163,33 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = {
143 .get = page_cache_pipe_buf_get, 163 .get = page_cache_pipe_buf_get,
144}; 164};
145 165
/*
 * ->steal() hook for pipe buffers that carry user pages. Unconditionally
 * returns 1 without touching 'pipe' or 'buf'.
 *
 * NOTE(review): presumably a non-zero return tells the caller that the
 * page could not be stolen, so it falls back to copying - confirm against
 * the ->steal() callers.
 */
static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	const int steal_result = 1;

	return steal_result;
}
171
172static struct pipe_buf_operations user_page_pipe_buf_ops = {
173 .can_merge = 0,
174 .map = user_page_pipe_buf_map,
175 .unmap = user_page_pipe_buf_unmap,
176 .release = page_cache_pipe_buf_release,
177 .steal = user_page_pipe_buf_steal,
178 .get = page_cache_pipe_buf_get,
179};
180
146/* 181/*
147 * Pipe output worker. This sets up our pipe format with the page cache 182 * Pipe output worker. This sets up our pipe format with the page cache
148 * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). 183 * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
149 */ 184 */
150static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, 185static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
151 int nr_pages, unsigned long len, 186 struct splice_pipe_desc *spd)
152 unsigned int offset, unsigned int flags)
153{ 187{
154 int ret, do_wakeup, i; 188 int ret, do_wakeup, page_nr;
155 189
156 ret = 0; 190 ret = 0;
157 do_wakeup = 0; 191 do_wakeup = 0;
158 i = 0; 192 page_nr = 0;
159 193
160 if (pipe->inode) 194 if (pipe->inode)
161 mutex_lock(&pipe->inode->i_mutex); 195 mutex_lock(&pipe->inode->i_mutex);
@@ -171,27 +205,19 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
171 if (pipe->nrbufs < PIPE_BUFFERS) { 205 if (pipe->nrbufs < PIPE_BUFFERS) {
172 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); 206 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
173 struct pipe_buffer *buf = pipe->bufs + newbuf; 207 struct pipe_buffer *buf = pipe->bufs + newbuf;
174 struct page *page = pages[i++];
175 unsigned long this_len;
176
177 this_len = PAGE_CACHE_SIZE - offset;
178 if (this_len > len)
179 this_len = len;
180 208
181 buf->page = page; 209 buf->page = spd->pages[page_nr];
182 buf->offset = offset; 210 buf->offset = spd->partial[page_nr].offset;
183 buf->len = this_len; 211 buf->len = spd->partial[page_nr].len;
184 buf->ops = &page_cache_pipe_buf_ops; 212 buf->ops = spd->ops;
185 pipe->nrbufs++; 213 pipe->nrbufs++;
214 page_nr++;
215 ret += buf->len;
216
186 if (pipe->inode) 217 if (pipe->inode)
187 do_wakeup = 1; 218 do_wakeup = 1;
188 219
189 ret += this_len; 220 if (!--spd->nr_pages)
190 len -= this_len;
191 offset = 0;
192 if (!--nr_pages)
193 break;
194 if (!len)
195 break; 221 break;
196 if (pipe->nrbufs < PIPE_BUFFERS) 222 if (pipe->nrbufs < PIPE_BUFFERS)
197 continue; 223 continue;
@@ -199,7 +225,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
199 break; 225 break;
200 } 226 }
201 227
202 if (flags & SPLICE_F_NONBLOCK) { 228 if (spd->flags & SPLICE_F_NONBLOCK) {
203 if (!ret) 229 if (!ret)
204 ret = -EAGAIN; 230 ret = -EAGAIN;
205 break; 231 break;
@@ -234,8 +260,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
234 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 260 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
235 } 261 }
236 262
237 while (i < nr_pages) 263 while (page_nr < spd->nr_pages)
238 page_cache_release(pages[i++]); 264 page_cache_release(spd->pages[page_nr++]);
239 265
240 return ret; 266 return ret;
241} 267}
@@ -246,17 +272,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
246 unsigned int flags) 272 unsigned int flags)
247{ 273{
248 struct address_space *mapping = in->f_mapping; 274 struct address_space *mapping = in->f_mapping;
249 unsigned int loff, offset, nr_pages; 275 unsigned int loff, nr_pages;
250 struct page *pages[PIPE_BUFFERS]; 276 struct page *pages[PIPE_BUFFERS];
277 struct partial_page partial[PIPE_BUFFERS];
251 struct page *page; 278 struct page *page;
252 pgoff_t index, end_index; 279 pgoff_t index, end_index;
253 loff_t isize; 280 loff_t isize;
254 size_t bytes; 281 size_t total_len;
255 int i, error; 282 int error, page_nr;
283 struct splice_pipe_desc spd = {
284 .pages = pages,
285 .partial = partial,
286 .flags = flags,
287 .ops = &page_cache_pipe_buf_ops,
288 };
256 289
257 index = *ppos >> PAGE_CACHE_SHIFT; 290 index = *ppos >> PAGE_CACHE_SHIFT;
258 loff = offset = *ppos & ~PAGE_CACHE_MASK; 291 loff = *ppos & ~PAGE_CACHE_MASK;
259 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 292 nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
260 293
261 if (nr_pages > PIPE_BUFFERS) 294 if (nr_pages > PIPE_BUFFERS)
262 nr_pages = PIPE_BUFFERS; 295 nr_pages = PIPE_BUFFERS;
@@ -266,47 +299,75 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
266 * read-ahead if this is a non-zero offset (we are likely doing small 299 * read-ahead if this is a non-zero offset (we are likely doing small
267 * chunk splice and the page is already there) for a single page. 300 * chunk splice and the page is already there) for a single page.
268 */ 301 */
269 if (!offset || nr_pages > 1) 302 if (!loff || nr_pages > 1)
270 do_page_cache_readahead(mapping, in, index, nr_pages); 303 page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);
271 304
272 /* 305 /*
273 * Now fill in the holes: 306 * Now fill in the holes:
274 */ 307 */
275 error = 0; 308 error = 0;
276 bytes = 0; 309 total_len = 0;
277 for (i = 0; i < nr_pages; i++, index++) {
278 unsigned int this_len;
279 310
280 if (!len) 311 /*
281 break; 312 * Lookup the (hopefully) full range of pages we need.
313 */
314 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
282 315
316 /*
317 * If find_get_pages_contig() returned fewer pages than we needed,
318 * allocate the rest.
319 */
320 index += spd.nr_pages;
321 while (spd.nr_pages < nr_pages) {
283 /* 322 /*
284 * this_len is the max we'll use from this page 323 * Page could be there, find_get_pages_contig() breaks on
285 */ 324 * the first hole.
286 this_len = min(len, PAGE_CACHE_SIZE - loff);
287find_page:
288 /*
289 * lookup the page for this index
290 */ 325 */
291 page = find_get_page(mapping, index); 326 page = find_get_page(mapping, index);
292 if (!page) { 327 if (!page) {
293 /* 328 /*
294 * page didn't exist, allocate one 329 * page didn't exist, allocate one.
295 */ 330 */
296 page = page_cache_alloc_cold(mapping); 331 page = page_cache_alloc_cold(mapping);
297 if (!page) 332 if (!page)
298 break; 333 break;
299 334
300 error = add_to_page_cache_lru(page, mapping, index, 335 error = add_to_page_cache_lru(page, mapping, index,
301 mapping_gfp_mask(mapping)); 336 mapping_gfp_mask(mapping));
302 if (unlikely(error)) { 337 if (unlikely(error)) {
303 page_cache_release(page); 338 page_cache_release(page);
304 break; 339 break;
305 } 340 }
306 341 /*
307 goto readpage; 342 * add_to_page_cache() locks the page, unlock it
343 * to avoid convoluting the logic below even more.
344 */
345 unlock_page(page);
308 } 346 }
309 347
348 pages[spd.nr_pages++] = page;
349 index++;
350 }
351
352 /*
353 * Now loop over the map and see if we need to start IO on any
354 * pages, fill in the partial map, etc.
355 */
356 index = *ppos >> PAGE_CACHE_SHIFT;
357 nr_pages = spd.nr_pages;
358 spd.nr_pages = 0;
359 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
360 unsigned int this_len;
361
362 if (!len)
363 break;
364
365 /*
366 * this_len is the max we'll use from this page
367 */
368 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
369 page = pages[page_nr];
370
310 /* 371 /*
311 * If the page isn't uptodate, we may need to start io on it 372 * If the page isn't uptodate, we may need to start io on it
312 */ 373 */
@@ -327,7 +388,6 @@ find_page:
327 */ 388 */
328 if (!page->mapping) { 389 if (!page->mapping) {
329 unlock_page(page); 390 unlock_page(page);
330 page_cache_release(page);
331 break; 391 break;
332 } 392 }
333 /* 393 /*
@@ -338,16 +398,20 @@ find_page:
338 goto fill_it; 398 goto fill_it;
339 } 399 }
340 400
341readpage:
342 /* 401 /*
343 * need to read in the page 402 * need to read in the page
344 */ 403 */
345 error = mapping->a_ops->readpage(in, page); 404 error = mapping->a_ops->readpage(in, page);
346
347 if (unlikely(error)) { 405 if (unlikely(error)) {
348 page_cache_release(page); 406 /*
407 * We really should re-lookup the page here,
408 * but it complicates things a lot. Instead
409 * lets just do what we already stored, and
410 * we'll get it the next time we are called.
411 */
349 if (error == AOP_TRUNCATED_PAGE) 412 if (error == AOP_TRUNCATED_PAGE)
350 goto find_page; 413 error = 0;
414
351 break; 415 break;
352 } 416 }
353 417
@@ -356,10 +420,8 @@ readpage:
356 */ 420 */
357 isize = i_size_read(mapping->host); 421 isize = i_size_read(mapping->host);
358 end_index = (isize - 1) >> PAGE_CACHE_SHIFT; 422 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
359 if (unlikely(!isize || index > end_index)) { 423 if (unlikely(!isize || index > end_index))
360 page_cache_release(page);
361 break; 424 break;
362 }
363 425
364 /* 426 /*
365 * if this is the last page, see if we need to shrink 427 * if this is the last page, see if we need to shrink
@@ -367,26 +429,35 @@ readpage:
367 */ 429 */
368 if (end_index == index) { 430 if (end_index == index) {
369 loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); 431 loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
370 if (bytes + loff > isize) { 432 if (total_len + loff > isize)
371 page_cache_release(page);
372 break; 433 break;
373 }
374 /* 434 /*
375 * force quit after adding this page 435 * force quit after adding this page
376 */ 436 */
377 nr_pages = i; 437 len = this_len;
378 this_len = min(this_len, loff); 438 this_len = min(this_len, loff);
439 loff = 0;
379 } 440 }
380 } 441 }
381fill_it: 442fill_it:
382 pages[i] = page; 443 partial[page_nr].offset = loff;
383 bytes += this_len; 444 partial[page_nr].len = this_len;
384 len -= this_len; 445 len -= this_len;
446 total_len += this_len;
385 loff = 0; 447 loff = 0;
448 spd.nr_pages++;
449 index++;
386 } 450 }
387 451
388 if (i) 452 /*
389 return move_to_pipe(pipe, pages, i, bytes, offset, flags); 453 * Release any pages at the end, if we quit early. 'page_nr' is how
454 * far we got, 'nr_pages' is how many pages are in the map.
455 */
456 while (page_nr < nr_pages)
457 page_cache_release(pages[page_nr++]);
458
459 if (spd.nr_pages)
460 return splice_to_pipe(pipe, &spd);
390 461
391 return error; 462 return error;
392} 463}
@@ -439,14 +510,13 @@ EXPORT_SYMBOL(generic_file_splice_read);
439 510
440/* 511/*
441 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' 512 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
442 * using sendpage(). 513 * using sendpage(). Return the number of bytes sent.
443 */ 514 */
444static int pipe_to_sendpage(struct pipe_inode_info *info, 515static int pipe_to_sendpage(struct pipe_inode_info *info,
445 struct pipe_buffer *buf, struct splice_desc *sd) 516 struct pipe_buffer *buf, struct splice_desc *sd)
446{ 517{
447 struct file *file = sd->file; 518 struct file *file = sd->file;
448 loff_t pos = sd->pos; 519 loff_t pos = sd->pos;
449 unsigned int offset;
450 ssize_t ret; 520 ssize_t ret;
451 void *ptr; 521 void *ptr;
452 int more; 522 int more;
@@ -461,16 +531,13 @@ static int pipe_to_sendpage(struct pipe_inode_info *info,
461 if (IS_ERR(ptr)) 531 if (IS_ERR(ptr))
462 return PTR_ERR(ptr); 532 return PTR_ERR(ptr);
463 533
464 offset = pos & ~PAGE_CACHE_MASK;
465 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; 534 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
466 535
467 ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more); 536 ret = file->f_op->sendpage(file, buf->page, buf->offset, sd->len,
537 &pos, more);
468 538
469 buf->ops->unmap(info, buf); 539 buf->ops->unmap(info, buf);
470 if (ret == sd->len) 540 return ret;
471 return 0;
472
473 return -EIO;
474} 541}
475 542
476/* 543/*
@@ -499,7 +566,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
499 struct file *file = sd->file; 566 struct file *file = sd->file;
500 struct address_space *mapping = file->f_mapping; 567 struct address_space *mapping = file->f_mapping;
501 gfp_t gfp_mask = mapping_gfp_mask(mapping); 568 gfp_t gfp_mask = mapping_gfp_mask(mapping);
502 unsigned int offset; 569 unsigned int offset, this_len;
503 struct page *page; 570 struct page *page;
504 pgoff_t index; 571 pgoff_t index;
505 char *src; 572 char *src;
@@ -515,6 +582,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
515 index = sd->pos >> PAGE_CACHE_SHIFT; 582 index = sd->pos >> PAGE_CACHE_SHIFT;
516 offset = sd->pos & ~PAGE_CACHE_MASK; 583 offset = sd->pos & ~PAGE_CACHE_MASK;
517 584
585 this_len = sd->len;
586 if (this_len + offset > PAGE_CACHE_SIZE)
587 this_len = PAGE_CACHE_SIZE - offset;
588
518 /* 589 /*
519 * Reuse buf page, if SPLICE_F_MOVE is set. 590 * Reuse buf page, if SPLICE_F_MOVE is set.
520 */ 591 */
@@ -558,7 +629,7 @@ find_page:
558 * the full page. 629 * the full page.
559 */ 630 */
560 if (!PageUptodate(page)) { 631 if (!PageUptodate(page)) {
561 if (sd->len < PAGE_CACHE_SIZE) { 632 if (this_len < PAGE_CACHE_SIZE) {
562 ret = mapping->a_ops->readpage(file, page); 633 ret = mapping->a_ops->readpage(file, page);
563 if (unlikely(ret)) 634 if (unlikely(ret))
564 goto out; 635 goto out;
@@ -582,7 +653,7 @@ find_page:
582 } 653 }
583 } 654 }
584 655
585 ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); 656 ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
586 if (ret == AOP_TRUNCATED_PAGE) { 657 if (ret == AOP_TRUNCATED_PAGE) {
587 page_cache_release(page); 658 page_cache_release(page);
588 goto find_page; 659 goto find_page;
@@ -592,18 +663,22 @@ find_page:
592 if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { 663 if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) {
593 char *dst = kmap_atomic(page, KM_USER0); 664 char *dst = kmap_atomic(page, KM_USER0);
594 665
595 memcpy(dst + offset, src + buf->offset, sd->len); 666 memcpy(dst + offset, src + buf->offset, this_len);
596 flush_dcache_page(page); 667 flush_dcache_page(page);
597 kunmap_atomic(dst, KM_USER0); 668 kunmap_atomic(dst, KM_USER0);
598 } 669 }
599 670
600 ret = mapping->a_ops->commit_write(file, page, 0, sd->len); 671 ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
601 if (ret == AOP_TRUNCATED_PAGE) { 672 if (ret == AOP_TRUNCATED_PAGE) {
602 page_cache_release(page); 673 page_cache_release(page);
603 goto find_page; 674 goto find_page;
604 } else if (ret) 675 } else if (ret)
605 goto out; 676 goto out;
606 677
678 /*
679 * Return the number of bytes written.
680 */
681 ret = this_len;
607 mark_page_accessed(page); 682 mark_page_accessed(page);
608 balance_dirty_pages_ratelimited(mapping); 683 balance_dirty_pages_ratelimited(mapping);
609out: 684out:
@@ -616,17 +691,14 @@ out_nomem:
616 return ret; 691 return ret;
617} 692}
618 693
619typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
620 struct splice_desc *);
621
622/* 694/*
623 * Pipe input worker. Most of this logic works like a regular pipe, the 695 * Pipe input worker. Most of this logic works like a regular pipe, the
624 * key here is the 'actor' worker passed in that actually moves the data 696 * key here is the 'actor' worker passed in that actually moves the data
625 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. 697 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
626 */ 698 */
627static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, 699ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
628 loff_t *ppos, size_t len, unsigned int flags, 700 loff_t *ppos, size_t len, unsigned int flags,
629 splice_actor *actor) 701 splice_actor *actor)
630{ 702{
631 int ret, do_wakeup, err; 703 int ret, do_wakeup, err;
632 struct splice_desc sd; 704 struct splice_desc sd;
@@ -652,16 +724,22 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
652 sd.len = sd.total_len; 724 sd.len = sd.total_len;
653 725
654 err = actor(pipe, buf, &sd); 726 err = actor(pipe, buf, &sd);
655 if (err) { 727 if (err <= 0) {
656 if (!ret && err != -ENODATA) 728 if (!ret && err != -ENODATA)
657 ret = err; 729 ret = err;
658 730
659 break; 731 break;
660 } 732 }
661 733
662 ret += sd.len; 734 ret += err;
663 buf->offset += sd.len; 735 buf->offset += err;
664 buf->len -= sd.len; 736 buf->len -= err;
737
738 sd.len -= err;
739 sd.pos += err;
740 sd.total_len -= err;
741 if (sd.len)
742 continue;
665 743
666 if (!buf->len) { 744 if (!buf->len) {
667 buf->ops = NULL; 745 buf->ops = NULL;
@@ -672,8 +750,6 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
672 do_wakeup = 1; 750 do_wakeup = 1;
673 } 751 }
674 752
675 sd.pos += sd.len;
676 sd.total_len -= sd.len;
677 if (!sd.total_len) 753 if (!sd.total_len)
678 break; 754 break;
679 } 755 }
@@ -741,7 +817,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
741 struct address_space *mapping = out->f_mapping; 817 struct address_space *mapping = out->f_mapping;
742 ssize_t ret; 818 ssize_t ret;
743 819
744 ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); 820 ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
745 if (ret > 0) { 821 if (ret > 0) {
746 struct inode *inode = mapping->host; 822 struct inode *inode = mapping->host;
747 823
@@ -783,7 +859,7 @@ EXPORT_SYMBOL(generic_file_splice_write);
783ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, 859ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
784 loff_t *ppos, size_t len, unsigned int flags) 860 loff_t *ppos, size_t len, unsigned int flags)
785{ 861{
786 return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); 862 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
787} 863}
788 864
789EXPORT_SYMBOL(generic_splice_sendpage); 865EXPORT_SYMBOL(generic_splice_sendpage);
@@ -870,7 +946,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
870 946
871 /* 947 /*
872 * We don't have an immediate reader, but we'll read the stuff 948 * We don't have an immediate reader, but we'll read the stuff
873 * out of the pipe right after the move_to_pipe(). So set 949 * out of the pipe right after the splice_to_pipe(). So set
874 * PIPE_READERS appropriately. 950 * PIPE_READERS appropriately.
875 */ 951 */
876 pipe->readers = 1; 952 pipe->readers = 1;
@@ -1010,6 +1086,174 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1010 return -EINVAL; 1086 return -EINVAL;
1011} 1087}
1012 1088
1089/*
1090 * Map an iov into an array of pages and offset/length tupples. With the
1091 * partial_page structure, we can map several non-contiguous ranges into
1092 * our ones pages[] map instead of splitting that operation into pieces.
1093 * Could easily be exported as a generic helper for other users, in which
1094 * case one would probably want to add a 'max_nr_pages' parameter as well.
1095 */
1096static int get_iovec_page_array(const struct iovec __user *iov,
1097 unsigned int nr_vecs, struct page **pages,
1098 struct partial_page *partial)
1099{
1100 int buffers = 0, error = 0;
1101
1102 /*
1103 * It's ok to take the mmap_sem for reading, even
1104 * across a "get_user()".
1105 */
1106 down_read(&current->mm->mmap_sem);
1107
1108 while (nr_vecs) {
1109 unsigned long off, npages;
1110 void __user *base;
1111 size_t len;
1112 int i;
1113
1114 /*
1115 * Get user address base and length for this iovec.
1116 */
1117 error = get_user(base, &iov->iov_base);
1118 if (unlikely(error))
1119 break;
1120 error = get_user(len, &iov->iov_len);
1121 if (unlikely(error))
1122 break;
1123
1124 /*
1125 * Sanity check this iovec. 0 read succeeds.
1126 */
1127 if (unlikely(!len))
1128 break;
1129 error = -EFAULT;
1130 if (unlikely(!base))
1131 break;
1132
1133 /*
1134 * Get this base offset and number of pages, then map
1135 * in the user pages.
1136 */
1137 off = (unsigned long) base & ~PAGE_MASK;
1138 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1139 if (npages > PIPE_BUFFERS - buffers)
1140 npages = PIPE_BUFFERS - buffers;
1141
1142 error = get_user_pages(current, current->mm,
1143 (unsigned long) base, npages, 0, 0,
1144 &pages[buffers], NULL);
1145
1146 if (unlikely(error <= 0))
1147 break;
1148
1149 /*
1150 * Fill this contiguous range into the partial page map.
1151 */
1152 for (i = 0; i < error; i++) {
1153 const int plen = min_t(size_t, len, PAGE_SIZE) - off;
1154
1155 partial[buffers].offset = off;
1156 partial[buffers].len = plen;
1157
1158 off = 0;
1159 len -= plen;
1160 buffers++;
1161 }
1162
1163 /*
1164 * We didn't complete this iov, stop here since it probably
1165 * means we have to move some of this into a pipe to
1166 * be able to continue.
1167 */
1168 if (len)
1169 break;
1170
1171 /*
1172 * Don't continue if we mapped fewer pages than we asked for,
1173 * or if we mapped the max number of pages that we have
1174 * room for.
1175 */
1176 if (error < npages || buffers == PIPE_BUFFERS)
1177 break;
1178
1179 nr_vecs--;
1180 iov++;
1181 }
1182
1183 up_read(&current->mm->mmap_sem);
1184
1185 if (buffers)
1186 return buffers;
1187
1188 return error;
1189}
1190
1191/*
1192 * vmsplice splices a user address range into a pipe. It can be thought of
1193 * as splice-from-memory, where the regular splice is splice-from-file (or
1194 * to file). In both cases the output is a pipe, naturally.
1195 *
1196 * Note that vmsplice only supports splicing _from_ user memory to a pipe,
1197 * not the other way around. Splicing from user memory is a simple operation
1198 * that can be supported without any funky alignment restrictions or nasty
1199 * vm tricks. We simply map in the user memory and fill them into a pipe.
1200 * The reverse isn't quite as easy, though. There are two possible solutions
1201 * for that:
1202 *
1203 * - memcpy() the data internally, at which point we might as well just
1204 * do a regular read() on the buffer anyway.
1205 * - Lots of nasty vm tricks, that are neither fast nor flexible (it
1206 * has restriction limitations on both ends of the pipe).
1207 *
1208 * Alas, it isn't here.
1209 *
1210 */
1211static long do_vmsplice(struct file *file, const struct iovec __user *iov,
1212 unsigned long nr_segs, unsigned int flags)
1213{
1214 struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
1215 struct page *pages[PIPE_BUFFERS];
1216 struct partial_page partial[PIPE_BUFFERS];
1217 struct splice_pipe_desc spd = {
1218 .pages = pages,
1219 .partial = partial,
1220 .flags = flags,
1221 .ops = &user_page_pipe_buf_ops,
1222 };
1223
1224 if (unlikely(!pipe))
1225 return -EBADF;
1226 if (unlikely(nr_segs > UIO_MAXIOV))
1227 return -EINVAL;
1228 else if (unlikely(!nr_segs))
1229 return 0;
1230
1231 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial);
1232 if (spd.nr_pages <= 0)
1233 return spd.nr_pages;
1234
1235 return splice_to_pipe(pipe, &spd);
1236}
1237
1238asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
1239 unsigned long nr_segs, unsigned int flags)
1240{
1241 struct file *file;
1242 long error;
1243 int fput;
1244
1245 error = -EBADF;
1246 file = fget_light(fd, &fput);
1247 if (file) {
1248 if (file->f_mode & FMODE_WRITE)
1249 error = do_vmsplice(file, iov, nr_segs, flags);
1250
1251 fput_light(file, fput);
1252 }
1253
1254 return error;
1255}
1256
1013asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, 1257asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
1014 int fd_out, loff_t __user *off_out, 1258 int fd_out, loff_t __user *off_out,
1015 size_t len, unsigned int flags) 1259 size_t len, unsigned int flags)