Diffstat (limited to 'fs/splice.c')
-rw-r--r--	fs/splice.c	701
1 file changed, 518 insertions(+), 183 deletions(-)
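
The diff below is the raw side-by-side rendering from the repository browser; it introduces struct splice_pipe_desc, the pin/map pipe-buffer operations, and the sys_vmsplice entry point. As a reading aid only, and not part of the patch itself, here is a minimal userspace sketch of how the two syscalls involved are typically driven. It assumes a glibc that exposes the splice()/vmsplice() wrappers and the SPLICE_F_* flags via <fcntl.h> with _GNU_SOURCE; the output filename is arbitrary.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	static char buf[] = "hello from user memory\n";
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) - 1 };
	int pfd[2];
	int out_fd;

	if (pipe(pfd) < 0)
		return 1;

	/* arbitrary demo output file, not taken from the patch */
	out_fd = open("splice-demo.out", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (out_fd < 0)
		return 1;

	/* vmsplice(): "splice from memory" - map the user buffer into the pipe */
	ssize_t moved = vmsplice(pfd[1], &iov, 1, 0);
	if (moved < 0) {
		perror("vmsplice");
		return 1;
	}

	/* splice(): drain the pipe into the file without copying through userspace */
	while (moved > 0) {
		ssize_t n = splice(pfd[0], NULL, out_fd, NULL, moved, SPLICE_F_MOVE);
		if (n <= 0) {
			perror("splice");
			return 1;
		}
		moved -= n;
	}
	return 0;
}

On kernels of this vintage the destination of the final splice() has to provide a splice_write implementation (a regular file or a socket), which is why the sketch writes to a file rather than to a terminal.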
diff --git a/fs/splice.c b/fs/splice.c
index 8d57e89924a6..7fb04970c72d 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -27,15 +27,22 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/syscalls.h> 29#include <linux/syscalls.h>
30#include <linux/uio.h>
31
32struct partial_page {
33 unsigned int offset;
34 unsigned int len;
35};
30 36
31/* 37/*
32 * Passed to the actors 38 * Passed to splice_to_pipe
33 */ 39 */
34struct splice_desc { 40struct splice_pipe_desc {
35 unsigned int len, total_len; /* current and remaining length */ 41 struct page **pages; /* page map */
42 struct partial_page *partial; /* pages[] may not be contig */
43 int nr_pages; /* number of pages in map */
36 unsigned int flags; /* splice flags */ 44 unsigned int flags; /* splice flags */
37 struct file *file; /* file to read/write */ 45 struct pipe_buf_operations *ops;/* ops associated with output pipe */
38 loff_t pos; /* file position */
39}; 46};
40 47
41/* 48/*
@@ -50,7 +57,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
50 struct page *page = buf->page; 57 struct page *page = buf->page;
51 struct address_space *mapping = page_mapping(page); 58 struct address_space *mapping = page_mapping(page);
52 59
53 WARN_ON(!PageLocked(page)); 60 lock_page(page);
61
54 WARN_ON(!PageUptodate(page)); 62 WARN_ON(!PageUptodate(page));
55 63
56 /* 64 /*
@@ -65,10 +73,11 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
65 if (PagePrivate(page)) 73 if (PagePrivate(page))
66 try_to_release_page(page, mapping_gfp_mask(mapping)); 74 try_to_release_page(page, mapping_gfp_mask(mapping));
67 75
68 if (!remove_mapping(mapping, page)) 76 if (!remove_mapping(mapping, page)) {
77 unlock_page(page);
69 return 1; 78 return 1;
79 }
70 80
71 buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU;
72 return 0; 81 return 0;
73} 82}
74 83
@@ -76,13 +85,10 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
76 struct pipe_buffer *buf) 85 struct pipe_buffer *buf)
77{ 86{
78 page_cache_release(buf->page); 87 page_cache_release(buf->page);
79 buf->page = NULL;
80 buf->flags &= ~(PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU);
81} 88}
82 89
83static void *page_cache_pipe_buf_map(struct file *file, 90static int page_cache_pipe_buf_pin(struct pipe_inode_info *info,
84 struct pipe_inode_info *info, 91 struct pipe_buffer *buf)
85 struct pipe_buffer *buf)
86{ 92{
87 struct page *page = buf->page; 93 struct page *page = buf->page;
88 int err; 94 int err;
@@ -108,51 +114,58 @@ static void *page_cache_pipe_buf_map(struct file *file,
108 } 114 }
109 115
110 /* 116 /*
111 * Page is ok afterall, fall through to mapping. 117 * Page is ok afterall, we are done.
112 */ 118 */
113 unlock_page(page); 119 unlock_page(page);
114 } 120 }
115 121
116 return kmap(page); 122 return 0;
117error: 123error:
118 unlock_page(page); 124 unlock_page(page);
119 return ERR_PTR(err); 125 return err;
120} 126}
121 127
122static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, 128static struct pipe_buf_operations page_cache_pipe_buf_ops = {
123 struct pipe_buffer *buf) 129 .can_merge = 0,
124{ 130 .map = generic_pipe_buf_map,
125 kunmap(buf->page); 131 .unmap = generic_pipe_buf_unmap,
126} 132 .pin = page_cache_pipe_buf_pin,
133 .release = page_cache_pipe_buf_release,
134 .steal = page_cache_pipe_buf_steal,
135 .get = generic_pipe_buf_get,
136};
127 137
128static void page_cache_pipe_buf_get(struct pipe_inode_info *info, 138static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
129 struct pipe_buffer *buf) 139 struct pipe_buffer *buf)
130{ 140{
131 page_cache_get(buf->page); 141 if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
142 return 1;
143
144 return generic_pipe_buf_steal(pipe, buf);
132} 145}
133 146
134static struct pipe_buf_operations page_cache_pipe_buf_ops = { 147static struct pipe_buf_operations user_page_pipe_buf_ops = {
135 .can_merge = 0, 148 .can_merge = 0,
136 .map = page_cache_pipe_buf_map, 149 .map = generic_pipe_buf_map,
137 .unmap = page_cache_pipe_buf_unmap, 150 .unmap = generic_pipe_buf_unmap,
151 .pin = generic_pipe_buf_pin,
138 .release = page_cache_pipe_buf_release, 152 .release = page_cache_pipe_buf_release,
139 .steal = page_cache_pipe_buf_steal, 153 .steal = user_page_pipe_buf_steal,
140 .get = page_cache_pipe_buf_get, 154 .get = generic_pipe_buf_get,
141}; 155};
142 156
143/* 157/*
144 * Pipe output worker. This sets up our pipe format with the page cache 158 * Pipe output worker. This sets up our pipe format with the page cache
145 * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). 159 * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
146 */ 160 */
147static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, 161static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
148 int nr_pages, unsigned long offset, 162 struct splice_pipe_desc *spd)
149 unsigned long len, unsigned int flags)
150{ 163{
151 int ret, do_wakeup, i; 164 int ret, do_wakeup, page_nr;
152 165
153 ret = 0; 166 ret = 0;
154 do_wakeup = 0; 167 do_wakeup = 0;
155 i = 0; 168 page_nr = 0;
156 169
157 if (pipe->inode) 170 if (pipe->inode)
158 mutex_lock(&pipe->inode->i_mutex); 171 mutex_lock(&pipe->inode->i_mutex);
@@ -168,27 +181,22 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
168 if (pipe->nrbufs < PIPE_BUFFERS) { 181 if (pipe->nrbufs < PIPE_BUFFERS) {
169 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); 182 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
170 struct pipe_buffer *buf = pipe->bufs + newbuf; 183 struct pipe_buffer *buf = pipe->bufs + newbuf;
171 struct page *page = pages[i++];
172 unsigned long this_len;
173 184
174 this_len = PAGE_CACHE_SIZE - offset; 185 buf->page = spd->pages[page_nr];
175 if (this_len > len) 186 buf->offset = spd->partial[page_nr].offset;
176 this_len = len; 187 buf->len = spd->partial[page_nr].len;
188 buf->ops = spd->ops;
189 if (spd->flags & SPLICE_F_GIFT)
190 buf->flags |= PIPE_BUF_FLAG_GIFT;
177 191
178 buf->page = page;
179 buf->offset = offset;
180 buf->len = this_len;
181 buf->ops = &page_cache_pipe_buf_ops;
182 pipe->nrbufs++; 192 pipe->nrbufs++;
193 page_nr++;
194 ret += buf->len;
195
183 if (pipe->inode) 196 if (pipe->inode)
184 do_wakeup = 1; 197 do_wakeup = 1;
185 198
186 ret += this_len; 199 if (!--spd->nr_pages)
187 len -= this_len;
188 offset = 0;
189 if (!--nr_pages)
190 break;
191 if (!len)
192 break; 200 break;
193 if (pipe->nrbufs < PIPE_BUFFERS) 201 if (pipe->nrbufs < PIPE_BUFFERS)
194 continue; 202 continue;
@@ -196,7 +204,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
196 break; 204 break;
197 } 205 }
198 206
199 if (flags & SPLICE_F_NONBLOCK) { 207 if (spd->flags & SPLICE_F_NONBLOCK) {
200 if (!ret) 208 if (!ret)
201 ret = -EAGAIN; 209 ret = -EAGAIN;
202 break; 210 break;
@@ -231,8 +239,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
231 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 239 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
232 } 240 }
233 241
234 while (i < nr_pages) 242 while (page_nr < spd->nr_pages)
235 page_cache_release(pages[i++]); 243 page_cache_release(spd->pages[page_nr++]);
236 244
237 return ret; 245 return ret;
238} 246}
@@ -243,15 +251,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
243 unsigned int flags) 251 unsigned int flags)
244{ 252{
245 struct address_space *mapping = in->f_mapping; 253 struct address_space *mapping = in->f_mapping;
246 unsigned int offset, nr_pages; 254 unsigned int loff, nr_pages;
247 struct page *pages[PIPE_BUFFERS]; 255 struct page *pages[PIPE_BUFFERS];
256 struct partial_page partial[PIPE_BUFFERS];
248 struct page *page; 257 struct page *page;
249 pgoff_t index; 258 pgoff_t index, end_index;
250 int i, error; 259 loff_t isize;
260 size_t total_len;
261 int error, page_nr;
262 struct splice_pipe_desc spd = {
263 .pages = pages,
264 .partial = partial,
265 .flags = flags,
266 .ops = &page_cache_pipe_buf_ops,
267 };
251 268
252 index = *ppos >> PAGE_CACHE_SHIFT; 269 index = *ppos >> PAGE_CACHE_SHIFT;
253 offset = *ppos & ~PAGE_CACHE_MASK; 270 loff = *ppos & ~PAGE_CACHE_MASK;
254 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 271 nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
255 272
256 if (nr_pages > PIPE_BUFFERS) 273 if (nr_pages > PIPE_BUFFERS)
257 nr_pages = PIPE_BUFFERS; 274 nr_pages = PIPE_BUFFERS;
@@ -261,49 +278,92 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
261 * read-ahead if this is a non-zero offset (we are likely doing small 278 * read-ahead if this is a non-zero offset (we are likely doing small
262 * chunk splice and the page is already there) for a single page. 279 * chunk splice and the page is already there) for a single page.
263 */ 280 */
264 if (!offset || nr_pages > 1) 281 if (!loff || nr_pages > 1)
265 do_page_cache_readahead(mapping, in, index, nr_pages); 282 page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);
266 283
267 /* 284 /*
268 * Now fill in the holes: 285 * Now fill in the holes:
269 */ 286 */
270 error = 0; 287 error = 0;
271 for (i = 0; i < nr_pages; i++, index++) { 288 total_len = 0;
272find_page: 289
290 /*
291 * Lookup the (hopefully) full range of pages we need.
292 */
293 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
294
295 /*
296 * If find_get_pages_contig() returned fewer pages than we needed,
297 * allocate the rest.
298 */
299 index += spd.nr_pages;
300 while (spd.nr_pages < nr_pages) {
273 /* 301 /*
274 * lookup the page for this index 302 * Page could be there, find_get_pages_contig() breaks on
303 * the first hole.
275 */ 304 */
276 page = find_get_page(mapping, index); 305 page = find_get_page(mapping, index);
277 if (!page) { 306 if (!page) {
278 /* 307 /*
279 * If in nonblock mode then dont block on 308 * Make sure the read-ahead engine is notified
280 * readpage (we've kicked readahead so there 309 * about this failure.
281 * will be asynchronous progress):
282 */ 310 */
283 if (flags & SPLICE_F_NONBLOCK) 311 handle_ra_miss(mapping, &in->f_ra, index);
284 break;
285 312
286 /* 313 /*
287 * page didn't exist, allocate one 314 * page didn't exist, allocate one.
288 */ 315 */
289 page = page_cache_alloc_cold(mapping); 316 page = page_cache_alloc_cold(mapping);
290 if (!page) 317 if (!page)
291 break; 318 break;
292 319
293 error = add_to_page_cache_lru(page, mapping, index, 320 error = add_to_page_cache_lru(page, mapping, index,
294 mapping_gfp_mask(mapping)); 321 mapping_gfp_mask(mapping));
295 if (unlikely(error)) { 322 if (unlikely(error)) {
296 page_cache_release(page); 323 page_cache_release(page);
297 break; 324 break;
298 } 325 }
299 326 /*
300 goto readpage; 327 * add_to_page_cache() locks the page, unlock it
328 * to avoid convoluting the logic below even more.
329 */
330 unlock_page(page);
301 } 331 }
302 332
333 pages[spd.nr_pages++] = page;
334 index++;
335 }
336
337 /*
338 * Now loop over the map and see if we need to start IO on any
339 * pages, fill in the partial map, etc.
340 */
341 index = *ppos >> PAGE_CACHE_SHIFT;
342 nr_pages = spd.nr_pages;
343 spd.nr_pages = 0;
344 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
345 unsigned int this_len;
346
347 if (!len)
348 break;
349
350 /*
351 * this_len is the max we'll use from this page
352 */
353 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
354 page = pages[page_nr];
355
303 /* 356 /*
304 * If the page isn't uptodate, we may need to start io on it 357 * If the page isn't uptodate, we may need to start io on it
305 */ 358 */
306 if (!PageUptodate(page)) { 359 if (!PageUptodate(page)) {
360 /*
361 * If in nonblock mode then dont block on waiting
362 * for an in-flight io page
363 */
364 if (flags & SPLICE_F_NONBLOCK)
365 break;
366
307 lock_page(page); 367 lock_page(page);
308 368
309 /* 369 /*
@@ -313,7 +373,6 @@ find_page:
313 */ 373 */
314 if (!page->mapping) { 374 if (!page->mapping) {
315 unlock_page(page); 375 unlock_page(page);
316 page_cache_release(page);
317 break; 376 break;
318 } 377 }
319 /* 378 /*
@@ -324,25 +383,66 @@ find_page:
324 goto fill_it; 383 goto fill_it;
325 } 384 }
326 385
327readpage:
328 /* 386 /*
329 * need to read in the page 387 * need to read in the page
330 */ 388 */
331 error = mapping->a_ops->readpage(in, page); 389 error = mapping->a_ops->readpage(in, page);
332
333 if (unlikely(error)) { 390 if (unlikely(error)) {
334 page_cache_release(page); 391 /*
392 * We really should re-lookup the page here,
393 * but it complicates things a lot. Instead
394 * lets just do what we already stored, and
395 * we'll get it the next time we are called.
396 */
335 if (error == AOP_TRUNCATED_PAGE) 397 if (error == AOP_TRUNCATED_PAGE)
336 goto find_page; 398 error = 0;
399
337 break; 400 break;
338 } 401 }
402
403 /*
404 * i_size must be checked after ->readpage().
405 */
406 isize = i_size_read(mapping->host);
407 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
408 if (unlikely(!isize || index > end_index))
409 break;
410
411 /*
412 * if this is the last page, see if we need to shrink
413 * the length and stop
414 */
415 if (end_index == index) {
416 loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
417 if (total_len + loff > isize)
418 break;
419 /*
420 * force quit after adding this page
421 */
422 len = this_len;
423 this_len = min(this_len, loff);
424 loff = 0;
425 }
339 } 426 }
340fill_it: 427fill_it:
341 pages[i] = page; 428 partial[page_nr].offset = loff;
429 partial[page_nr].len = this_len;
430 len -= this_len;
431 total_len += this_len;
432 loff = 0;
433 spd.nr_pages++;
434 index++;
342 } 435 }
343 436
344 if (i) 437 /*
345 return move_to_pipe(pipe, pages, i, offset, len, flags); 438 * Release any pages at the end, if we quit early. 'i' is how far
439 * we got, 'nr_pages' is how many pages are in the map.
440 */
441 while (page_nr < nr_pages)
442 page_cache_release(pages[page_nr++]);
443
444 if (spd.nr_pages)
445 return splice_to_pipe(pipe, &spd);
346 446
347 return error; 447 return error;
348} 448}
@@ -369,17 +469,20 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
369 while (len) { 469 while (len) {
370 ret = __generic_file_splice_read(in, ppos, pipe, len, flags); 470 ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
371 471
372 if (ret <= 0) 472 if (ret < 0)
373 break; 473 break;
474 else if (!ret) {
475 if (spliced)
476 break;
477 if (flags & SPLICE_F_NONBLOCK) {
478 ret = -EAGAIN;
479 break;
480 }
481 }
374 482
375 *ppos += ret; 483 *ppos += ret;
376 len -= ret; 484 len -= ret;
377 spliced += ret; 485 spliced += ret;
378
379 if (!(flags & SPLICE_F_NONBLOCK))
380 continue;
381 ret = -EAGAIN;
382 break;
383 } 486 }
384 487
385 if (spliced) 488 if (spliced)
@@ -392,38 +495,24 @@ EXPORT_SYMBOL(generic_file_splice_read);
392 495
393/* 496/*
394 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' 497 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
395 * using sendpage(). 498 * using sendpage(). Return the number of bytes sent.
396 */ 499 */
397static int pipe_to_sendpage(struct pipe_inode_info *info, 500static int pipe_to_sendpage(struct pipe_inode_info *info,
398 struct pipe_buffer *buf, struct splice_desc *sd) 501 struct pipe_buffer *buf, struct splice_desc *sd)
399{ 502{
400 struct file *file = sd->file; 503 struct file *file = sd->file;
401 loff_t pos = sd->pos; 504 loff_t pos = sd->pos;
402 unsigned int offset; 505 int ret, more;
403 ssize_t ret;
404 void *ptr;
405 int more;
406
407 /*
408 * Sub-optimal, but we are limited by the pipe ->map. We don't
409 * need a kmap'ed buffer here, we just want to make sure we
410 * have the page pinned if the pipe page originates from the
411 * page cache.
412 */
413 ptr = buf->ops->map(file, info, buf);
414 if (IS_ERR(ptr))
415 return PTR_ERR(ptr);
416 506
417 offset = pos & ~PAGE_CACHE_MASK; 507 ret = buf->ops->pin(info, buf);
418 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; 508 if (!ret) {
509 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
419 510
420 ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more); 511 ret = file->f_op->sendpage(file, buf->page, buf->offset,
421 512 sd->len, &pos, more);
422 buf->ops->unmap(info, buf); 513 }
423 if (ret == sd->len)
424 return 0;
425 514
426 return -EIO; 515 return ret;
427} 516}
428 517
429/* 518/*
@@ -452,56 +541,88 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
452 struct file *file = sd->file; 541 struct file *file = sd->file;
453 struct address_space *mapping = file->f_mapping; 542 struct address_space *mapping = file->f_mapping;
454 gfp_t gfp_mask = mapping_gfp_mask(mapping); 543 gfp_t gfp_mask = mapping_gfp_mask(mapping);
455 unsigned int offset; 544 unsigned int offset, this_len;
456 struct page *page; 545 struct page *page;
457 pgoff_t index; 546 pgoff_t index;
458 char *src;
459 int ret; 547 int ret;
460 548
461 /* 549 /*
462 * make sure the data in this buffer is uptodate 550 * make sure the data in this buffer is uptodate
463 */ 551 */
464 src = buf->ops->map(file, info, buf); 552 ret = buf->ops->pin(info, buf);
465 if (IS_ERR(src)) 553 if (unlikely(ret))
466 return PTR_ERR(src); 554 return ret;
467 555
468 index = sd->pos >> PAGE_CACHE_SHIFT; 556 index = sd->pos >> PAGE_CACHE_SHIFT;
469 offset = sd->pos & ~PAGE_CACHE_MASK; 557 offset = sd->pos & ~PAGE_CACHE_MASK;
470 558
559 this_len = sd->len;
560 if (this_len + offset > PAGE_CACHE_SIZE)
561 this_len = PAGE_CACHE_SIZE - offset;
562
471 /* 563 /*
472 * Reuse buf page, if SPLICE_F_MOVE is set. 564 * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full
565 * page.
473 */ 566 */
474 if (sd->flags & SPLICE_F_MOVE) { 567 if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) {
475 /* 568 /*
476 * If steal succeeds, buf->page is now pruned from the vm 569 * If steal succeeds, buf->page is now pruned from the vm
477 * side (LRU and page cache) and we can reuse it. 570 * side (page cache) and we can reuse it. The page will also
571 * be locked on successful return.
478 */ 572 */
479 if (buf->ops->steal(info, buf)) 573 if (buf->ops->steal(info, buf))
480 goto find_page; 574 goto find_page;
481 575
576 page = buf->page;
577 page_cache_get(page);
578
482 /* 579 /*
483 * this will also set the page locked 580 * page must be on the LRU for adding to the pagecache.
581 * Check this without grabbing the zone lock, if it isn't
582 * the do grab the zone lock, recheck, and add if necessary.
484 */ 583 */
485 page = buf->page; 584 if (!PageLRU(page)) {
486 if (add_to_page_cache(page, mapping, index, gfp_mask)) 585 struct zone *zone = page_zone(page);
487 goto find_page; 586
587 spin_lock_irq(&zone->lru_lock);
588 if (!PageLRU(page)) {
589 SetPageLRU(page);
590 add_page_to_inactive_list(zone, page);
591 }
592 spin_unlock_irq(&zone->lru_lock);
593 }
488 594
489 if (!(buf->flags & PIPE_BUF_FLAG_LRU)) 595 if (add_to_page_cache(page, mapping, index, gfp_mask)) {
490 lru_cache_add(page); 596 page_cache_release(page);
597 unlock_page(page);
598 goto find_page;
599 }
491 } else { 600 } else {
492find_page: 601find_page:
493 ret = -ENOMEM; 602 page = find_lock_page(mapping, index);
494 page = find_or_create_page(mapping, index, gfp_mask); 603 if (!page) {
495 if (!page) 604 ret = -ENOMEM;
496 goto out_nomem; 605 page = page_cache_alloc_cold(mapping);
606 if (unlikely(!page))
607 goto out_nomem;
608
609 /*
610 * This will also lock the page
611 */
612 ret = add_to_page_cache_lru(page, mapping, index,
613 gfp_mask);
614 if (unlikely(ret))
615 goto out;
616 }
497 617
498 /* 618 /*
499 * If the page is uptodate, it is also locked. If it isn't 619 * We get here with the page locked. If the page is also
500 * uptodate, we can mark it uptodate if we are filling the 620 * uptodate, we don't need to do more. If it isn't, we
501 * full page. Otherwise we need to read it in first... 621 * may need to bring it in if we are not going to overwrite
622 * the full page.
502 */ 623 */
503 if (!PageUptodate(page)) { 624 if (!PageUptodate(page)) {
504 if (sd->len < PAGE_CACHE_SIZE) { 625 if (this_len < PAGE_CACHE_SIZE) {
505 ret = mapping->a_ops->readpage(file, page); 626 ret = mapping->a_ops->readpage(file, page);
506 if (unlikely(ret)) 627 if (unlikely(ret))
507 goto out; 628 goto out;
@@ -520,58 +641,59 @@ find_page:
520 ret = -EIO; 641 ret = -EIO;
521 goto out; 642 goto out;
522 } 643 }
523 } else { 644 } else
524 WARN_ON(!PageLocked(page));
525 SetPageUptodate(page); 645 SetPageUptodate(page);
526 }
527 } 646 }
528 } 647 }
529 648
530 ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); 649 ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
531 if (ret == AOP_TRUNCATED_PAGE) { 650 if (ret == AOP_TRUNCATED_PAGE) {
532 page_cache_release(page); 651 page_cache_release(page);
533 goto find_page; 652 goto find_page;
534 } else if (ret) 653 } else if (ret)
535 goto out; 654 goto out;
536 655
537 if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { 656 if (buf->page != page) {
538 char *dst = kmap_atomic(page, KM_USER0); 657 /*
658 * Careful, ->map() uses KM_USER0!
659 */
660 char *src = buf->ops->map(info, buf, 1);
661 char *dst = kmap_atomic(page, KM_USER1);
539 662
540 memcpy(dst + offset, src + buf->offset, sd->len); 663 memcpy(dst + offset, src + buf->offset, this_len);
541 flush_dcache_page(page); 664 flush_dcache_page(page);
542 kunmap_atomic(dst, KM_USER0); 665 kunmap_atomic(dst, KM_USER1);
666 buf->ops->unmap(info, buf, src);
543 } 667 }
544 668
545 ret = mapping->a_ops->commit_write(file, page, 0, sd->len); 669 ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
546 if (ret == AOP_TRUNCATED_PAGE) { 670 if (!ret) {
671 /*
672 * Return the number of bytes written and mark page as
673 * accessed, we are now done!
674 */
675 ret = this_len;
676 mark_page_accessed(page);
677 balance_dirty_pages_ratelimited(mapping);
678 } else if (ret == AOP_TRUNCATED_PAGE) {
547 page_cache_release(page); 679 page_cache_release(page);
548 goto find_page; 680 goto find_page;
549 } else if (ret)
550 goto out;
551
552 mark_page_accessed(page);
553 balance_dirty_pages_ratelimited(mapping);
554out:
555 if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) {
556 page_cache_release(page);
557 unlock_page(page);
558 } 681 }
682out:
683 page_cache_release(page);
684 unlock_page(page);
559out_nomem: 685out_nomem:
560 buf->ops->unmap(info, buf);
561 return ret; 686 return ret;
562} 687}
563 688
564typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
565 struct splice_desc *);
566
567/* 689/*
568 * Pipe input worker. Most of this logic works like a regular pipe, the 690 * Pipe input worker. Most of this logic works like a regular pipe, the
569 * key here is the 'actor' worker passed in that actually moves the data 691 * key here is the 'actor' worker passed in that actually moves the data
570 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. 692 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
571 */ 693 */
572static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, 694ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
573 loff_t *ppos, size_t len, unsigned int flags, 695 loff_t *ppos, size_t len, unsigned int flags,
574 splice_actor *actor) 696 splice_actor *actor)
575{ 697{
576 int ret, do_wakeup, err; 698 int ret, do_wakeup, err;
577 struct splice_desc sd; 699 struct splice_desc sd;
@@ -597,16 +719,22 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
597 sd.len = sd.total_len; 719 sd.len = sd.total_len;
598 720
599 err = actor(pipe, buf, &sd); 721 err = actor(pipe, buf, &sd);
600 if (err) { 722 if (err <= 0) {
601 if (!ret && err != -ENODATA) 723 if (!ret && err != -ENODATA)
602 ret = err; 724 ret = err;
603 725
604 break; 726 break;
605 } 727 }
606 728
607 ret += sd.len; 729 ret += err;
608 buf->offset += sd.len; 730 buf->offset += err;
609 buf->len -= sd.len; 731 buf->len -= err;
732
733 sd.len -= err;
734 sd.pos += err;
735 sd.total_len -= err;
736 if (sd.len)
737 continue;
610 738
611 if (!buf->len) { 739 if (!buf->len) {
612 buf->ops = NULL; 740 buf->ops = NULL;
@@ -617,8 +745,6 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
617 do_wakeup = 1; 745 do_wakeup = 1;
618 } 746 }
619 747
620 sd.pos += sd.len;
621 sd.total_len -= sd.len;
622 if (!sd.total_len) 748 if (!sd.total_len)
623 break; 749 break;
624 } 750 }
@@ -686,23 +812,27 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
686 struct address_space *mapping = out->f_mapping; 812 struct address_space *mapping = out->f_mapping;
687 ssize_t ret; 813 ssize_t ret;
688 814
689 ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); 815 ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
690 816 if (ret > 0) {
691 /*
692 * If file or inode is SYNC and we actually wrote some data, sync it.
693 */
694 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host))
695 && ret > 0) {
696 struct inode *inode = mapping->host; 817 struct inode *inode = mapping->host;
697 int err;
698 818
699 mutex_lock(&inode->i_mutex); 819 *ppos += ret;
700 err = generic_osync_inode(mapping->host, mapping,
701 OSYNC_METADATA|OSYNC_DATA);
702 mutex_unlock(&inode->i_mutex);
703 820
704 if (err) 821 /*
705 ret = err; 822 * If file or inode is SYNC and we actually wrote some data,
823 * sync it.
824 */
825 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
826 int err;
827
828 mutex_lock(&inode->i_mutex);
829 err = generic_osync_inode(inode, mapping,
830 OSYNC_METADATA|OSYNC_DATA);
831 mutex_unlock(&inode->i_mutex);
832
833 if (err)
834 ret = err;
835 }
706 } 836 }
707 837
708 return ret; 838 return ret;
@@ -724,7 +854,7 @@ EXPORT_SYMBOL(generic_file_splice_write);
724ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, 854ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
725 loff_t *ppos, size_t len, unsigned int flags) 855 loff_t *ppos, size_t len, unsigned int flags)
726{ 856{
727 return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); 857 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
728} 858}
729 859
730EXPORT_SYMBOL(generic_splice_sendpage); 860EXPORT_SYMBOL(generic_splice_sendpage);
@@ -811,7 +941,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
811 941
812 /* 942 /*
813 * We don't have an immediate reader, but we'll read the stuff 943 * We don't have an immediate reader, but we'll read the stuff
814 * out of the pipe right after the move_to_pipe(). So set 944 * out of the pipe right after the splice_to_pipe(). So set
815 * PIPE_READERS appropriately. 945 * PIPE_READERS appropriately.
816 */ 946 */
817 pipe->readers = 1; 947 pipe->readers = 1;
@@ -904,6 +1034,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
904{ 1034{
905 struct pipe_inode_info *pipe; 1035 struct pipe_inode_info *pipe;
906 loff_t offset, *off; 1036 loff_t offset, *off;
1037 long ret;
907 1038
908 pipe = in->f_dentry->d_inode->i_pipe; 1039 pipe = in->f_dentry->d_inode->i_pipe;
909 if (pipe) { 1040 if (pipe) {
@@ -918,7 +1049,12 @@ static long do_splice(struct file *in, loff_t __user *off_in,
918 } else 1049 } else
919 off = &out->f_pos; 1050 off = &out->f_pos;
920 1051
921 return do_splice_from(pipe, out, off, len, flags); 1052 ret = do_splice_from(pipe, out, off, len, flags);
1053
1054 if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1055 ret = -EFAULT;
1056
1057 return ret;
922 } 1058 }
923 1059
924 pipe = out->f_dentry->d_inode->i_pipe; 1060 pipe = out->f_dentry->d_inode->i_pipe;
@@ -934,12 +1070,195 @@ static long do_splice(struct file *in, loff_t __user *off_in,
934 } else 1070 } else
935 off = &in->f_pos; 1071 off = &in->f_pos;
936 1072
937 return do_splice_to(in, off, pipe, len, flags); 1073 ret = do_splice_to(in, off, pipe, len, flags);
1074
1075 if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1076 ret = -EFAULT;
1077
1078 return ret;
938 } 1079 }
939 1080
940 return -EINVAL; 1081 return -EINVAL;
941} 1082}
942 1083
1084/*
1085 * Map an iov into an array of pages and offset/length tupples. With the
1086 * partial_page structure, we can map several non-contiguous ranges into
1087 * our ones pages[] map instead of splitting that operation into pieces.
1088 * Could easily be exported as a generic helper for other users, in which
1089 * case one would probably want to add a 'max_nr_pages' parameter as well.
1090 */
1091static int get_iovec_page_array(const struct iovec __user *iov,
1092 unsigned int nr_vecs, struct page **pages,
1093 struct partial_page *partial, int aligned)
1094{
1095 int buffers = 0, error = 0;
1096
1097 /*
1098 * It's ok to take the mmap_sem for reading, even
1099 * across a "get_user()".
1100 */
1101 down_read(&current->mm->mmap_sem);
1102
1103 while (nr_vecs) {
1104 unsigned long off, npages;
1105 void __user *base;
1106 size_t len;
1107 int i;
1108
1109 /*
1110 * Get user address base and length for this iovec.
1111 */
1112 error = get_user(base, &iov->iov_base);
1113 if (unlikely(error))
1114 break;
1115 error = get_user(len, &iov->iov_len);
1116 if (unlikely(error))
1117 break;
1118
1119 /*
1120 * Sanity check this iovec. 0 read succeeds.
1121 */
1122 if (unlikely(!len))
1123 break;
1124 error = -EFAULT;
1125 if (unlikely(!base))
1126 break;
1127
1128 /*
1129 * Get this base offset and number of pages, then map
1130 * in the user pages.
1131 */
1132 off = (unsigned long) base & ~PAGE_MASK;
1133
1134 /*
1135 * If asked for alignment, the offset must be zero and the
1136 * length a multiple of the PAGE_SIZE.
1137 */
1138 error = -EINVAL;
1139 if (aligned && (off || len & ~PAGE_MASK))
1140 break;
1141
1142 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1143 if (npages > PIPE_BUFFERS - buffers)
1144 npages = PIPE_BUFFERS - buffers;
1145
1146 error = get_user_pages(current, current->mm,
1147 (unsigned long) base, npages, 0, 0,
1148 &pages[buffers], NULL);
1149
1150 if (unlikely(error <= 0))
1151 break;
1152
1153 /*
1154 * Fill this contiguous range into the partial page map.
1155 */
1156 for (i = 0; i < error; i++) {
1157 const int plen = min_t(size_t, len, PAGE_SIZE - off);
1158
1159 partial[buffers].offset = off;
1160 partial[buffers].len = plen;
1161
1162 off = 0;
1163 len -= plen;
1164 buffers++;
1165 }
1166
1167 /*
1168 * We didn't complete this iov, stop here since it probably
1169 * means we have to move some of this into a pipe to
1170 * be able to continue.
1171 */
1172 if (len)
1173 break;
1174
1175 /*
1176 * Don't continue if we mapped fewer pages than we asked for,
1177 * or if we mapped the max number of pages that we have
1178 * room for.
1179 */
1180 if (error < npages || buffers == PIPE_BUFFERS)
1181 break;
1182
1183 nr_vecs--;
1184 iov++;
1185 }
1186
1187 up_read(&current->mm->mmap_sem);
1188
1189 if (buffers)
1190 return buffers;
1191
1192 return error;
1193}
1194
1195/*
1196 * vmsplice splices a user address range into a pipe. It can be thought of
1197 * as splice-from-memory, where the regular splice is splice-from-file (or
1198 * to file). In both cases the output is a pipe, naturally.
1199 *
1200 * Note that vmsplice only supports splicing _from_ user memory to a pipe,
1201 * not the other way around. Splicing from user memory is a simple operation
1202 * that can be supported without any funky alignment restrictions or nasty
1203 * vm tricks. We simply map in the user memory and fill them into a pipe.
1204 * The reverse isn't quite as easy, though. There are two possible solutions
1205 * for that:
1206 *
1207 * - memcpy() the data internally, at which point we might as well just
1208 * do a regular read() on the buffer anyway.
1209 * - Lots of nasty vm tricks, that are neither fast nor flexible (it
1210 * has restriction limitations on both ends of the pipe).
1211 *
1212 * Alas, it isn't here.
1213 *
1214 */
1215static long do_vmsplice(struct file *file, const struct iovec __user *iov,
1216 unsigned long nr_segs, unsigned int flags)
1217{
1218 struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
1219 struct page *pages[PIPE_BUFFERS];
1220 struct partial_page partial[PIPE_BUFFERS];
1221 struct splice_pipe_desc spd = {
1222 .pages = pages,
1223 .partial = partial,
1224 .flags = flags,
1225 .ops = &user_page_pipe_buf_ops,
1226 };
1227
1228 if (unlikely(!pipe))
1229 return -EBADF;
1230 if (unlikely(nr_segs > UIO_MAXIOV))
1231 return -EINVAL;
1232 else if (unlikely(!nr_segs))
1233 return 0;
1234
1235 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
1236 flags & SPLICE_F_GIFT);
1237 if (spd.nr_pages <= 0)
1238 return spd.nr_pages;
1239
1240 return splice_to_pipe(pipe, &spd);
1241}
1242
1243asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
1244 unsigned long nr_segs, unsigned int flags)
1245{
1246 struct file *file;
1247 long error;
1248 int fput;
1249
1250 error = -EBADF;
1251 file = fget_light(fd, &fput);
1252 if (file) {
1253 if (file->f_mode & FMODE_WRITE)
1254 error = do_vmsplice(file, iov, nr_segs, flags);
1255
1256 fput_light(file, fput);
1257 }
1258
1259 return error;
1260}
1261
943asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, 1262asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
944 int fd_out, loff_t __user *off_out, 1263 int fd_out, loff_t __user *off_out,
945 size_t len, unsigned int flags) 1264 size_t len, unsigned int flags)
@@ -979,7 +1298,9 @@ static int link_pipe(struct pipe_inode_info *ipipe,
979 size_t len, unsigned int flags) 1298 size_t len, unsigned int flags)
980{ 1299{
981 struct pipe_buffer *ibuf, *obuf; 1300 struct pipe_buffer *ibuf, *obuf;
982 int ret = 0, do_wakeup = 0, i; 1301 int ret, do_wakeup, i, ipipe_first;
1302
1303 ret = do_wakeup = ipipe_first = 0;
983 1304
984 /* 1305 /*
985 * Potential ABBA deadlock, work around it by ordering lock 1306 * Potential ABBA deadlock, work around it by ordering lock
@@ -987,6 +1308,7 @@ static int link_pipe(struct pipe_inode_info *ipipe,
987 * could deadlock (one doing tee from A -> B, the other from B -> A). 1308 * could deadlock (one doing tee from A -> B, the other from B -> A).
988 */ 1309 */
989 if (ipipe->inode < opipe->inode) { 1310 if (ipipe->inode < opipe->inode) {
1311 ipipe_first = 1;
990 mutex_lock(&ipipe->inode->i_mutex); 1312 mutex_lock(&ipipe->inode->i_mutex);
991 mutex_lock(&opipe->inode->i_mutex); 1313 mutex_lock(&opipe->inode->i_mutex);
992 } else { 1314 } else {
@@ -1019,6 +1341,12 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1019 obuf = opipe->bufs + nbuf; 1341 obuf = opipe->bufs + nbuf;
1020 *obuf = *ibuf; 1342 *obuf = *ibuf;
1021 1343
1344 /*
1345 * Don't inherit the gift flag, we need to
1346 * prevent multiple steals of this page.
1347 */
1348 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1349
1022 if (obuf->len > len) 1350 if (obuf->len > len)
1023 obuf->len = len; 1351 obuf->len = len;
1024 1352
@@ -1035,9 +1363,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1035 1363
1036 /* 1364 /*
1037 * We have input available, but no output room. 1365 * We have input available, but no output room.
1038 * If we already copied data, return that. 1366 * If we already copied data, return that. If we
1367 * need to drop the opipe lock, it must be ordered
1368 * last to avoid deadlocks.
1039 */ 1369 */
1040 if (flags & SPLICE_F_NONBLOCK) { 1370 if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) {
1041 if (!ret) 1371 if (!ret)
1042 ret = -EAGAIN; 1372 ret = -EAGAIN;
1043 break; 1373 break;
@@ -1071,7 +1401,12 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1071 if (ret) 1401 if (ret)
1072 break; 1402 break;
1073 } 1403 }
1074 if (flags & SPLICE_F_NONBLOCK) { 1404 /*
1405 * pipe_wait() drops the ipipe mutex. To avoid deadlocks
1406 * with another process, we can only safely do that if
1407 * the ipipe lock is ordered last.
1408 */
1409 if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) {
1075 if (!ret) 1410 if (!ret)
1076 ret = -EAGAIN; 1411 ret = -EAGAIN;
1077 break; 1412 break;