aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorJens Axboe <axboe@suse.de>2006-04-26 04:59:21 -0400
committerJens Axboe <axboe@suse.de>2006-04-26 04:59:21 -0400
commit912d35f86781e64d73be1ef358f703c08905ac37 (patch)
tree5863ad70f5cdccba0c1cd28f83d173deaf71fb83 /fs
parent016b661e2f717168e600f3c85f29e1a49f88e004 (diff)
[PATCH] Add support for the sys_vmsplice syscall
sys_splice() moves data to/from pipes with a file input/output. sys_vmsplice() moves data to a pipe, with the input being a user address range instead. This uses an approach suggested by Linus, where we can hold partial ranges inside the pages[] map. Hopefully this will be useful for network receive support as well. Signed-off-by: Jens Axboe <axboe@suse.de>
Diffstat (limited to 'fs')
-rw-r--r--fs/splice.c292
1 files changed, 253 insertions, 39 deletions
diff --git a/fs/splice.c b/fs/splice.c
index 8c6030c762e2..0b2c1f060cae 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -27,6 +27,7 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/syscalls.h> 29#include <linux/syscalls.h>
30#include <linux/uio.h>
30 31
31/* 32/*
32 * Passed to the actors 33 * Passed to the actors
@@ -38,6 +39,22 @@ struct splice_desc {
38 loff_t pos; /* file position */ 39 loff_t pos; /* file position */
39}; 40};
40 41
42struct partial_page {
43 unsigned int offset;
44 unsigned int len;
45};
46
47/*
48 * Passed to move_to_pipe
49 */
50struct splice_pipe_desc {
51 struct page **pages; /* page map */
52 struct partial_page *partial; /* pages[] may not be contig */
53 int nr_pages; /* number of pages in map */
54 unsigned int flags; /* splice flags */
55 struct pipe_buf_operations *ops;/* ops associated with output pipe */
56};
57
41/* 58/*
42 * Attempt to steal a page from a pipe buffer. This should perhaps go into 59 * Attempt to steal a page from a pipe buffer. This should perhaps go into
43 * a vm helper function, it's already simplified quite a bit by the 60 * a vm helper function, it's already simplified quite a bit by the
@@ -128,6 +145,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
128 kunmap(buf->page); 145 kunmap(buf->page);
129} 146}
130 147
148static void *user_page_pipe_buf_map(struct file *file,
149 struct pipe_inode_info *pipe,
150 struct pipe_buffer *buf)
151{
152 return kmap(buf->page);
153}
154
155static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
156 struct pipe_buffer *buf)
157{
158 kunmap(buf->page);
159}
160
131static void page_cache_pipe_buf_get(struct pipe_inode_info *info, 161static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
132 struct pipe_buffer *buf) 162 struct pipe_buffer *buf)
133{ 163{
@@ -143,19 +173,33 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = {
143 .get = page_cache_pipe_buf_get, 173 .get = page_cache_pipe_buf_get,
144}; 174};
145 175
176static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
177 struct pipe_buffer *buf)
178{
179 return 1;
180}
181
182static struct pipe_buf_operations user_page_pipe_buf_ops = {
183 .can_merge = 0,
184 .map = user_page_pipe_buf_map,
185 .unmap = user_page_pipe_buf_unmap,
186 .release = page_cache_pipe_buf_release,
187 .steal = user_page_pipe_buf_steal,
188 .get = page_cache_pipe_buf_get,
189};
190
146/* 191/*
147 * Pipe output worker. This sets up our pipe format with the page cache 192 * Pipe output worker. This sets up our pipe format with the page cache
148 * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). 193 * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
149 */ 194 */
150static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, 195static ssize_t move_to_pipe(struct pipe_inode_info *pipe,
151 int nr_pages, unsigned long len, 196 struct splice_pipe_desc *spd)
152 unsigned int offset, unsigned int flags)
153{ 197{
154 int ret, do_wakeup, i; 198 int ret, do_wakeup, page_nr;
155 199
156 ret = 0; 200 ret = 0;
157 do_wakeup = 0; 201 do_wakeup = 0;
158 i = 0; 202 page_nr = 0;
159 203
160 if (pipe->inode) 204 if (pipe->inode)
161 mutex_lock(&pipe->inode->i_mutex); 205 mutex_lock(&pipe->inode->i_mutex);
@@ -171,27 +215,19 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
171 if (pipe->nrbufs < PIPE_BUFFERS) { 215 if (pipe->nrbufs < PIPE_BUFFERS) {
172 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); 216 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
173 struct pipe_buffer *buf = pipe->bufs + newbuf; 217 struct pipe_buffer *buf = pipe->bufs + newbuf;
174 struct page *page = pages[i++];
175 unsigned long this_len;
176 218
177 this_len = PAGE_CACHE_SIZE - offset; 219 buf->page = spd->pages[page_nr];
178 if (this_len > len) 220 buf->offset = spd->partial[page_nr].offset;
179 this_len = len; 221 buf->len = spd->partial[page_nr].len;
180 222 buf->ops = spd->ops;
181 buf->page = page;
182 buf->offset = offset;
183 buf->len = this_len;
184 buf->ops = &page_cache_pipe_buf_ops;
185 pipe->nrbufs++; 223 pipe->nrbufs++;
224 page_nr++;
225 ret += buf->len;
226
186 if (pipe->inode) 227 if (pipe->inode)
187 do_wakeup = 1; 228 do_wakeup = 1;
188 229
189 ret += this_len; 230 if (!--spd->nr_pages)
190 len -= this_len;
191 offset = 0;
192 if (!--nr_pages)
193 break;
194 if (!len)
195 break; 231 break;
196 if (pipe->nrbufs < PIPE_BUFFERS) 232 if (pipe->nrbufs < PIPE_BUFFERS)
197 continue; 233 continue;
@@ -199,7 +235,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
199 break; 235 break;
200 } 236 }
201 237
202 if (flags & SPLICE_F_NONBLOCK) { 238 if (spd->flags & SPLICE_F_NONBLOCK) {
203 if (!ret) 239 if (!ret)
204 ret = -EAGAIN; 240 ret = -EAGAIN;
205 break; 241 break;
@@ -234,8 +270,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
234 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 270 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
235 } 271 }
236 272
237 while (i < nr_pages) 273 while (page_nr < spd->nr_pages)
238 page_cache_release(pages[i++]); 274 page_cache_release(spd->pages[page_nr++]);
239 275
240 return ret; 276 return ret;
241} 277}
@@ -246,17 +282,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
246 unsigned int flags) 282 unsigned int flags)
247{ 283{
248 struct address_space *mapping = in->f_mapping; 284 struct address_space *mapping = in->f_mapping;
249 unsigned int loff, offset, nr_pages; 285 unsigned int loff, nr_pages;
250 struct page *pages[PIPE_BUFFERS]; 286 struct page *pages[PIPE_BUFFERS];
287 struct partial_page partial[PIPE_BUFFERS];
251 struct page *page; 288 struct page *page;
252 pgoff_t index, end_index; 289 pgoff_t index, end_index;
253 loff_t isize; 290 loff_t isize;
254 size_t bytes; 291 size_t total_len;
255 int i, error; 292 int error;
293 struct splice_pipe_desc spd = {
294 .pages = pages,
295 .partial = partial,
296 .flags = flags,
297 .ops = &page_cache_pipe_buf_ops,
298 };
256 299
257 index = *ppos >> PAGE_CACHE_SHIFT; 300 index = *ppos >> PAGE_CACHE_SHIFT;
258 loff = offset = *ppos & ~PAGE_CACHE_MASK; 301 loff = *ppos & ~PAGE_CACHE_MASK;
259 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 302 nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
260 303
261 if (nr_pages > PIPE_BUFFERS) 304 if (nr_pages > PIPE_BUFFERS)
262 nr_pages = PIPE_BUFFERS; 305 nr_pages = PIPE_BUFFERS;
@@ -266,15 +309,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
266 * read-ahead if this is a non-zero offset (we are likely doing small 309 * read-ahead if this is a non-zero offset (we are likely doing small
267 * chunk splice and the page is already there) for a single page. 310 * chunk splice and the page is already there) for a single page.
268 */ 311 */
269 if (!offset || nr_pages > 1) 312 if (!loff || spd.nr_pages > 1)
270 do_page_cache_readahead(mapping, in, index, nr_pages); 313 do_page_cache_readahead(mapping, in, index, spd.nr_pages);
271 314
272 /* 315 /*
273 * Now fill in the holes: 316 * Now fill in the holes:
274 */ 317 */
275 error = 0; 318 error = 0;
276 bytes = 0; 319 total_len = 0;
277 for (i = 0; i < nr_pages; i++, index++) { 320 for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) {
278 unsigned int this_len; 321 unsigned int this_len;
279 322
280 if (!len) 323 if (!len)
@@ -367,26 +410,29 @@ readpage:
367 */ 410 */
368 if (end_index == index) { 411 if (end_index == index) {
369 loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); 412 loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
370 if (bytes + loff > isize) { 413 if (total_len + loff > isize) {
371 page_cache_release(page); 414 page_cache_release(page);
372 break; 415 break;
373 } 416 }
374 /* 417 /*
375 * force quit after adding this page 418 * force quit after adding this page
376 */ 419 */
377 nr_pages = i; 420 nr_pages = spd.nr_pages;
378 this_len = min(this_len, loff); 421 this_len = min(this_len, loff);
422 loff = 0;
379 } 423 }
380 } 424 }
381fill_it: 425fill_it:
382 pages[i] = page; 426 pages[spd.nr_pages] = page;
383 bytes += this_len; 427 partial[spd.nr_pages].offset = loff;
428 partial[spd.nr_pages].len = this_len;
384 len -= this_len; 429 len -= this_len;
430 total_len += this_len;
385 loff = 0; 431 loff = 0;
386 } 432 }
387 433
388 if (i) 434 if (spd.nr_pages)
389 return move_to_pipe(pipe, pages, i, bytes, offset, flags); 435 return move_to_pipe(pipe, &spd);
390 436
391 return error; 437 return error;
392} 438}
@@ -1018,6 +1064,174 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1018 return -EINVAL; 1064 return -EINVAL;
1019} 1065}
1020 1066
1067/*
1068 * Map an iov into an array of pages and offset/length tuples. With the
1069 * partial_page structure, we can map several non-contiguous ranges into
1070 * our one pages[] map instead of splitting that operation into pieces.
1071 * Could easily be exported as a generic helper for other users, in which
1072 * case one would probably want to add a 'max_nr_pages' parameter as well.
1073 */
1074static int get_iovec_page_array(const struct iovec __user *iov,
1075 unsigned int nr_vecs, struct page **pages,
1076 struct partial_page *partial)
1077{
1078 int buffers = 0, error = 0;
1079
1080 /*
1081 * It's ok to take the mmap_sem for reading, even
1082 * across a "get_user()".
1083 */
1084 down_read(&current->mm->mmap_sem);
1085
1086 while (nr_vecs) {
1087 unsigned long off, npages;
1088 void __user *base;
1089 size_t len;
1090 int i;
1091
1092 /*
1093 * Get user address base and length for this iovec.
1094 */
1095 error = get_user(base, &iov->iov_base);
1096 if (unlikely(error))
1097 break;
1098 error = get_user(len, &iov->iov_len);
1099 if (unlikely(error))
1100 break;
1101
1102 /*
1103 * Sanity check this iovec. 0 read succeeds.
1104 */
1105 if (unlikely(!len))
1106 break;
1107 error = -EFAULT;
1108 if (unlikely(!base))
1109 break;
1110
1111 /*
1112 * Get this base offset and number of pages, then map
1113 * in the user pages.
1114 */
1115 off = (unsigned long) base & ~PAGE_MASK;
1116 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1117 if (npages > PIPE_BUFFERS - buffers)
1118 npages = PIPE_BUFFERS - buffers;
1119
1120 error = get_user_pages(current, current->mm,
1121 (unsigned long) base, npages, 0, 0,
1122 &pages[buffers], NULL);
1123
1124 if (unlikely(error <= 0))
1125 break;
1126
1127 /*
1128 * Fill this contiguous range into the partial page map.
1129 */
1130 for (i = 0; i < error; i++) {
1131 const int plen = min_t(size_t, len, PAGE_SIZE) - off;
1132
1133 partial[buffers].offset = off;
1134 partial[buffers].len = plen;
1135
1136 off = 0;
1137 len -= plen;
1138 buffers++;
1139 }
1140
1141 /*
1142 * We didn't complete this iov, stop here since it probably
1143 * means we have to move some of this into a pipe to
1144 * be able to continue.
1145 */
1146 if (len)
1147 break;
1148
1149 /*
1150 * Don't continue if we mapped fewer pages than we asked for,
1151 * or if we mapped the max number of pages that we have
1152 * room for.
1153 */
1154 if (error < npages || buffers == PIPE_BUFFERS)
1155 break;
1156
1157 nr_vecs--;
1158 iov++;
1159 }
1160
1161 up_read(&current->mm->mmap_sem);
1162
1163 if (buffers)
1164 return buffers;
1165
1166 return error;
1167}
1168
1169/*
1170 * vmsplice splices a user address range into a pipe. It can be thought of
1171 * as splice-from-memory, where the regular splice is splice-from-file (or
1172 * to file). In both cases the output is a pipe, naturally.
1173 *
1174 * Note that vmsplice only supports splicing _from_ user memory to a pipe,
1175 * not the other way around. Splicing from user memory is a simple operation
1176 * that can be supported without any funky alignment restrictions or nasty
1177 * vm tricks. We simply map in the user memory and fill them into a pipe.
1178 * The reverse isn't quite as easy, though. There are two possible solutions
1179 * for that:
1180 *
1181 * - memcpy() the data internally, at which point we might as well just
1182 * do a regular read() on the buffer anyway.
1183 * - Lots of nasty vm tricks, that are neither fast nor flexible (it
1184 * has restriction limitations on both ends of the pipe).
1185 *
1186 * Alas, it isn't here.
1187 *
1188 */
1189static long do_vmsplice(struct file *file, const struct iovec __user *iov,
1190 unsigned long nr_segs, unsigned int flags)
1191{
1192 struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
1193 struct page *pages[PIPE_BUFFERS];
1194 struct partial_page partial[PIPE_BUFFERS];
1195 struct splice_pipe_desc spd = {
1196 .pages = pages,
1197 .partial = partial,
1198 .flags = flags,
1199 .ops = &user_page_pipe_buf_ops,
1200 };
1201
1202 if (unlikely(!pipe))
1203 return -EBADF;
1204 if (unlikely(nr_segs > UIO_MAXIOV))
1205 return -EINVAL;
1206 else if (unlikely(!nr_segs))
1207 return 0;
1208
1209 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial);
1210 if (spd.nr_pages <= 0)
1211 return spd.nr_pages;
1212
1213 return move_to_pipe(pipe, &spd);
1214}
1215
1216asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
1217 unsigned long nr_segs, unsigned int flags)
1218{
1219 struct file *file;
1220 long error;
1221 int fput;
1222
1223 error = -EBADF;
1224 file = fget_light(fd, &fput);
1225 if (file) {
1226 if (file->f_mode & FMODE_WRITE)
1227 error = do_vmsplice(file, iov, nr_segs, flags);
1228
1229 fput_light(file, fput);
1230 }
1231
1232 return error;
1233}
1234
1021asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, 1235asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
1022 int fd_out, loff_t __user *off_out, 1236 int fd_out, loff_t __user *off_out,
1023 size_t len, unsigned int flags) 1237 size_t len, unsigned int flags)