aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/ia64/kernel/entry.S1
-rw-r--r--arch/powerpc/kernel/systbl.S1
-rw-r--r--arch/powerpc/platforms/cell/spu_callbacks.c1
-rw-r--r--fs/splice.c292
-rw-r--r--include/asm-i386/unistd.h3
-rw-r--r--include/asm-ia64/unistd.h3
-rw-r--r--include/asm-powerpc/unistd.h3
-rw-r--r--include/asm-x86_64/unistd.h4
-rw-r--r--include/linux/syscalls.h3
9 files changed, 268 insertions, 43 deletions
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index e30798811216..bcb80ca5cf40 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1610,5 +1610,6 @@ sys_call_table:
1610 data8 sys_get_robust_list 1610 data8 sys_get_robust_list
1611 data8 sys_sync_file_range // 1300 1611 data8 sys_sync_file_range // 1300
1612 data8 sys_tee 1612 data8 sys_tee
1613 data8 sys_vmsplice
1613 1614
1614 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls 1615 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls
diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
index 8d1522690501..0b98eea73c5e 100644
--- a/arch/powerpc/kernel/systbl.S
+++ b/arch/powerpc/kernel/systbl.S
@@ -324,6 +324,7 @@ COMPAT_SYS(ppoll)
324SYSCALL(unshare) 324SYSCALL(unshare)
325SYSCALL(splice) 325SYSCALL(splice)
326SYSCALL(tee) 326SYSCALL(tee)
327SYSCALL(vmsplice)
327 328
328/* 329/*
329 * please add new calls to arch/powerpc/platforms/cell/spu_callbacks.c 330 * please add new calls to arch/powerpc/platforms/cell/spu_callbacks.c
diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c
index deb3afb94484..b283380a2a18 100644
--- a/arch/powerpc/platforms/cell/spu_callbacks.c
+++ b/arch/powerpc/platforms/cell/spu_callbacks.c
@@ -318,6 +318,7 @@ void *spu_syscall_table[] = {
318 [__NR_unshare] sys_unshare, 318 [__NR_unshare] sys_unshare,
319 [__NR_splice] sys_splice, 319 [__NR_splice] sys_splice,
320 [__NR_tee] sys_tee, 320 [__NR_tee] sys_tee,
321 [__NR_vmsplice] sys_vmsplice,
321}; 322};
322 323
323long spu_sys_callback(struct spu_syscall_block *s) 324long spu_sys_callback(struct spu_syscall_block *s)
diff --git a/fs/splice.c b/fs/splice.c
index 8c6030c762e2..0b2c1f060cae 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -27,6 +27,7 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/syscalls.h> 29#include <linux/syscalls.h>
30#include <linux/uio.h>
30 31
31/* 32/*
32 * Passed to the actors 33 * Passed to the actors
@@ -38,6 +39,22 @@ struct splice_desc {
38 loff_t pos; /* file position */ 39 loff_t pos; /* file position */
39}; 40};
40 41
/*
 * Describes the valid byte range inside one page of a page map: where the
 * data starts within the page and how many bytes of it there are.
 */
struct partial_page {
	unsigned int offset;	/* byte offset of the data within the page */
	unsigned int len;	/* number of bytes of data in the page */
};
46
/*
 * Passed to move_to_pipe: the pages to add to the pipe, the byte range to
 * use within each of them, and the buffer operations to attach.
 */
struct splice_pipe_desc {
	struct page **pages;		/* page map */
	struct partial_page *partial;	/* pages[] may not be contig */
	int nr_pages;			/* number of pages in map */
	unsigned int flags;		/* splice flags */
	struct pipe_buf_operations *ops;/* ops associated with output pipe */
};
57
41/* 58/*
42 * Attempt to steal a page from a pipe buffer. This should perhaps go into 59 * Attempt to steal a page from a pipe buffer. This should perhaps go into
43 * a vm helper function, it's already simplified quite a bit by the 60 * a vm helper function, it's already simplified quite a bit by the
@@ -128,6 +145,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
128 kunmap(buf->page); 145 kunmap(buf->page);
129} 146}
130 147
148static void *user_page_pipe_buf_map(struct file *file,
149 struct pipe_inode_info *pipe,
150 struct pipe_buffer *buf)
151{
152 return kmap(buf->page);
153}
154
155static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
156 struct pipe_buffer *buf)
157{
158 kunmap(buf->page);
159}
160
131static void page_cache_pipe_buf_get(struct pipe_inode_info *info, 161static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
132 struct pipe_buffer *buf) 162 struct pipe_buffer *buf)
133{ 163{
@@ -143,19 +173,33 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = {
143 .get = page_cache_pipe_buf_get, 173 .get = page_cache_pipe_buf_get,
144}; 174};
145 175
/*
 * ->steal() hook for pipe buffers backed by user pages. Unconditionally
 * returns 1 without touching @pipe or @buf.
 *
 * NOTE(review): presumably a non-zero return tells the caller the page
 * could not be stolen, so user pages are never handed over -- confirm
 * against the ->steal() contract.
 */
static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	return 1;
}
181
182static struct pipe_buf_operations user_page_pipe_buf_ops = {
183 .can_merge = 0,
184 .map = user_page_pipe_buf_map,
185 .unmap = user_page_pipe_buf_unmap,
186 .release = page_cache_pipe_buf_release,
187 .steal = user_page_pipe_buf_steal,
188 .get = page_cache_pipe_buf_get,
189};
190
146/* 191/*
147 * Pipe output worker. This sets up our pipe format with the page cache 192 * Pipe output worker. This sets up our pipe format with the page cache
148 * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). 193 * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
149 */ 194 */
150static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, 195static ssize_t move_to_pipe(struct pipe_inode_info *pipe,
151 int nr_pages, unsigned long len, 196 struct splice_pipe_desc *spd)
152 unsigned int offset, unsigned int flags)
153{ 197{
154 int ret, do_wakeup, i; 198 int ret, do_wakeup, page_nr;
155 199
156 ret = 0; 200 ret = 0;
157 do_wakeup = 0; 201 do_wakeup = 0;
158 i = 0; 202 page_nr = 0;
159 203
160 if (pipe->inode) 204 if (pipe->inode)
161 mutex_lock(&pipe->inode->i_mutex); 205 mutex_lock(&pipe->inode->i_mutex);
@@ -171,27 +215,19 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
171 if (pipe->nrbufs < PIPE_BUFFERS) { 215 if (pipe->nrbufs < PIPE_BUFFERS) {
172 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); 216 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
173 struct pipe_buffer *buf = pipe->bufs + newbuf; 217 struct pipe_buffer *buf = pipe->bufs + newbuf;
174 struct page *page = pages[i++];
175 unsigned long this_len;
176 218
177 this_len = PAGE_CACHE_SIZE - offset; 219 buf->page = spd->pages[page_nr];
178 if (this_len > len) 220 buf->offset = spd->partial[page_nr].offset;
179 this_len = len; 221 buf->len = spd->partial[page_nr].len;
180 222 buf->ops = spd->ops;
181 buf->page = page;
182 buf->offset = offset;
183 buf->len = this_len;
184 buf->ops = &page_cache_pipe_buf_ops;
185 pipe->nrbufs++; 223 pipe->nrbufs++;
224 page_nr++;
225 ret += buf->len;
226
186 if (pipe->inode) 227 if (pipe->inode)
187 do_wakeup = 1; 228 do_wakeup = 1;
188 229
189 ret += this_len; 230 if (!--spd->nr_pages)
190 len -= this_len;
191 offset = 0;
192 if (!--nr_pages)
193 break;
194 if (!len)
195 break; 231 break;
196 if (pipe->nrbufs < PIPE_BUFFERS) 232 if (pipe->nrbufs < PIPE_BUFFERS)
197 continue; 233 continue;
@@ -199,7 +235,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
199 break; 235 break;
200 } 236 }
201 237
202 if (flags & SPLICE_F_NONBLOCK) { 238 if (spd->flags & SPLICE_F_NONBLOCK) {
203 if (!ret) 239 if (!ret)
204 ret = -EAGAIN; 240 ret = -EAGAIN;
205 break; 241 break;
@@ -234,8 +270,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
234 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 270 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
235 } 271 }
236 272
237 while (i < nr_pages) 273 while (page_nr < spd->nr_pages)
238 page_cache_release(pages[i++]); 274 page_cache_release(spd->pages[page_nr++]);
239 275
240 return ret; 276 return ret;
241} 277}
@@ -246,17 +282,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
246 unsigned int flags) 282 unsigned int flags)
247{ 283{
248 struct address_space *mapping = in->f_mapping; 284 struct address_space *mapping = in->f_mapping;
249 unsigned int loff, offset, nr_pages; 285 unsigned int loff, nr_pages;
250 struct page *pages[PIPE_BUFFERS]; 286 struct page *pages[PIPE_BUFFERS];
287 struct partial_page partial[PIPE_BUFFERS];
251 struct page *page; 288 struct page *page;
252 pgoff_t index, end_index; 289 pgoff_t index, end_index;
253 loff_t isize; 290 loff_t isize;
254 size_t bytes; 291 size_t total_len;
255 int i, error; 292 int error;
293 struct splice_pipe_desc spd = {
294 .pages = pages,
295 .partial = partial,
296 .flags = flags,
297 .ops = &page_cache_pipe_buf_ops,
298 };
256 299
257 index = *ppos >> PAGE_CACHE_SHIFT; 300 index = *ppos >> PAGE_CACHE_SHIFT;
258 loff = offset = *ppos & ~PAGE_CACHE_MASK; 301 loff = *ppos & ~PAGE_CACHE_MASK;
259 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 302 nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
260 303
261 if (nr_pages > PIPE_BUFFERS) 304 if (nr_pages > PIPE_BUFFERS)
262 nr_pages = PIPE_BUFFERS; 305 nr_pages = PIPE_BUFFERS;
@@ -266,15 +309,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
266 * read-ahead if this is a non-zero offset (we are likely doing small 309 * read-ahead if this is a non-zero offset (we are likely doing small
267 * chunk splice and the page is already there) for a single page. 310 * chunk splice and the page is already there) for a single page.
268 */ 311 */
269 if (!offset || nr_pages > 1) 312 if (!loff || spd.nr_pages > 1)
270 do_page_cache_readahead(mapping, in, index, nr_pages); 313 do_page_cache_readahead(mapping, in, index, spd.nr_pages);
271 314
272 /* 315 /*
273 * Now fill in the holes: 316 * Now fill in the holes:
274 */ 317 */
275 error = 0; 318 error = 0;
276 bytes = 0; 319 total_len = 0;
277 for (i = 0; i < nr_pages; i++, index++) { 320 for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) {
278 unsigned int this_len; 321 unsigned int this_len;
279 322
280 if (!len) 323 if (!len)
@@ -367,26 +410,29 @@ readpage:
367 */ 410 */
368 if (end_index == index) { 411 if (end_index == index) {
369 loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); 412 loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
370 if (bytes + loff > isize) { 413 if (total_len + loff > isize) {
371 page_cache_release(page); 414 page_cache_release(page);
372 break; 415 break;
373 } 416 }
374 /* 417 /*
375 * force quit after adding this page 418 * force quit after adding this page
376 */ 419 */
377 nr_pages = i; 420 nr_pages = spd.nr_pages;
378 this_len = min(this_len, loff); 421 this_len = min(this_len, loff);
422 loff = 0;
379 } 423 }
380 } 424 }
381fill_it: 425fill_it:
382 pages[i] = page; 426 pages[spd.nr_pages] = page;
383 bytes += this_len; 427 partial[spd.nr_pages].offset = loff;
428 partial[spd.nr_pages].len = this_len;
384 len -= this_len; 429 len -= this_len;
430 total_len += this_len;
385 loff = 0; 431 loff = 0;
386 } 432 }
387 433
388 if (i) 434 if (spd.nr_pages)
389 return move_to_pipe(pipe, pages, i, bytes, offset, flags); 435 return move_to_pipe(pipe, &spd);
390 436
391 return error; 437 return error;
392} 438}
@@ -1018,6 +1064,174 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1018 return -EINVAL; 1064 return -EINVAL;
1019} 1065}
1020 1066
1067/*
1068 * Map an iov into an array of pages and offset/length tupples. With the
1069 * partial_page structure, we can map several non-contiguous ranges into
1070 * our ones pages[] map instead of splitting that operation into pieces.
1071 * Could easily be exported as a generic helper for other users, in which
1072 * case one would probably want to add a 'max_nr_pages' parameter as well.
1073 */
1074static int get_iovec_page_array(const struct iovec __user *iov,
1075 unsigned int nr_vecs, struct page **pages,
1076 struct partial_page *partial)
1077{
1078 int buffers = 0, error = 0;
1079
1080 /*
1081 * It's ok to take the mmap_sem for reading, even
1082 * across a "get_user()".
1083 */
1084 down_read(&current->mm->mmap_sem);
1085
1086 while (nr_vecs) {
1087 unsigned long off, npages;
1088 void __user *base;
1089 size_t len;
1090 int i;
1091
1092 /*
1093 * Get user address base and length for this iovec.
1094 */
1095 error = get_user(base, &iov->iov_base);
1096 if (unlikely(error))
1097 break;
1098 error = get_user(len, &iov->iov_len);
1099 if (unlikely(error))
1100 break;
1101
1102 /*
1103 * Sanity check this iovec. 0 read succeeds.
1104 */
1105 if (unlikely(!len))
1106 break;
1107 error = -EFAULT;
1108 if (unlikely(!base))
1109 break;
1110
1111 /*
1112 * Get this base offset and number of pages, then map
1113 * in the user pages.
1114 */
1115 off = (unsigned long) base & ~PAGE_MASK;
1116 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1117 if (npages > PIPE_BUFFERS - buffers)
1118 npages = PIPE_BUFFERS - buffers;
1119
1120 error = get_user_pages(current, current->mm,
1121 (unsigned long) base, npages, 0, 0,
1122 &pages[buffers], NULL);
1123
1124 if (unlikely(error <= 0))
1125 break;
1126
1127 /*
1128 * Fill this contiguous range into the partial page map.
1129 */
1130 for (i = 0; i < error; i++) {
1131 const int plen = min_t(size_t, len, PAGE_SIZE) - off;
1132
1133 partial[buffers].offset = off;
1134 partial[buffers].len = plen;
1135
1136 off = 0;
1137 len -= plen;
1138 buffers++;
1139 }
1140
1141 /*
1142 * We didn't complete this iov, stop here since it probably
1143 * means we have to move some of this into a pipe to
1144 * be able to continue.
1145 */
1146 if (len)
1147 break;
1148
1149 /*
1150 * Don't continue if we mapped fewer pages than we asked for,
1151 * or if we mapped the max number of pages that we have
1152 * room for.
1153 */
1154 if (error < npages || buffers == PIPE_BUFFERS)
1155 break;
1156
1157 nr_vecs--;
1158 iov++;
1159 }
1160
1161 up_read(&current->mm->mmap_sem);
1162
1163 if (buffers)
1164 return buffers;
1165
1166 return error;
1167}
1168
1169/*
1170 * vmsplice splices a user address range into a pipe. It can be thought of
1171 * as splice-from-memory, where the regular splice is splice-from-file (or
1172 * to file). In both cases the output is a pipe, naturally.
1173 *
1174 * Note that vmsplice only supports splicing _from_ user memory to a pipe,
1175 * not the other way around. Splicing from user memory is a simple operation
1176 * that can be supported without any funky alignment restrictions or nasty
1177 * vm tricks. We simply map in the user memory and fill them into a pipe.
1178 * The reverse isn't quite as easy, though. There are two possible solutions
1179 * for that:
1180 *
1181 * - memcpy() the data internally, at which point we might as well just
1182 * do a regular read() on the buffer anyway.
1183 * - Lots of nasty vm tricks, that are neither fast nor flexible (it
1184 * has restriction limitations on both ends of the pipe).
1185 *
1186 * Alas, it isn't here.
1187 *
1188 */
1189static long do_vmsplice(struct file *file, const struct iovec __user *iov,
1190 unsigned long nr_segs, unsigned int flags)
1191{
1192 struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
1193 struct page *pages[PIPE_BUFFERS];
1194 struct partial_page partial[PIPE_BUFFERS];
1195 struct splice_pipe_desc spd = {
1196 .pages = pages,
1197 .partial = partial,
1198 .flags = flags,
1199 .ops = &user_page_pipe_buf_ops,
1200 };
1201
1202 if (unlikely(!pipe))
1203 return -EBADF;
1204 if (unlikely(nr_segs > UIO_MAXIOV))
1205 return -EINVAL;
1206 else if (unlikely(!nr_segs))
1207 return 0;
1208
1209 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial);
1210 if (spd.nr_pages <= 0)
1211 return spd.nr_pages;
1212
1213 return move_to_pipe(pipe, &spd);
1214}
1215
1216asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
1217 unsigned long nr_segs, unsigned int flags)
1218{
1219 struct file *file;
1220 long error;
1221 int fput;
1222
1223 error = -EBADF;
1224 file = fget_light(fd, &fput);
1225 if (file) {
1226 if (file->f_mode & FMODE_WRITE)
1227 error = do_vmsplice(file, iov, nr_segs, flags);
1228
1229 fput_light(file, fput);
1230 }
1231
1232 return error;
1233}
1234
1021asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, 1235asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
1022 int fd_out, loff_t __user *off_out, 1236 int fd_out, loff_t __user *off_out,
1023 size_t len, unsigned int flags) 1237 size_t len, unsigned int flags)
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index d81d6cfc1bb4..eb4b152c82fc 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -321,8 +321,9 @@
321#define __NR_splice 313 321#define __NR_splice 313
322#define __NR_sync_file_range 314 322#define __NR_sync_file_range 314
323#define __NR_tee 315 323#define __NR_tee 315
324#define __NR_vmsplice 316
324 325
325#define NR_syscalls 316 326#define NR_syscalls 317
326 327
327/* 328/*
328 * user-visible error numbers are in the range -1 - -128: see 329 * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index a40ebec6aeeb..7107763168bf 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -290,12 +290,13 @@
290#define __NR_get_robust_list 1299 290#define __NR_get_robust_list 1299
291#define __NR_sync_file_range 1300 291#define __NR_sync_file_range 1300
292#define __NR_tee 1301 292#define __NR_tee 1301
293#define __NR_vmsplice 1302
293 294
294#ifdef __KERNEL__ 295#ifdef __KERNEL__
295 296
296#include <linux/config.h> 297#include <linux/config.h>
297 298
298#define NR_syscalls 278 /* length of syscall table */ 299#define NR_syscalls 279 /* length of syscall table */
299 300
300#define __ARCH_WANT_SYS_RT_SIGACTION 301#define __ARCH_WANT_SYS_RT_SIGACTION
301 302
diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
index c612f1a62772..34325e292596 100644
--- a/include/asm-powerpc/unistd.h
+++ b/include/asm-powerpc/unistd.h
@@ -303,8 +303,9 @@
303#define __NR_unshare 282 303#define __NR_unshare 282
304#define __NR_splice 283 304#define __NR_splice 283
305#define __NR_tee 284 305#define __NR_tee 284
306#define __NR_vmsplice 285
306 307
307#define __NR_syscalls 285 308#define __NR_syscalls 286
308 309
309#ifdef __KERNEL__ 310#ifdef __KERNEL__
310#define __NR__exit __NR_exit 311#define __NR__exit __NR_exit
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 98c36eae567c..feb77cb8c044 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -615,8 +615,10 @@ __SYSCALL(__NR_splice, sys_splice)
615__SYSCALL(__NR_tee, sys_tee) 615__SYSCALL(__NR_tee, sys_tee)
616#define __NR_sync_file_range 277 616#define __NR_sync_file_range 277
617__SYSCALL(__NR_sync_file_range, sys_sync_file_range) 617__SYSCALL(__NR_sync_file_range, sys_sync_file_range)
618#define __NR_vmsplice 278
619__SYSCALL(__NR_vmsplice, sys_vmsplice)
618 620
619#define __NR_syscall_max __NR_sync_file_range 621#define __NR_syscall_max __NR_vmsplice
620 622
621#ifndef __NO_STUBS 623#ifndef __NO_STUBS
622 624
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d3ebc0e68b2b..3996960fc565 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -574,6 +574,9 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
574 int fd_out, loff_t __user *off_out, 574 int fd_out, loff_t __user *off_out,
575 size_t len, unsigned int flags); 575 size_t len, unsigned int flags);
576 576
577asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
578 unsigned long nr_segs, unsigned int flags);
579
577asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags); 580asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags);
578 581
579asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, 582asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,