author     David Woodhouse <dwmw2@shinybook.infradead.org>  2006-04-28 20:42:26 -0400
committer  David Woodhouse <dwmw2@infradead.org>            2006-04-28 20:42:26 -0400
commit     d6754b401a15eaa16492ea5dbaa4826361d3f411 (patch)
tree       032f067d3af458527d903a7653885404ed82431e /fs
parent     acc429a517bd11fdcac9bea97d082d26231beb92 (diff)
parent     693f7d362055261882659475d2ef022e32edbff1 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'fs')
-rw-r--r--  fs/compat.c       |   4
-rw-r--r--  fs/ext3/ioctl.c   |  18
-rw-r--r--  fs/ext3/resize.c  |   2
-rw-r--r--  fs/fuse/dev.c     |  35
-rw-r--r--  fs/fuse/fuse_i.h  |  12
-rw-r--r--  fs/fuse/inode.c   |  40
-rw-r--r--  fs/splice.c       | 434
7 files changed, 405 insertions(+), 140 deletions(-)
diff --git a/fs/compat.c b/fs/compat.c
index 7f8e26ea427c..2e32bd340474 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1217,6 +1217,10 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 	if (ret < 0)
 		goto out;
 
+	ret = security_file_permission(file, type == READ ? MAY_READ:MAY_WRITE);
+	if (ret)
+		goto out;
+
 	fnv = NULL;
 	if (type == READ) {
 		fn = file->f_op->read;
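The hunk above routes compat vectored I/O through the LSM hook before the handler is resolved, closing the gap where 32-bit readv/writev on a 64-bit kernel skipped the permission check. As a standalone illustration of that check-before-dispatch ordering, here is a plain C sketch; perm_check(), file_ctx, and the MAY_* values are invented for the example and are not kernel API:

#include <stdio.h>

enum { MAY_READ = 1, MAY_WRITE = 2 };	/* stand-ins, not kernel values */

struct file_ctx {
	int allowed_mask;		/* what this "file" permits */
};

/* Stand-in for security_file_permission(): 0 if allowed, nonzero if not. */
static int perm_check(const struct file_ctx *f, int mask)
{
	return (f->allowed_mask & mask) == mask ? 0 : -1;
}

static int do_read(struct file_ctx *f)  { (void)f; puts("read ok");  return 0; }
static int do_write(struct file_ctx *f) { (void)f; puts("write ok"); return 0; }

static int dispatch(struct file_ctx *f, int is_write)
{
	/* Permission is checked once, before a handler is even chosen,
	 * mirroring where the fix places security_file_permission(). */
	int ret = perm_check(f, is_write ? MAY_WRITE : MAY_READ);
	if (ret)
		return ret;

	return is_write ? do_write(f) : do_read(f);
}

int main(void)
{
	struct file_ctx f = { .allowed_mask = MAY_READ };

	printf("read:  %d\n", dispatch(&f, 0));	/* permitted */
	printf("write: %d\n", dispatch(&f, 1));	/* denied before dispatch */
	return 0;
}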
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index aaf1da17b6d4..8c22aa9a7fbb 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -48,6 +48,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		if (!S_ISDIR(inode->i_mode))
 			flags &= ~EXT3_DIRSYNC_FL;
 
+		mutex_lock(&inode->i_mutex);
 		oldflags = ei->i_flags;
 
 		/* The JOURNAL_DATA flag is modifiable only by root */
@@ -60,8 +61,10 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * This test looks nicer. Thanks to Pauline Middelink
 		 */
 		if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE))
+			if (!capable(CAP_LINUX_IMMUTABLE)) {
+				mutex_unlock(&inode->i_mutex);
 				return -EPERM;
+			}
 		}
 
 		/*
@@ -69,14 +72,18 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE))
+			if (!capable(CAP_SYS_RESOURCE)) {
+				mutex_unlock(&inode->i_mutex);
 				return -EPERM;
+			}
 		}
 
 
 		handle = ext3_journal_start(inode, 1);
-		if (IS_ERR(handle))
+		if (IS_ERR(handle)) {
+			mutex_unlock(&inode->i_mutex);
 			return PTR_ERR(handle);
+		}
 		if (IS_SYNC(inode))
 			handle->h_sync = 1;
 		err = ext3_reserve_inode_write(handle, inode, &iloc);
@@ -93,11 +100,14 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 flags_err:
 		ext3_journal_stop(handle);
-		if (err)
+		if (err) {
+			mutex_unlock(&inode->i_mutex);
 			return err;
+		}
 
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
 			err = ext3_change_inode_journal_flag(inode, jflag);
+		mutex_unlock(&inode->i_mutex);
 		return err;
 	}
 	case EXT3_IOC_GETVERSION:
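Every early return in this EXT3_IOC_SETFLAGS path now has to drop i_mutex by hand, which gets harder to audit as the function grows. A minimal sketch of the single-exit alternative to that style, using a pthread mutex in place of i_mutex (the function and error values are hypothetical):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical setflags(): every failure funnels through one exit,
 * so the unlock appears exactly once instead of at each return. */
static int setflags(int lacks_capability, int journal_start_fails)
{
	int err = 0;

	pthread_mutex_lock(&lock);

	if (lacks_capability) {
		err = -1;		/* -EPERM analogue */
		goto out;
	}
	if (journal_start_fails) {
		err = -2;		/* PTR_ERR(handle) analogue */
		goto out;
	}
	/* ... modify the flags while the lock is held ... */
out:
	pthread_mutex_unlock(&lock);	/* covers every path */
	return err;
}

int main(void)
{
	printf("%d %d %d\n", setflags(1, 0), setflags(0, 1), setflags(0, 0));
	return 0;
}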
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index c5ffa8523968..8aac5334680d 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -213,7 +213,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 			goto exit_bh;
 		}
 		lock_buffer(bh);
-		memcpy(gdb->b_data, sbi->s_group_desc[i], bh->b_size);
+		memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, bh->b_size);
 		set_buffer_uptodate(gdb);
 		unlock_buffer(bh);
 		ext3_journal_dirty_metadata(handle, gdb);
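This one-liner fixes a classic slip: sbi->s_group_desc[i] is a buffer_head pointer, so the old code copied the struct buffer_head itself into the new group descriptor block rather than the descriptor data it points at. A tiny userspace reproduction of the bug class (struct bh here is a made-up stand-in, not the kernel type):

#include <stdio.h>
#include <string.h>

/* Minimal stand-in for struct buffer_head: metadata plus a pointer
 * to the actual block contents. */
struct bh {
	unsigned long state;
	char *b_data;
};

int main(void)
{
	char block[8] = "descrs!";
	struct bh src = { .state = 0x15, .b_data = block };
	char dst[8];

	/* Bug pattern: copies the descriptor struct itself (state word,
	 * pointer bytes) into the destination block. */
	memcpy(dst, &src, sizeof(dst));

	/* Fixed pattern, as in the resize.c change: copy what b_data
	 * points at, not the struct that points at it. */
	memcpy(dst, src.b_data, sizeof(dst));
	printf("%s\n", dst);
	return 0;
}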
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cc750c68fe70..104a62dadb94 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -128,14 +128,24 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 	}
 }
 
-void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req)
+/*
+ * Called with sbput_sem held for read (request_end) or write
+ * (fuse_put_super). By the time fuse_put_super() is finished, all
+ * inodes belonging to background requests must be released, so the
+ * iputs have to be done within the locked region.
+ */
+void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req)
 {
-	list_del_init(&req->bg_entry);
+	iput(req->inode);
+	iput(req->inode2);
+	spin_lock(&fc->lock);
+	list_del(&req->bg_entry);
 	if (fc->num_background == FUSE_MAX_BACKGROUND) {
 		fc->blocked = 0;
 		wake_up_all(&fc->blocked_waitq);
 	}
 	fc->num_background--;
+	spin_unlock(&fc->lock);
 }
 
 /*
@@ -165,27 +175,22 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 		wake_up(&req->waitq);
 		fuse_put_request(fc, req);
 	} else {
-		struct inode *inode = req->inode;
-		struct inode *inode2 = req->inode2;
-		struct file *file = req->file;
 		void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
 		req->end = NULL;
-		req->inode = NULL;
-		req->inode2 = NULL;
-		req->file = NULL;
-		if (!list_empty(&req->bg_entry))
-			fuse_remove_background(fc, req);
 		spin_unlock(&fc->lock);
+		down_read(&fc->sbput_sem);
+		if (fc->mounted)
+			fuse_release_background(fc, req);
+		up_read(&fc->sbput_sem);
+
+		/* fput must go outside sbput_sem, otherwise it can deadlock */
+		if (req->file)
+			fput(req->file);
 
 		if (end)
 			end(fc, req);
 		else
 			fuse_put_request(fc, req);
-
-		if (file)
-			fput(file);
-		iput(inode);
-		iput(inode2);
 	}
 }
 
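The new scheme serializes request completion against unmount with an rwsem: request_end() takes sbput_sem shared and checks fc->mounted, while fuse_put_super() takes it exclusive while draining the background list. A runnable sketch of that read/write split using a pthread rwlock (names mirror the patch, but the code is illustrative only):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t sbput_sem = PTHREAD_RWLOCK_INITIALIZER;
static int mounted = 1;

/* Completion path: many requests may finish concurrently, each under
 * the shared lock, and each skips release work once the mount is gone. */
static void *request_end_path(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&sbput_sem);
	if (mounted)
		puts("releasing one background request");
	pthread_rwlock_unlock(&sbput_sem);
	return NULL;
}

/* Unmount path: the exclusive lock waits out all readers, so no
 * release can still be in flight when teardown completes. */
static void put_super_path(void)
{
	pthread_rwlock_wrlock(&sbput_sem);
	puts("draining background list");
	mounted = 0;
	pthread_rwlock_unlock(&sbput_sem);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, request_end_path, NULL);
	pthread_join(t, NULL);
	put_super_path();
	return 0;
}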
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 59661c481d9d..0474202cb5dc 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -258,9 +258,15 @@ struct fuse_conn {
 	/** waitq for blocked connection */
 	wait_queue_head_t blocked_waitq;
 
+	/** RW semaphore for exclusion with fuse_put_super() */
+	struct rw_semaphore sbput_sem;
+
 	/** The next unique request id */
 	u64 reqctr;
 
+	/** Mount is active */
+	unsigned mounted;
+
 	/** Connection established, cleared on umount, connection
 	    abort and device release */
 	unsigned connected;
@@ -471,11 +477,11 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
 void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
 
 /**
- * Remove request from the the background list
+ * Release inodes and file associated with background request
  */
-void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req);
 
-/** Abort all requests */
+/* Abort all requests */
 void fuse_abort_conn(struct fuse_conn *fc);
 
 /**
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 43a6fc0db8a7..7627022446b2 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -204,26 +204,17 @@ static void fuse_put_super(struct super_block *sb)
 {
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
+	down_write(&fc->sbput_sem);
+	while (!list_empty(&fc->background))
+		fuse_release_background(fc,
+					list_entry(fc->background.next,
+						   struct fuse_req, bg_entry));
+
 	spin_lock(&fc->lock);
+	fc->mounted = 0;
 	fc->connected = 0;
-	while (!list_empty(&fc->background)) {
-		struct fuse_req *req = list_entry(fc->background.next,
-						  struct fuse_req, bg_entry);
-		struct inode *inode = req->inode;
-		struct inode *inode2 = req->inode2;
-
-		/* File would hold a reference to vfsmount */
-		BUG_ON(req->file);
-		req->inode = NULL;
-		req->inode2 = NULL;
-		fuse_remove_background(fc, req);
-
-		spin_unlock(&fc->lock);
-		iput(inode);
-		iput(inode2);
-		spin_lock(&fc->lock);
-	}
 	spin_unlock(&fc->lock);
+	up_write(&fc->sbput_sem);
 	/* Flush all readers on this fs */
 	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 	wake_up_all(&fc->waitq);
@@ -395,6 +386,7 @@ static struct fuse_conn *new_conn(void)
 	INIT_LIST_HEAD(&fc->processing);
 	INIT_LIST_HEAD(&fc->io);
 	INIT_LIST_HEAD(&fc->background);
+	init_rwsem(&fc->sbput_sem);
 	kobj_set_kset_s(fc, connections_subsys);
 	kobject_init(&fc->kobj);
 	atomic_set(&fc->num_waiting, 0);
@@ -508,11 +500,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (file->f_op != &fuse_dev_operations)
 		return -EINVAL;
 
-	/* Setting file->private_data can't race with other mount()
-	   instances, since BKL is held for ->get_sb() */
-	if (file->private_data)
-		return -EINVAL;
-
 	fc = new_conn();
 	if (!fc)
 		return -ENOMEM;
@@ -548,7 +535,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto err_free_req;
 
+	/* Setting file->private_data can't race with other mount()
+	   instances, since BKL is held for ->get_sb() */
+	err = -EINVAL;
+	if (file->private_data)
+		goto err_kobject_del;
+
 	sb->s_root = root_dentry;
+	fc->mounted = 1;
 	fc->connected = 1;
 	kobject_get(&fc->kobj);
 	file->private_data = fc;
@@ -563,6 +557,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
 	return 0;
 
+ err_kobject_del:
+	kobject_del(&fc->kobj);
  err_free_req:
 	fuse_request_free(init_req);
  err_put_root:
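fuse_fill_super() gains a new failure point after kobject registration, so the error path gets a matching err_kobject_del label slotted above err_free_req, keeping teardown in reverse order of setup. A compact sketch of that unwind-ladder idiom with generic resources (not the fuse code itself):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical three-stage setup: a failure point added between two
 * existing ones gets its own label, inserted in stack order, so the
 * unwind always mirrors the setup in reverse. */
static int setup(int fail_at)
{
	void *a = NULL, *b = NULL, *c = NULL;

	a = malloc(16);
	if (!a)
		goto err;
	if (fail_at == 2 || !(b = malloc(16)))
		goto err_free_a;
	if (fail_at == 3 || !(c = malloc(16)))
		goto err_free_b;	/* the newly inserted rung */

	free(c);
	free(b);
	free(a);
	return 0;

 err_free_b:
	free(b);
 err_free_a:
	free(a);
 err:
	return -1;
}

int main(void)
{
	printf("%d %d %d\n", setup(2), setup(3), setup(0));
	return 0;
}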
diff --git a/fs/splice.c b/fs/splice.c
index 0559e7577a04..a46ddd28561e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -27,15 +27,22 @@
 #include <linux/buffer_head.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/uio.h>
+
+struct partial_page {
+	unsigned int offset;
+	unsigned int len;
+};
 
 /*
- * Passed to the actors
+ * Passed to splice_to_pipe
  */
-struct splice_desc {
-	unsigned int len, total_len;	/* current and remaining length */
+struct splice_pipe_desc {
+	struct page **pages;		/* page map */
+	struct partial_page *partial;	/* pages[] may not be contig */
+	int nr_pages;			/* number of pages in map */
 	unsigned int flags;		/* splice flags */
-	struct file *file;		/* file to read/write */
-	loff_t pos;			/* file position */
+	struct pipe_buf_operations *ops;/* ops associated with output pipe */
 };
 
 /*
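The new splice_pipe_desc decouples the page map from the byte ranges: pages[] carries the pages and partial[] a per-page (offset, len), so the described ranges no longer have to be page-aligned or contiguous. A userspace model of that descriptor pair (illustrative only, not kernel code):

#include <stdio.h>

/* Per-page byte range, mirroring the shape of struct partial_page. */
struct partial {
	unsigned int offset;
	unsigned int len;
};

int main(void)
{
	static char page0[4096], page1[4096];
	char *pages[] = { page0, page1 };
	struct partial partial[] = {
		{ .offset = 100, .len = 3996 },	/* tail of page 0 */
		{ .offset = 0,   .len = 512  },	/* head of page 1 */
	};
	unsigned int i, total = 0;

	for (i = 0; i < 2; i++) {
		total += partial[i].len;
		printf("page %u at %p: offset %u, len %u\n",
		       i, (void *)pages[i], partial[i].offset, partial[i].len);
	}
	printf("total bytes described: %u\n", total);
	return 0;
}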
@@ -128,6 +135,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
 	kunmap(buf->page);
 }
 
+static void *user_page_pipe_buf_map(struct file *file,
+				    struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return kmap(buf->page);
+}
+
+static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
+				     struct pipe_buffer *buf)
+{
+	kunmap(buf->page);
+}
+
 static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
 				    struct pipe_buffer *buf)
 {
@@ -143,19 +163,33 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = {
 	.get = page_cache_pipe_buf_get,
 };
 
+static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return 1;
+}
+
+static struct pipe_buf_operations user_page_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = user_page_pipe_buf_map,
+	.unmap = user_page_pipe_buf_unmap,
+	.release = page_cache_pipe_buf_release,
+	.steal = user_page_pipe_buf_steal,
+	.get = page_cache_pipe_buf_get,
+};
+
 /*
  * Pipe output worker. This sets up our pipe format with the page cache
  * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
  */
-static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
-			    int nr_pages, unsigned long len,
-			    unsigned int offset, unsigned int flags)
+static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
+			      struct splice_pipe_desc *spd)
 {
-	int ret, do_wakeup, i;
+	int ret, do_wakeup, page_nr;
 
 	ret = 0;
 	do_wakeup = 0;
-	i = 0;
+	page_nr = 0;
 
 	if (pipe->inode)
 		mutex_lock(&pipe->inode->i_mutex);
@@ -171,27 +205,19 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 		if (pipe->nrbufs < PIPE_BUFFERS) {
 			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
-			struct page *page = pages[i++];
-			unsigned long this_len;
-
-			this_len = PAGE_CACHE_SIZE - offset;
-			if (this_len > len)
-				this_len = len;
 
-			buf->page = page;
-			buf->offset = offset;
-			buf->len = this_len;
-			buf->ops = &page_cache_pipe_buf_ops;
+			buf->page = spd->pages[page_nr];
+			buf->offset = spd->partial[page_nr].offset;
+			buf->len = spd->partial[page_nr].len;
+			buf->ops = spd->ops;
 			pipe->nrbufs++;
+			page_nr++;
+			ret += buf->len;
+
 			if (pipe->inode)
 				do_wakeup = 1;
 
-			ret += this_len;
-			len -= this_len;
-			offset = 0;
-			if (!--nr_pages)
-				break;
-			if (!len)
+			if (!--spd->nr_pages)
 				break;
 			if (pipe->nrbufs < PIPE_BUFFERS)
 				continue;
@@ -199,7 +225,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 			break;
 		}
 
-		if (flags & SPLICE_F_NONBLOCK) {
+		if (spd->flags & SPLICE_F_NONBLOCK) {
 			if (!ret)
 				ret = -EAGAIN;
 			break;
@@ -234,8 +260,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
 
-	while (i < nr_pages)
-		page_cache_release(pages[i++]);
+	while (page_nr < spd->nr_pages)
+		page_cache_release(spd->pages[page_nr++]);
 
 	return ret;
 }
@@ -246,17 +272,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			   unsigned int flags)
 {
 	struct address_space *mapping = in->f_mapping;
-	unsigned int loff, offset, nr_pages;
+	unsigned int loff, nr_pages;
 	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
 	struct page *page;
 	pgoff_t index, end_index;
 	loff_t isize;
-	size_t bytes;
-	int i, error;
+	size_t total_len;
+	int error, page_nr;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &page_cache_pipe_buf_ops,
+	};
 
 	index = *ppos >> PAGE_CACHE_SHIFT;
-	loff = offset = *ppos & ~PAGE_CACHE_MASK;
-	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	loff = *ppos & ~PAGE_CACHE_MASK;
+	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
 	if (nr_pages > PIPE_BUFFERS)
 		nr_pages = PIPE_BUFFERS;
@@ -266,47 +299,75 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 	 * read-ahead if this is a non-zero offset (we are likely doing small
 	 * chunk splice and the page is already there) for a single page.
 	 */
-	if (!offset || nr_pages > 1)
-		do_page_cache_readahead(mapping, in, index, nr_pages);
+	if (!loff || nr_pages > 1)
+		page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);
 
 	/*
 	 * Now fill in the holes:
 	 */
 	error = 0;
-	bytes = 0;
-	for (i = 0; i < nr_pages; i++, index++) {
-		unsigned int this_len;
+	total_len = 0;
 
-		if (!len)
-			break;
+	/*
+	 * Lookup the (hopefully) full range of pages we need.
+	 */
+	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
 
+	/*
+	 * If find_get_pages_contig() returned fewer pages than we needed,
+	 * allocate the rest.
+	 */
+	index += spd.nr_pages;
+	while (spd.nr_pages < nr_pages) {
 		/*
-		 * this_len is the max we'll use from this page
-		 */
-		this_len = min(len, PAGE_CACHE_SIZE - loff);
-find_page:
-		/*
-		 * lookup the page for this index
+		 * Page could be there, find_get_pages_contig() breaks on
+		 * the first hole.
 		 */
 		page = find_get_page(mapping, index);
 		if (!page) {
 			/*
-			 * page didn't exist, allocate one
+			 * page didn't exist, allocate one.
 			 */
 			page = page_cache_alloc_cold(mapping);
 			if (!page)
 				break;
 
 			error = add_to_page_cache_lru(page, mapping, index,
 					      mapping_gfp_mask(mapping));
 			if (unlikely(error)) {
 				page_cache_release(page);
 				break;
 			}
-
-			goto readpage;
+			/*
+			 * add_to_page_cache() locks the page, unlock it
+			 * to avoid convoluting the logic below even more.
+			 */
+			unlock_page(page);
 		}
 
+		pages[spd.nr_pages++] = page;
+		index++;
+	}
+
+	/*
+	 * Now loop over the map and see if we need to start IO on any
+	 * pages, fill in the partial map, etc.
+	 */
+	index = *ppos >> PAGE_CACHE_SHIFT;
+	nr_pages = spd.nr_pages;
+	spd.nr_pages = 0;
+	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
+		unsigned int this_len;
+
+		if (!len)
+			break;
+
+		/*
+		 * this_len is the max we'll use from this page
+		 */
+		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
+		page = pages[page_nr];
+
 		/*
 		 * If the page isn't uptodate, we may need to start io on it
 		 */
@@ -327,7 +388,6 @@ find_page:
 			 */
 			if (!page->mapping) {
 				unlock_page(page);
-				page_cache_release(page);
 				break;
 			}
 			/*
@@ -338,16 +398,20 @@ find_page:
 				goto fill_it;
 			}
 
-readpage:
 			/*
 			 * need to read in the page
 			 */
 			error = mapping->a_ops->readpage(in, page);
-
 			if (unlikely(error)) {
-				page_cache_release(page);
+				/*
+				 * We really should re-lookup the page here,
+				 * but it complicates things a lot. Instead
+				 * lets just do what we already stored, and
+				 * we'll get it the next time we are called.
+				 */
 				if (error == AOP_TRUNCATED_PAGE)
-					goto find_page;
+					error = 0;
+
 				break;
 			}
 
@@ -356,10 +420,8 @@ readpage:
 			 */
 			isize = i_size_read(mapping->host);
 			end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
-			if (unlikely(!isize || index > end_index)) {
-				page_cache_release(page);
+			if (unlikely(!isize || index > end_index))
 				break;
-			}
 
 			/*
 			 * if this is the last page, see if we need to shrink
@@ -367,26 +429,35 @@ readpage:
 			 */
 			if (end_index == index) {
 				loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
-				if (bytes + loff > isize) {
-					page_cache_release(page);
+				if (total_len + loff > isize)
 					break;
-				}
 				/*
 				 * force quit after adding this page
 				 */
-				nr_pages = i;
+				len = this_len;
 				this_len = min(this_len, loff);
+				loff = 0;
 			}
 		}
 fill_it:
-		pages[i] = page;
-		bytes += this_len;
+		partial[page_nr].offset = loff;
+		partial[page_nr].len = this_len;
 		len -= this_len;
+		total_len += this_len;
 		loff = 0;
+		spd.nr_pages++;
+		index++;
 	}
 
-	if (i)
-		return move_to_pipe(pipe, pages, i, bytes, offset, flags);
+	/*
+	 * Release any pages at the end, if we quit early. 'i' is how far
+	 * we got, 'nr_pages' is how many pages are in the map.
+	 */
+	while (page_nr < nr_pages)
+		page_cache_release(pages[page_nr++]);
+
+	if (spd.nr_pages)
+		return splice_to_pipe(pipe, &spd);
 
 	return error;
 }
@@ -439,14 +510,13 @@ EXPORT_SYMBOL(generic_file_splice_read);
 
 /*
  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
- * using sendpage().
+ * using sendpage(). Return the number of bytes sent.
  */
 static int pipe_to_sendpage(struct pipe_inode_info *info,
 			    struct pipe_buffer *buf, struct splice_desc *sd)
 {
 	struct file *file = sd->file;
 	loff_t pos = sd->pos;
-	unsigned int offset;
 	ssize_t ret;
 	void *ptr;
 	int more;
@@ -461,16 +531,13 @@ static int pipe_to_sendpage(struct pipe_inode_info *info,
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
 
-	offset = pos & ~PAGE_CACHE_MASK;
 	more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
 
-	ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more);
+	ret = file->f_op->sendpage(file, buf->page, buf->offset, sd->len,
+				   &pos, more);
 
 	buf->ops->unmap(info, buf);
-	if (ret == sd->len)
-		return 0;
-
-	return -EIO;
+	return ret;
 }
 
 /*
@@ -499,7 +566,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 	struct file *file = sd->file;
 	struct address_space *mapping = file->f_mapping;
 	gfp_t gfp_mask = mapping_gfp_mask(mapping);
-	unsigned int offset;
+	unsigned int offset, this_len;
 	struct page *page;
 	pgoff_t index;
 	char *src;
@@ -515,6 +582,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 	index = sd->pos >> PAGE_CACHE_SHIFT;
 	offset = sd->pos & ~PAGE_CACHE_MASK;
 
+	this_len = sd->len;
+	if (this_len + offset > PAGE_CACHE_SIZE)
+		this_len = PAGE_CACHE_SIZE - offset;
+
 	/*
 	 * Reuse buf page, if SPLICE_F_MOVE is set.
 	 */
@@ -558,7 +629,7 @@ find_page:
 	 * the full page.
 	 */
 	if (!PageUptodate(page)) {
-		if (sd->len < PAGE_CACHE_SIZE) {
+		if (this_len < PAGE_CACHE_SIZE) {
 			ret = mapping->a_ops->readpage(file, page);
 			if (unlikely(ret))
 				goto out;
@@ -582,7 +653,7 @@ find_page:
 		}
 	}
 
-	ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
+	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
 	if (ret == AOP_TRUNCATED_PAGE) {
 		page_cache_release(page);
 		goto find_page;
@@ -592,18 +663,22 @@ find_page:
 	if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) {
 		char *dst = kmap_atomic(page, KM_USER0);
 
-		memcpy(dst + offset, src + buf->offset, sd->len);
+		memcpy(dst + offset, src + buf->offset, this_len);
 		flush_dcache_page(page);
 		kunmap_atomic(dst, KM_USER0);
 	}
 
-	ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
+	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
 	if (ret == AOP_TRUNCATED_PAGE) {
 		page_cache_release(page);
 		goto find_page;
 	} else if (ret)
 		goto out;
 
+	/*
+	 * Return the number of bytes written.
+	 */
+	ret = this_len;
 	mark_page_accessed(page);
 	balance_dirty_pages_ratelimited(mapping);
 out:
@@ -616,17 +691,14 @@ out_nomem:
 	return ret;
 }
 
-typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
-			   struct splice_desc *);
-
 /*
  * Pipe input worker. Most of this logic works like a regular pipe, the
  * key here is the 'actor' worker passed in that actually moves the data
  * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
  */
-static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
-			      loff_t *ppos, size_t len, unsigned int flags,
-			      splice_actor *actor)
+ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
+			 loff_t *ppos, size_t len, unsigned int flags,
+			 splice_actor *actor)
 {
 	int ret, do_wakeup, err;
 	struct splice_desc sd;
@@ -652,16 +724,22 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 			sd.len = sd.total_len;
 
 		err = actor(pipe, buf, &sd);
-		if (err) {
+		if (err <= 0) {
 			if (!ret && err != -ENODATA)
 				ret = err;
 
 			break;
 		}
 
-		ret += sd.len;
-		buf->offset += sd.len;
-		buf->len -= sd.len;
+		ret += err;
+		buf->offset += err;
+		buf->len -= err;
+
+		sd.len -= err;
+		sd.pos += err;
+		sd.total_len -= err;
+		if (sd.len)
+			continue;
 
 		if (!buf->len) {
 			buf->ops = NULL;
@@ -672,8 +750,6 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 			do_wakeup = 1;
 		}
 
-		sd.pos += sd.len;
-		sd.total_len -= sd.len;
 		if (!sd.total_len)
 			break;
 	}
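The actor contract changes here: instead of all-or-nothing (0 on full success, -EIO otherwise), an actor now returns the byte count it consumed, and splice_from_pipe() advances sd.pos/sd.len by that amount, retrying the same buffer while sd.len remains. This is the same convention as the classic short-write loop; a runnable userspace analog around write(2):

#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* The "actor" (here plain write(2)) reports how many bytes it consumed,
 * and the caller advances by that amount instead of treating anything
 * short as a hard error. */
static ssize_t write_all(int fd, const char *buf, size_t len)
{
	size_t total = 0;

	while (total < len) {
		ssize_t n = write(fd, buf + total, len - total);

		if (n <= 0)
			return total ? (ssize_t)total : n;
		total += n;	/* advance by what the actor reported */
	}
	return total;
}

int main(void)
{
	const char msg[] = "partial progress is still progress\n";

	return write_all(STDOUT_FILENO, msg, strlen(msg)) < 0;
}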
@@ -741,7 +817,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	struct address_space *mapping = out->f_mapping;
 	ssize_t ret;
 
-	ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
+	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
 	if (ret > 0) {
 		struct inode *inode = mapping->host;
 
@@ -783,7 +859,7 @@ EXPORT_SYMBOL(generic_file_splice_write);
 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
 				loff_t *ppos, size_t len, unsigned int flags)
 {
-	return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
+	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
 }
 
 EXPORT_SYMBOL(generic_splice_sendpage);
@@ -870,7 +946,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 
 	/*
 	 * We don't have an immediate reader, but we'll read the stuff
-	 * out of the pipe right after the move_to_pipe(). So set
+	 * out of the pipe right after the splice_to_pipe(). So set
 	 * PIPE_READERS appropriately.
 	 */
 	pipe->readers = 1;
@@ -1010,6 +1086,174 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 	return -EINVAL;
 }
 
+/*
+ * Map an iov into an array of pages and offset/length tupples. With the
+ * partial_page structure, we can map several non-contiguous ranges into
+ * our ones pages[] map instead of splitting that operation into pieces.
+ * Could easily be exported as a generic helper for other users, in which
+ * case one would probably want to add a 'max_nr_pages' parameter as well.
+ */
+static int get_iovec_page_array(const struct iovec __user *iov,
+				unsigned int nr_vecs, struct page **pages,
+				struct partial_page *partial)
+{
+	int buffers = 0, error = 0;
+
+	/*
+	 * It's ok to take the mmap_sem for reading, even
+	 * across a "get_user()".
+	 */
+	down_read(&current->mm->mmap_sem);
+
+	while (nr_vecs) {
+		unsigned long off, npages;
+		void __user *base;
+		size_t len;
+		int i;
+
+		/*
+		 * Get user address base and length for this iovec.
+		 */
+		error = get_user(base, &iov->iov_base);
+		if (unlikely(error))
+			break;
+		error = get_user(len, &iov->iov_len);
+		if (unlikely(error))
+			break;
+
+		/*
+		 * Sanity check this iovec. 0 read succeeds.
+		 */
+		if (unlikely(!len))
+			break;
+		error = -EFAULT;
+		if (unlikely(!base))
+			break;
+
+		/*
+		 * Get this base offset and number of pages, then map
+		 * in the user pages.
+		 */
+		off = (unsigned long) base & ~PAGE_MASK;
+		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		if (npages > PIPE_BUFFERS - buffers)
+			npages = PIPE_BUFFERS - buffers;
+
+		error = get_user_pages(current, current->mm,
+				       (unsigned long) base, npages, 0, 0,
+				       &pages[buffers], NULL);
+
+		if (unlikely(error <= 0))
+			break;
+
+		/*
+		 * Fill this contiguous range into the partial page map.
+		 */
+		for (i = 0; i < error; i++) {
+			const int plen = min_t(size_t, len, PAGE_SIZE) - off;
+
+			partial[buffers].offset = off;
+			partial[buffers].len = plen;
+
+			off = 0;
+			len -= plen;
+			buffers++;
+		}
+
+		/*
+		 * We didn't complete this iov, stop here since it probably
+		 * means we have to move some of this into a pipe to
+		 * be able to continue.
+		 */
+		if (len)
+			break;
+
+		/*
+		 * Don't continue if we mapped fewer pages than we asked for,
+		 * or if we mapped the max number of pages that we have
+		 * room for.
+		 */
+		if (error < npages || buffers == PIPE_BUFFERS)
+			break;
+
+		nr_vecs--;
+		iov++;
+	}
+
+	up_read(&current->mm->mmap_sem);
+
+	if (buffers)
+		return buffers;
+
+	return error;
+}
+
+/*
+ * vmsplice splices a user address range into a pipe. It can be thought of
+ * as splice-from-memory, where the regular splice is splice-from-file (or
+ * to file). In both cases the output is a pipe, naturally.
+ *
+ * Note that vmsplice only supports splicing _from_ user memory to a pipe,
+ * not the other way around. Splicing from user memory is a simple operation
+ * that can be supported without any funky alignment restrictions or nasty
+ * vm tricks. We simply map in the user memory and fill them into a pipe.
+ * The reverse isn't quite as easy, though. There are two possible solutions
+ * for that:
+ *
+ *	- memcpy() the data internally, at which point we might as well just
+ *	  do a regular read() on the buffer anyway.
+ *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
+ *	  has restriction limitations on both ends of the pipe).
+ *
+ * Alas, it isn't here.
+ *
+ */
+static long do_vmsplice(struct file *file, const struct iovec __user *iov,
+			unsigned long nr_segs, unsigned int flags)
+{
+	struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
+	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &user_page_pipe_buf_ops,
+	};
+
+	if (unlikely(!pipe))
+		return -EBADF;
+	if (unlikely(nr_segs > UIO_MAXIOV))
+		return -EINVAL;
+	else if (unlikely(!nr_segs))
+		return 0;
+
+	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial);
+	if (spd.nr_pages <= 0)
+		return spd.nr_pages;
+
+	return splice_to_pipe(pipe, &spd);
+}
+
+asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
+			     unsigned long nr_segs, unsigned int flags)
+{
+	struct file *file;
+	long error;
+	int fput;
+
+	error = -EBADF;
+	file = fget_light(fd, &fput);
+	if (file) {
+		if (file->f_mode & FMODE_WRITE)
+			error = do_vmsplice(file, iov, nr_segs, flags);
+
+		fput_light(file, fput);
+	}
+
+	return error;
+}
+
 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
 			   int fd_out, loff_t __user *off_out,
 			   size_t len, unsigned int flags)
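For completeness, here is how the new syscall is exercised from userspace once it is wired up. This sketch uses the glibc vmsplice() wrapper, which postdates this commit; a contemporary userland would go through syscall(__NR_vmsplice, ...) instead. Note the fd must be the write end of a pipe, matching the FMODE_WRITE check in sys_vmsplice():

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char hello[] = "hello ", world[] = "world\n";
	struct iovec iov[] = {
		{ .iov_base = hello, .iov_len = strlen(hello) },
		{ .iov_base = world, .iov_len = strlen(world) },
	};
	char buf[64];
	int pfd[2];
	ssize_t n;

	if (pipe(pfd) < 0)
		return 1;

	/* Two discontiguous user ranges go into the pipe in one call;
	 * the fd must be the pipe's write end. */
	n = vmsplice(pfd[1], iov, 2, 0);
	if (n < 0) {
		perror("vmsplice");
		return 1;
	}

	n = read(pfd[0], buf, sizeof(buf));
	if (n > 0)
		fwrite(buf, 1, (size_t)n, stdout);
	return 0;
}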