aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Miklos Szeredi <mszeredi@suse.cz> 2010-05-25 09:06:06 -0400
committer Miklos Szeredi <mszeredi@suse.cz> 2010-05-25 09:06:06 -0400
commit dd3bb14f44a6382de2508ec387c7e5569ad2d4f1 (patch)
tree 4c84a75f9f40a55f7f7d6da0e3c83b2f073bf4e8
parent b5dd328537edeb4c1d2e71e344b6c443e0874d90 (diff)
fuse: support splice() writing to fuse device
Allow userspace filesystem implementation to use splice() to write to the fuse device. The semantics of using splice() are: 1) buffer the message header and data in a temporary pipe 2) with a *single* splice() call move the message from the temporary pipe to the fuse device The READ reply message has the most interesting use for this, since now the data from an arbitrary file descriptor (which could be a regular file, a block device or a socket) can be transferred into the fuse device without having to go through a userspace buffer. It will also allow zero copy moving of pages. One caveat is that the protocol on the fuse device requires the length of the whole message to be written into the header. But the length of the data transferred into the temporary pipe may not be known in advance. The current library implementation works around this by using vmsplice to write the header and modifying the header after splicing the data into the pipe (error handling omitted): struct fuse_out_header out; iov.iov_base = &out; iov.iov_len = sizeof(struct fuse_out_header); vmsplice(pip[1], &iov, 1, 0); len = splice(input_fd, input_offset, pip[1], NULL, len, 0); /* retrospectively modify the header: */ out.len = len + sizeof(struct fuse_out_header); splice(pip[0], NULL, fuse_chan_fd(req->ch), NULL, out.len, flags); This works since vmsplice only saves a pointer to the data, it does not copy the data itself. Since pipes are currently limited to 16 pages and messages need to be spliced atomically, the length of the data is limited to 15 pages (or 60kB for 4k pages). Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
-rw-r--r-- fs/fuse/dev.c 175
-rw-r--r-- include/linux/fuse.h 5
2 files changed, 148 insertions, 32 deletions
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 4623018e104a..2795045484ee 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -16,6 +16,7 @@
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/file.h> 17#include <linux/file.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/pipe_fs_i.h>
19 20
20MODULE_ALIAS_MISCDEV(FUSE_MINOR); 21MODULE_ALIAS_MISCDEV(FUSE_MINOR);
21 22
@@ -498,6 +499,9 @@ struct fuse_copy_state {
498 int write; 499 int write;
499 struct fuse_req *req; 500 struct fuse_req *req;
500 const struct iovec *iov; 501 const struct iovec *iov;
502 struct pipe_buffer *pipebufs;
503 struct pipe_buffer *currbuf;
504 struct pipe_inode_info *pipe;
501 unsigned long nr_segs; 505 unsigned long nr_segs;
502 unsigned long seglen; 506 unsigned long seglen;
503 unsigned long addr; 507 unsigned long addr;
@@ -522,7 +526,14 @@ static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
522/* Unmap and put previous page of userspace buffer */ 526/* Unmap and put previous page of userspace buffer */
523static void fuse_copy_finish(struct fuse_copy_state *cs) 527static void fuse_copy_finish(struct fuse_copy_state *cs)
524{ 528{
525 if (cs->mapaddr) { 529 if (cs->currbuf) {
530 struct pipe_buffer *buf = cs->currbuf;
531
532 buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
533
534 cs->currbuf = NULL;
535 cs->mapaddr = NULL;
536 } else if (cs->mapaddr) {
526 kunmap_atomic(cs->mapaddr, KM_USER0); 537 kunmap_atomic(cs->mapaddr, KM_USER0);
527 if (cs->write) { 538 if (cs->write) {
528 flush_dcache_page(cs->pg); 539 flush_dcache_page(cs->pg);
@@ -544,23 +555,39 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
544 555
545 unlock_request(cs->fc, cs->req); 556 unlock_request(cs->fc, cs->req);
546 fuse_copy_finish(cs); 557 fuse_copy_finish(cs);
547 if (!cs->seglen) { 558 if (cs->pipebufs) {
559 struct pipe_buffer *buf = cs->pipebufs;
560
561 err = buf->ops->confirm(cs->pipe, buf);
562 if (err)
563 return err;
564
548 BUG_ON(!cs->nr_segs); 565 BUG_ON(!cs->nr_segs);
549 cs->seglen = cs->iov[0].iov_len; 566 cs->currbuf = buf;
550 cs->addr = (unsigned long) cs->iov[0].iov_base; 567 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
551 cs->iov++; 568 cs->len = buf->len;
569 cs->buf = cs->mapaddr + buf->offset;
570 cs->pipebufs++;
552 cs->nr_segs--; 571 cs->nr_segs--;
572 } else {
573 if (!cs->seglen) {
574 BUG_ON(!cs->nr_segs);
575 cs->seglen = cs->iov[0].iov_len;
576 cs->addr = (unsigned long) cs->iov[0].iov_base;
577 cs->iov++;
578 cs->nr_segs--;
579 }
580 err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg);
581 if (err < 0)
582 return err;
583 BUG_ON(err != 1);
584 offset = cs->addr % PAGE_SIZE;
585 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
586 cs->buf = cs->mapaddr + offset;
587 cs->len = min(PAGE_SIZE - offset, cs->seglen);
588 cs->seglen -= cs->len;
589 cs->addr += cs->len;
553 } 590 }
554 err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg);
555 if (err < 0)
556 return err;
557 BUG_ON(err != 1);
558 offset = cs->addr % PAGE_SIZE;
559 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
560 cs->buf = cs->mapaddr + offset;
561 cs->len = min(PAGE_SIZE - offset, cs->seglen);
562 cs->seglen -= cs->len;
563 cs->addr += cs->len;
564 591
565 return lock_request(cs->fc, cs->req); 592 return lock_request(cs->fc, cs->req);
566} 593}
@@ -984,23 +1011,17 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
984 * it from the list and copy the rest of the buffer to the request. 1011 * it from the list and copy the rest of the buffer to the request.
985 * The request is finished by calling request_end() 1012 * The request is finished by calling request_end()
986 */ 1013 */
987static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, 1014static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
988 unsigned long nr_segs, loff_t pos) 1015 struct fuse_copy_state *cs, size_t nbytes)
989{ 1016{
990 int err; 1017 int err;
991 size_t nbytes = iov_length(iov, nr_segs);
992 struct fuse_req *req; 1018 struct fuse_req *req;
993 struct fuse_out_header oh; 1019 struct fuse_out_header oh;
994 struct fuse_copy_state cs;
995 struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
996 if (!fc)
997 return -EPERM;
998 1020
999 fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
1000 if (nbytes < sizeof(struct fuse_out_header)) 1021 if (nbytes < sizeof(struct fuse_out_header))
1001 return -EINVAL; 1022 return -EINVAL;
1002 1023
1003 err = fuse_copy_one(&cs, &oh, sizeof(oh)); 1024 err = fuse_copy_one(cs, &oh, sizeof(oh));
1004 if (err) 1025 if (err)
1005 goto err_finish; 1026 goto err_finish;
1006 1027
@@ -1013,7 +1034,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1013 * and error contains notification code. 1034 * and error contains notification code.
1014 */ 1035 */
1015 if (!oh.unique) { 1036 if (!oh.unique) {
1016 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); 1037 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
1017 return err ? err : nbytes; 1038 return err ? err : nbytes;
1018 } 1039 }
1019 1040
@@ -1032,7 +1053,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1032 1053
1033 if (req->aborted) { 1054 if (req->aborted) {
1034 spin_unlock(&fc->lock); 1055 spin_unlock(&fc->lock);
1035 fuse_copy_finish(&cs); 1056 fuse_copy_finish(cs);
1036 spin_lock(&fc->lock); 1057 spin_lock(&fc->lock);
1037 request_end(fc, req); 1058 request_end(fc, req);
1038 return -ENOENT; 1059 return -ENOENT;
@@ -1049,7 +1070,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1049 queue_interrupt(fc, req); 1070 queue_interrupt(fc, req);
1050 1071
1051 spin_unlock(&fc->lock); 1072 spin_unlock(&fc->lock);
1052 fuse_copy_finish(&cs); 1073 fuse_copy_finish(cs);
1053 return nbytes; 1074 return nbytes;
1054 } 1075 }
1055 1076
@@ -1057,11 +1078,11 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1057 list_move(&req->list, &fc->io); 1078 list_move(&req->list, &fc->io);
1058 req->out.h = oh; 1079 req->out.h = oh;
1059 req->locked = 1; 1080 req->locked = 1;
1060 cs.req = req; 1081 cs->req = req;
1061 spin_unlock(&fc->lock); 1082 spin_unlock(&fc->lock);
1062 1083
1063 err = copy_out_args(&cs, &req->out, nbytes); 1084 err = copy_out_args(cs, &req->out, nbytes);
1064 fuse_copy_finish(&cs); 1085 fuse_copy_finish(cs);
1065 1086
1066 spin_lock(&fc->lock); 1087 spin_lock(&fc->lock);
1067 req->locked = 0; 1088 req->locked = 0;
@@ -1077,10 +1098,101 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1077 err_unlock: 1098 err_unlock:
1078 spin_unlock(&fc->lock); 1099 spin_unlock(&fc->lock);
1079 err_finish: 1100 err_finish:
1080 fuse_copy_finish(&cs); 1101 fuse_copy_finish(cs);
1081 return err; 1102 return err;
1082} 1103}
1083 1104
1105static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1106 unsigned long nr_segs, loff_t pos)
1107{
1108 struct fuse_copy_state cs;
1109 struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
1110 if (!fc)
1111 return -EPERM;
1112
1113 fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
1114
1115 return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs));
1116}
1117
1118static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
1119 struct file *out, loff_t *ppos,
1120 size_t len, unsigned int flags)
1121{
1122 unsigned nbuf;
1123 unsigned idx;
1124 struct pipe_buffer *bufs;
1125 struct fuse_copy_state cs;
1126 struct fuse_conn *fc;
1127 size_t rem;
1128 ssize_t ret;
1129
1130 fc = fuse_get_conn(out);
1131 if (!fc)
1132 return -EPERM;
1133
1134 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
1135 if (!bufs)
1136 return -ENOMEM;
1137
1138 pipe_lock(pipe);
1139 nbuf = 0;
1140 rem = 0;
1141 for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
1142 rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
1143
1144 ret = -EINVAL;
1145 if (rem < len) {
1146 pipe_unlock(pipe);
1147 goto out;
1148 }
1149
1150 rem = len;
1151 while (rem) {
1152 struct pipe_buffer *ibuf;
1153 struct pipe_buffer *obuf;
1154
1155 BUG_ON(nbuf >= pipe->buffers);
1156 BUG_ON(!pipe->nrbufs);
1157 ibuf = &pipe->bufs[pipe->curbuf];
1158 obuf = &bufs[nbuf];
1159
1160 if (rem >= ibuf->len) {
1161 *obuf = *ibuf;
1162 ibuf->ops = NULL;
1163 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
1164 pipe->nrbufs--;
1165 } else {
1166 ibuf->ops->get(pipe, ibuf);
1167 *obuf = *ibuf;
1168 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1169 obuf->len = rem;
1170 ibuf->offset += obuf->len;
1171 ibuf->len -= obuf->len;
1172 }
1173 nbuf++;
1174 rem -= obuf->len;
1175 }
1176 pipe_unlock(pipe);
1177
1178 memset(&cs, 0, sizeof(struct fuse_copy_state));
1179 cs.fc = fc;
1180 cs.write = 0;
1181 cs.pipebufs = bufs;
1182 cs.nr_segs = nbuf;
1183 cs.pipe = pipe;
1184
1185 ret = fuse_dev_do_write(fc, &cs, len);
1186
1187 for (idx = 0; idx < nbuf; idx++) {
1188 struct pipe_buffer *buf = &bufs[idx];
1189 buf->ops->release(pipe, buf);
1190 }
1191out:
1192 kfree(bufs);
1193 return ret;
1194}
1195
1084static unsigned fuse_dev_poll(struct file *file, poll_table *wait) 1196static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
1085{ 1197{
1086 unsigned mask = POLLOUT | POLLWRNORM; 1198 unsigned mask = POLLOUT | POLLWRNORM;
@@ -1224,6 +1336,7 @@ const struct file_operations fuse_dev_operations = {
1224 .aio_read = fuse_dev_read, 1336 .aio_read = fuse_dev_read,
1225 .write = do_sync_write, 1337 .write = do_sync_write,
1226 .aio_write = fuse_dev_write, 1338 .aio_write = fuse_dev_write,
1339 .splice_write = fuse_dev_splice_write,
1227 .poll = fuse_dev_poll, 1340 .poll = fuse_dev_poll,
1228 .release = fuse_dev_release, 1341 .release = fuse_dev_release,
1229 .fasync = fuse_dev_fasync, 1342 .fasync = fuse_dev_fasync,
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 3e2925a34bf0..88e0eb596919 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -34,6 +34,9 @@
34 * 7.13 34 * 7.13
35 * - make max number of background requests and congestion threshold 35 * - make max number of background requests and congestion threshold
36 * tunables 36 * tunables
37 *
38 * 7.14
39 * - add splice support to fuse device
37 */ 40 */
38 41
39#ifndef _LINUX_FUSE_H 42#ifndef _LINUX_FUSE_H
@@ -65,7 +68,7 @@
65#define FUSE_KERNEL_VERSION 7 68#define FUSE_KERNEL_VERSION 7
66 69
67/** Minor version number of this interface */ 70/** Minor version number of this interface */
68#define FUSE_KERNEL_MINOR_VERSION 13 71#define FUSE_KERNEL_MINOR_VERSION 14
69 72
70/** The node ID of the root inode */ 73/** The node ID of the root inode */
71#define FUSE_ROOT_ID 1 74#define FUSE_ROOT_ID 1