diff options
-rw-r--r-- | Documentation/filesystems/fuse.txt | 341 | ||||
-rw-r--r-- | fs/fuse/Makefile | 2 | ||||
-rw-r--r-- | fs/fuse/dev.c | 884 | ||||
-rw-r--r-- | fs/fuse/fuse_i.h | 223 | ||||
-rw-r--r-- | fs/fuse/inode.c | 58 | ||||
-rw-r--r-- | include/linux/fuse.h | 36 |
6 files changed, 1537 insertions, 7 deletions
diff --git a/Documentation/filesystems/fuse.txt b/Documentation/filesystems/fuse.txt new file mode 100644 index 000000000000..83f96cf56960 --- /dev/null +++ b/Documentation/filesystems/fuse.txt | |||
@@ -0,0 +1,341 @@ | |||
1 | Definitions | ||
2 | ~~~~~~~~~~~ | ||
3 | |||
4 | Userspace filesystem: | ||
5 | |||
6 | A filesystem in which data and metadata are provided by an ordinary | ||
7 | userspace process. The filesystem can be accessed normally through | ||
8 | the kernel interface. | ||
9 | |||
10 | Filesystem daemon: | ||
11 | |||
12 | The process(es) providing the data and metadata of the filesystem. | ||
13 | |||
14 | Non-privileged mount (or user mount): | ||
15 | |||
16 | A userspace filesystem mounted by a non-privileged (non-root) user. | ||
17 | The filesystem daemon is running with the privileges of the mounting | ||
18 | user. NOTE: this is not the same as mounts allowed with the "user" | ||
19 | option in /etc/fstab, which is not discussed here. | ||
20 | |||
21 | Mount owner: | ||
22 | |||
23 | The user who does the mounting. | ||
24 | |||
25 | User: | ||
26 | |||
27 | The user who is performing filesystem operations. | ||
28 | |||
29 | What is FUSE? | ||
30 | ~~~~~~~~~~~~~ | ||
31 | |||
32 | FUSE is a userspace filesystem framework. It consists of a kernel | ||
33 | module (fuse.ko), a userspace library (libfuse.*) and a mount utility | ||
34 | (fusermount). | ||
35 | |||
36 | One of the most important features of FUSE is allowing secure, | ||
37 | non-privileged mounts. This opens up new possibilities for the use of | ||
38 | filesystems. A good example is sshfs: a secure network filesystem | ||
39 | using the sftp protocol. | ||
40 | |||
41 | The userspace library and utilities are available from the FUSE | ||
42 | homepage: | ||
43 | |||
44 | http://fuse.sourceforge.net/ | ||
45 | |||
46 | Mount options | ||
47 | ~~~~~~~~~~~~~ | ||
48 | |||
49 | 'fd=N' | ||
50 | |||
51 | The file descriptor to use for communication between the userspace | ||
52 | filesystem and the kernel. The file descriptor must have been | ||
53 | obtained by opening the FUSE device ('/dev/fuse'). | ||
54 | |||
55 | 'rootmode=M' | ||
56 | |||
57 | The file mode of the filesystem's root in octal representation. | ||
58 | |||
59 | 'user_id=N' | ||
60 | |||
61 | The numeric user id of the mount owner. | ||
62 | |||
63 | 'group_id=N' | ||
64 | |||
65 | The numeric group id of the mount owner. | ||
66 | |||
67 | 'default_permissions' | ||
68 | |||
69 | By default FUSE doesn't check file access permissions, the | ||
70 | filesystem is free to implement its access policy or leave it to | ||
71 | the underlying file access mechanism (e.g. in case of network | ||
72 | filesystems). This option enables permission checking, restricting | ||
73 | access based on file mode. This option is usually useful | ||
74 | together with the 'allow_other' mount option. | ||
75 | |||
76 | 'allow_other' | ||
77 | |||
78 | This option overrides the security measure restricting file access | ||
79 | to the user mounting the filesystem. This option is by default only | ||
80 | allowed to root, but this restriction can be removed with a | ||
81 | (userspace) configuration option. | ||
82 | |||
83 | 'kernel_cache' | ||
84 | |||
85 | This option disables flushing the cache of the file contents on | ||
86 | every open(). This should only be enabled on filesystems, where the | ||
87 | file data is never changed externally (not through the mounted FUSE | ||
88 | filesystem). Thus it is not suitable for network filesystems and | ||
89 | other "intermediate" filesystems. | ||
90 | |||
91 | NOTE: if this option is not specified (and neither 'direct_io') data | ||
92 | is still cached after the open(), so a read() system call will not | ||
93 | always initiate a read operation. | ||
94 | |||
95 | 'direct_io' | ||
96 | |||
97 | This option disables the use of page cache (file content cache) in | ||
98 | the kernel for this filesystem. This has several effects: | ||
99 | |||
100 | - Each read() or write() system call will initiate one or more | ||
101 | read or write operations, data will not be cached in the | ||
102 | kernel. | ||
103 | |||
104 | - The return value of the read() and write() system calls will | ||
105 | correspond to the return values of the read and write | ||
106 | operations. This is useful for example if the file size is not | ||
107 | known in advance (before reading it). | ||
108 | |||
109 | 'max_read=N' | ||
110 | |||
111 | With this option the maximum size of read operations can be set. | ||
112 | The default is infinite. Note that the size of read requests is | ||
113 | limited anyway to 32 pages (which is 128kbyte on i386). | ||
114 | |||
115 | How do non-privileged mounts work? | ||
116 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
117 | |||
118 | Since the mount() system call is a privileged operation, a helper | ||
119 | program (fusermount) is needed, which is installed setuid root. | ||
120 | |||
121 | The implication of providing non-privileged mounts is that the mount | ||
122 | owner must not be able to use this capability to compromise the | ||
123 | system. Obvious requirements arising from this are: | ||
124 | |||
125 | A) mount owner should not be able to get elevated privileges with the | ||
126 | help of the mounted filesystem | ||
127 | |||
128 | B) mount owner should not get illegitimate access to information from | ||
129 | other users' and the super user's processes | ||
130 | |||
131 | C) mount owner should not be able to induce undesired behavior in | ||
132 | other users' or the super user's processes | ||
133 | |||
134 | How are requirements fulfilled? | ||
135 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
136 | |||
137 | A) The mount owner could gain elevated privileges by either: | ||
138 | |||
139 | 1) creating a filesystem containing a device file, then opening | ||
140 | this device | ||
141 | |||
142 | 2) creating a filesystem containing a suid or sgid application, | ||
143 | then executing this application | ||
144 | |||
145 | The solution is not to allow opening device files and ignore | ||
146 | setuid and setgid bits when executing programs. To ensure this | ||
147 | fusermount always adds "nosuid" and "nodev" to the mount options | ||
148 | for non-privileged mounts. | ||
149 | |||
150 | B) If another user is accessing files or directories in the | ||
151 | filesystem, the filesystem daemon serving requests can record the | ||
152 | exact sequence and timing of operations performed. This | ||
153 | information is otherwise inaccessible to the mount owner, so this | ||
154 | counts as an information leak. | ||
155 | |||
156 | The solution to this problem will be presented in point 2) of C). | ||
157 | |||
158 | C) There are several ways in which the mount owner can induce | ||
159 | undesired behavior in other users' processes, such as: | ||
160 | |||
161 | 1) mounting a filesystem over a file or directory which the mount | ||
162 | owner could otherwise not be able to modify (or could only | ||
163 | make limited modifications). | ||
164 | |||
165 | This is solved in fusermount, by checking the access | ||
166 | permissions on the mountpoint and only allowing the mount if | ||
167 | the mount owner can do unlimited modification (has write | ||
168 | access to the mountpoint, and mountpoint is not a "sticky" | ||
169 | directory) | ||
170 | |||
171 | 2) Even if 1) is solved the mount owner can change the behavior | ||
172 | of other users' processes. | ||
173 | |||
174 | i) It can slow down or indefinitely delay the execution of a | ||
175 | filesystem operation creating a DoS against the user or the | ||
176 | whole system. For example a suid application locking a | ||
177 | system file, and then accessing a file on the mount owner's | ||
178 | filesystem could be stopped, and thus causing the system | ||
179 | file to be locked forever. | ||
180 | |||
181 | ii) It can present files or directories of unlimited length, or | ||
182 | directory structures of unlimited depth, possibly causing a | ||
183 | system process to eat up diskspace, memory or other | ||
184 | resources, again causing DoS. | ||
185 | |||
186 | The solution to this as well as B) is not to allow processes | ||
187 | to access the filesystem, which could otherwise not be | ||
188 | monitored or manipulated by the mount owner. Since if the | ||
189 | mount owner can ptrace a process, it can do all of the above | ||
190 | without using a FUSE mount, the same criteria as used in | ||
191 | ptrace can be used to check if a process is allowed to access | ||
192 | the filesystem or not. | ||
193 | |||
194 | Note that the ptrace check is not strictly necessary to | ||
195 | prevent C/2/i, it is enough to check if mount owner has enough | ||
196 | privilege to send signal to the process accessing the | ||
197 | filesystem, since SIGSTOP can be used to get a similar effect. | ||
198 | |||
199 | I think these limitations are unacceptable? | ||
200 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
201 | |||
202 | If a sysadmin trusts the users enough, or can ensure through other | ||
203 | measures, that system processes will never enter non-privileged | ||
204 | mounts, it can relax the last limitation with a "user_allow_other" | ||
205 | config option. If this config option is set, the mounting user can | ||
206 | add the "allow_other" mount option which disables the check for other | ||
207 | users' processes. | ||
208 | |||
209 | Kernel - userspace interface | ||
210 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
211 | |||
212 | The following diagram shows how a filesystem operation (in this | ||
213 | example unlink) is performed in FUSE. | ||
214 | |||
215 | NOTE: everything in this description is greatly simplified | ||
216 | |||
217 | | "rm /mnt/fuse/file" | FUSE filesystem daemon | ||
218 | | | | ||
219 | | | >sys_read() | ||
220 | | | >fuse_dev_read() | ||
221 | | | >request_wait() | ||
222 | | | [sleep on fc->waitq] | ||
223 | | | | ||
224 | | >sys_unlink() | | ||
225 | | >fuse_unlink() | | ||
226 | | [get request from | | ||
227 | | fc->unused_list] | | ||
228 | | >request_send() | | ||
229 | | [queue req on fc->pending] | | ||
230 | | [wake up fc->waitq] | [woken up] | ||
231 | | >request_wait_answer() | | ||
232 | | [sleep on req->waitq] | | ||
233 | | | <request_wait() | ||
234 | | | [remove req from fc->pending] | ||
235 | | | [copy req to read buffer] | ||
236 | | | [add req to fc->processing] | ||
237 | | | <fuse_dev_read() | ||
238 | | | <sys_read() | ||
239 | | | | ||
240 | | | [perform unlink] | ||
241 | | | | ||
242 | | | >sys_write() | ||
243 | | | >fuse_dev_write() | ||
244 | | | [look up req in fc->processing] | ||
245 | | | [remove from fc->processing] | ||
246 | | | [copy write buffer to req] | ||
247 | | [woken up] | [wake up req->waitq] | ||
248 | | | <fuse_dev_write() | ||
249 | | | <sys_write() | ||
250 | | <request_wait_answer() | | ||
251 | | <request_send() | | ||
252 | | [add request to | | ||
253 | | fc->unused_list] | | ||
254 | | <fuse_unlink() | | ||
255 | | <sys_unlink() | | ||
256 | |||
257 | There are a couple of ways in which to deadlock a FUSE filesystem. | ||
258 | Since we are talking about unprivileged userspace programs, | ||
259 | something must be done about these. | ||
260 | |||
261 | Scenario 1 - Simple deadlock | ||
262 | ----------------------------- | ||
263 | |||
264 | | "rm /mnt/fuse/file" | FUSE filesystem daemon | ||
265 | | | | ||
266 | | >sys_unlink("/mnt/fuse/file") | | ||
267 | | [acquire inode semaphore | | ||
268 | | for "file"] | | ||
269 | | >fuse_unlink() | | ||
270 | | [sleep on req->waitq] | | ||
271 | | | <sys_read() | ||
272 | | | >sys_unlink("/mnt/fuse/file") | ||
273 | | | [acquire inode semaphore | ||
274 | | | for "file"] | ||
275 | | | *DEADLOCK* | ||
276 | |||
277 | The solution for this is to allow requests to be interrupted while | ||
278 | they are in userspace: | ||
279 | |||
280 | | [interrupted by signal] | | ||
281 | | <fuse_unlink() | | ||
282 | | [release semaphore] | [semaphore acquired] | ||
283 | | <sys_unlink() | | ||
284 | | | >fuse_unlink() | ||
285 | | | [queue req on fc->pending] | ||
286 | | | [wake up fc->waitq] | ||
287 | | | [sleep on req->waitq] | ||
288 | |||
289 | If the filesystem daemon was single threaded, this will stop here, | ||
290 | since there's no other thread to dequeue and execute the request. | ||
291 | In this case the solution is to kill the FUSE daemon as well. If | ||
292 | there are multiple serving threads, you just have to kill them as | ||
293 | long as any remain. | ||
294 | |||
295 | Moral: a filesystem which deadlocks, can soon find itself dead. | ||
296 | |||
297 | Scenario 2 - Tricky deadlock | ||
298 | ---------------------------- | ||
299 | |||
300 | This one needs a carefully crafted filesystem. It's a variation on | ||
301 | the above, only the call back to the filesystem is not explicit, | ||
302 | but is caused by a pagefault. | ||
303 | |||
304 | | Kamikaze filesystem thread 1 | Kamikaze filesystem thread 2 | ||
305 | | | | ||
306 | | [fd = open("/mnt/fuse/file")] | [request served normally] | ||
307 | | [mmap fd to 'addr'] | | ||
308 | | [close fd] | [FLUSH triggers 'magic' flag] | ||
309 | | [read a byte from addr] | | ||
310 | | >do_page_fault() | | ||
311 | | [find or create page] | | ||
312 | | [lock page] | | ||
313 | | >fuse_readpage() | | ||
314 | | [queue READ request] | | ||
315 | | [sleep on req->waitq] | | ||
316 | | | [read request to buffer] | ||
317 | | | [create reply header before addr] | ||
318 | | | >sys_write(addr - headerlength) | ||
319 | | | >fuse_dev_write() | ||
320 | | | [look up req in fc->processing] | ||
321 | | | [remove from fc->processing] | ||
322 | | | [copy write buffer to req] | ||
323 | | | >do_page_fault() | ||
324 | | | [find or create page] | ||
325 | | | [lock page] | ||
326 | | | * DEADLOCK * | ||
327 | |||
328 | Solution is again to let the request be interrupted (not | ||
329 | elaborated further). | ||
330 | |||
331 | An additional problem is that while the write buffer is being | ||
332 | copied to the request, the request must not be interrupted. This | ||
333 | is because the destination address of the copy may not be valid | ||
334 | after the request is interrupted. | ||
335 | |||
336 | This is solved with doing the copy atomically, and allowing | ||
337 | interruption while the page(s) belonging to the write buffer are | ||
338 | faulted with get_user_pages(). The 'req->locked' flag indicates | ||
339 | when the copy is taking place, and interruption is delayed until | ||
340 | this flag is unset. | ||
341 | |||
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 9c3e4cc7b1a6..21021c356481 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile | |||
@@ -4,4 +4,4 @@ | |||
4 | 4 | ||
5 | obj-$(CONFIG_FUSE_FS) += fuse.o | 5 | obj-$(CONFIG_FUSE_FS) += fuse.o |
6 | 6 | ||
7 | fuse-objs := inode.o | 7 | fuse-objs := dev.o inode.o |
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c new file mode 100644 index 000000000000..9aaf10a6588f --- /dev/null +++ b/fs/fuse/dev.c | |||
@@ -0,0 +1,884 @@ | |||
1 | /* | ||
2 | FUSE: Filesystem in Userspace | ||
3 | Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> | ||
4 | |||
5 | This program can be distributed under the terms of the GNU GPL. | ||
6 | See the file COPYING. | ||
7 | */ | ||
8 | |||
9 | #include "fuse_i.h" | ||
10 | |||
11 | #include <linux/init.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/poll.h> | ||
14 | #include <linux/uio.h> | ||
15 | #include <linux/miscdevice.h> | ||
16 | #include <linux/pagemap.h> | ||
17 | #include <linux/file.h> | ||
18 | #include <linux/slab.h> | ||
19 | |||
20 | MODULE_ALIAS_MISCDEV(FUSE_MINOR); | ||
21 | |||
/* Slab cache that fuse_request_alloc() allocates struct fuse_req objects
   from and fuse_request_free() returns them to. */
22 | static kmem_cache_t *fuse_req_cachep; | ||
23 | |||
24 | static inline struct fuse_conn *fuse_get_conn(struct file *file) | ||
25 | { | ||
26 | struct fuse_conn *fc; | ||
27 | spin_lock(&fuse_lock); | ||
28 | fc = file->private_data; | ||
29 | if (fc && !fc->sb) | ||
30 | fc = NULL; | ||
31 | spin_unlock(&fuse_lock); | ||
32 | return fc; | ||
33 | } | ||
34 | |||
35 | static inline void fuse_request_init(struct fuse_req *req) | ||
36 | { | ||
37 | memset(req, 0, sizeof(*req)); | ||
38 | INIT_LIST_HEAD(&req->list); | ||
39 | init_waitqueue_head(&req->waitq); | ||
40 | atomic_set(&req->count, 1); | ||
41 | } | ||
42 | |||
43 | struct fuse_req *fuse_request_alloc(void) | ||
44 | { | ||
45 | struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, SLAB_KERNEL); | ||
46 | if (req) | ||
47 | fuse_request_init(req); | ||
48 | return req; | ||
49 | } | ||
50 | |||
51 | void fuse_request_free(struct fuse_req *req) | ||
52 | { | ||
53 | kmem_cache_free(fuse_req_cachep, req); | ||
54 | } | ||
55 | |||
56 | static inline void block_sigs(sigset_t *oldset) | ||
57 | { | ||
58 | sigset_t mask; | ||
59 | |||
60 | siginitsetinv(&mask, sigmask(SIGKILL)); | ||
61 | sigprocmask(SIG_BLOCK, &mask, oldset); | ||
62 | } | ||
63 | |||
64 | static inline void restore_sigs(sigset_t *oldset) | ||
65 | { | ||
66 | sigprocmask(SIG_SETMASK, oldset, NULL); | ||
67 | } | ||
68 | |||
69 | void fuse_reset_request(struct fuse_req *req) | ||
70 | { | ||
71 | int preallocated = req->preallocated; | ||
72 | BUG_ON(atomic_read(&req->count) != 1); | ||
73 | fuse_request_init(req); | ||
74 | req->preallocated = preallocated; | ||
75 | } | ||
76 | |||
77 | static void __fuse_get_request(struct fuse_req *req) | ||
78 | { | ||
79 | atomic_inc(&req->count); | ||
80 | } | ||
81 | |||
82 | /* Must be called with > 1 refcount */ | ||
83 | static void __fuse_put_request(struct fuse_req *req) | ||
84 | { | ||
85 | BUG_ON(atomic_read(&req->count) < 2); | ||
86 | atomic_dec(&req->count); | ||
87 | } | ||
88 | |||
89 | static struct fuse_req *do_get_request(struct fuse_conn *fc) | ||
90 | { | ||
91 | struct fuse_req *req; | ||
92 | |||
93 | spin_lock(&fuse_lock); | ||
94 | BUG_ON(list_empty(&fc->unused_list)); | ||
95 | req = list_entry(fc->unused_list.next, struct fuse_req, list); | ||
96 | list_del_init(&req->list); | ||
97 | spin_unlock(&fuse_lock); | ||
98 | fuse_request_init(req); | ||
99 | req->preallocated = 1; | ||
100 | req->in.h.uid = current->fsuid; | ||
101 | req->in.h.gid = current->fsgid; | ||
102 | req->in.h.pid = current->pid; | ||
103 | return req; | ||
104 | } | ||
105 | |||
106 | struct fuse_req *fuse_get_request(struct fuse_conn *fc) | ||
107 | { | ||
108 | if (down_interruptible(&fc->outstanding_sem)) | ||
109 | return NULL; | ||
110 | return do_get_request(fc); | ||
111 | } | ||
112 | |||
113 | /* | ||
114 | * Non-interruptible version of the above function is for operations | ||
115 | * which can't legally return -ERESTART{SYS,NOINTR}. This can still | ||
116 | * return NULL, but only in case the signal is SIGKILL. | ||
117 | */ | ||
118 | struct fuse_req *fuse_get_request_nonint(struct fuse_conn *fc) | ||
119 | { | ||
120 | int intr; | ||
121 | sigset_t oldset; | ||
122 | |||
123 | block_sigs(&oldset); | ||
124 | intr = down_interruptible(&fc->outstanding_sem); | ||
125 | restore_sigs(&oldset); | ||
126 | return intr ? NULL : do_get_request(fc); | ||
127 | } | ||
128 | |||
129 | static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) | ||
130 | { | ||
131 | spin_lock(&fuse_lock); | ||
132 | if (req->preallocated) | ||
133 | list_add(&req->list, &fc->unused_list); | ||
134 | else | ||
135 | fuse_request_free(req); | ||
136 | |||
137 | /* If we are in debt decrease that first */ | ||
138 | if (fc->outstanding_debt) | ||
139 | fc->outstanding_debt--; | ||
140 | else | ||
141 | up(&fc->outstanding_sem); | ||
142 | spin_unlock(&fuse_lock); | ||
143 | } | ||
144 | |||
145 | void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) | ||
146 | { | ||
147 | if (atomic_dec_and_test(&req->count)) | ||
148 | fuse_putback_request(fc, req); | ||
149 | } | ||
150 | |||
151 | /* | ||
152 | * This function is called when a request is finished. Either a reply | ||
153 | * has arrived or it was interrupted (and not yet sent) or some error | ||
154 | * occured during communication with userspace, or the device file was | ||
155 | * closed. It decreases the referece count for the request. In case | ||
156 | * of a background request the referece to the stored objects are | ||
157 | * released. The requester thread is woken up (if still waiting), and | ||
158 | * finally the request is either freed or put on the unused_list | ||
159 | * | ||
160 | * Called with fuse_lock, unlocks it | ||
161 | */ | ||
162 | static void request_end(struct fuse_conn *fc, struct fuse_req *req) | ||
163 | { | ||
164 | int putback; | ||
165 | req->finished = 1; | ||
166 | putback = atomic_dec_and_test(&req->count); | ||
167 | spin_unlock(&fuse_lock); | ||
168 | if (req->background) { | ||
169 | if (req->inode) | ||
170 | iput(req->inode); | ||
171 | if (req->inode2) | ||
172 | iput(req->inode2); | ||
173 | if (req->file) | ||
174 | fput(req->file); | ||
175 | } | ||
176 | wake_up(&req->waitq); | ||
177 | if (req->in.h.opcode == FUSE_INIT) { | ||
178 | int i; | ||
179 | |||
180 | if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION) | ||
181 | fc->conn_error = 1; | ||
182 | |||
183 | /* After INIT reply is received other requests can go | ||
184 | out. So do (FUSE_MAX_OUTSTANDING - 1) number of | ||
185 | up()s on outstanding_sem. The last up() is done in | ||
186 | fuse_putback_request() */ | ||
187 | for (i = 1; i < FUSE_MAX_OUTSTANDING; i++) | ||
188 | up(&fc->outstanding_sem); | ||
189 | } | ||
190 | if (putback) | ||
191 | fuse_putback_request(fc, req); | ||
192 | } | ||
193 | |||
194 | static void background_request(struct fuse_req *req) | ||
195 | { | ||
196 | /* Need to get hold of the inode(s) and/or file used in the | ||
197 | request, so FORGET and RELEASE are not sent too early */ | ||
198 | req->background = 1; | ||
199 | if (req->inode) | ||
200 | req->inode = igrab(req->inode); | ||
201 | if (req->inode2) | ||
202 | req->inode2 = igrab(req->inode2); | ||
203 | if (req->file) | ||
204 | get_file(req->file); | ||
205 | } | ||
206 | |||
207 | static int request_wait_answer_nonint(struct fuse_req *req) | ||
208 | { | ||
209 | int err; | ||
210 | sigset_t oldset; | ||
211 | block_sigs(&oldset); | ||
212 | err = wait_event_interruptible(req->waitq, req->finished); | ||
213 | restore_sigs(&oldset); | ||
214 | return err; | ||
215 | } | ||
216 | |||
217 | /* Called with fuse_lock held. Releases, and then reacquires it. */ | ||
218 | static void request_wait_answer(struct fuse_req *req, int interruptible) | ||
219 | { | ||
220 | int intr; | ||
221 | |||
222 | spin_unlock(&fuse_lock); | ||
223 | if (interruptible) | ||
224 | intr = wait_event_interruptible(req->waitq, req->finished); | ||
225 | else | ||
226 | intr = request_wait_answer_nonint(req); | ||
227 | spin_lock(&fuse_lock); | ||
228 | if (intr && interruptible && req->sent) { | ||
229 | /* If request is already in userspace, only allow KILL | ||
230 | signal to interrupt */ | ||
231 | spin_unlock(&fuse_lock); | ||
232 | intr = request_wait_answer_nonint(req); | ||
233 | spin_lock(&fuse_lock); | ||
234 | } | ||
235 | if (!intr) | ||
236 | return; | ||
237 | |||
238 | if (!interruptible || req->sent) | ||
239 | req->out.h.error = -EINTR; | ||
240 | else | ||
241 | req->out.h.error = -ERESTARTNOINTR; | ||
242 | |||
243 | req->interrupted = 1; | ||
244 | if (req->locked) { | ||
245 | /* This is uninterruptible sleep, because data is | ||
246 | being copied to/from the buffers of req. During | ||
247 | locked state, there mustn't be any filesystem | ||
248 | operation (e.g. page fault), since that could lead | ||
249 | to deadlock */ | ||
250 | spin_unlock(&fuse_lock); | ||
251 | wait_event(req->waitq, !req->locked); | ||
252 | spin_lock(&fuse_lock); | ||
253 | } | ||
254 | if (!req->sent && !list_empty(&req->list)) { | ||
255 | list_del(&req->list); | ||
256 | __fuse_put_request(req); | ||
257 | } else if (!req->finished && req->sent) | ||
258 | background_request(req); | ||
259 | } | ||
260 | |||
261 | static unsigned len_args(unsigned numargs, struct fuse_arg *args) | ||
262 | { | ||
263 | unsigned nbytes = 0; | ||
264 | unsigned i; | ||
265 | |||
266 | for (i = 0; i < numargs; i++) | ||
267 | nbytes += args[i].size; | ||
268 | |||
269 | return nbytes; | ||
270 | } | ||
271 | |||
272 | static void queue_request(struct fuse_conn *fc, struct fuse_req *req) | ||
273 | { | ||
274 | fc->reqctr++; | ||
275 | /* zero is special */ | ||
276 | if (fc->reqctr == 0) | ||
277 | fc->reqctr = 1; | ||
278 | req->in.h.unique = fc->reqctr; | ||
279 | req->in.h.len = sizeof(struct fuse_in_header) + | ||
280 | len_args(req->in.numargs, (struct fuse_arg *) req->in.args); | ||
281 | if (!req->preallocated) { | ||
282 | /* If request is not preallocated (either FORGET or | ||
283 | RELEASE), then still decrease outstanding_sem, so | ||
284 | user can't open infinite number of files while not | ||
285 | processing the RELEASE requests. However for | ||
286 | efficiency do it without blocking, so if down() | ||
287 | would block, just increase the debt instead */ | ||
288 | if (down_trylock(&fc->outstanding_sem)) | ||
289 | fc->outstanding_debt++; | ||
290 | } | ||
291 | list_add_tail(&req->list, &fc->pending); | ||
292 | wake_up(&fc->waitq); | ||
293 | } | ||
294 | |||
/*
 * Send a request that expects a reply and wait for the answer.
 *
 * If the connection has no device file the request fails with
 * -ENOTCONN; if the connection is in error it fails with
 * -ECONNREFUSED.  Otherwise the request is queued and the caller sleeps
 * in request_wait_answer(), interruptibly or not as requested.
 */
static void request_send_wait(struct fuse_conn *fc, struct fuse_req *req,
			      int interruptible)
{
	req->isreply = 1;

	spin_lock(&fuse_lock);
	if (fc->file && !fc->conn_error) {
		queue_request(fc, req);
		/* acquire extra reference, since request is still needed
		   after request_end() */
		__fuse_get_request(req);

		request_wait_answer(req, interruptible);
	} else if (!fc->file) {
		req->out.h.error = -ENOTCONN;
	} else {
		req->out.h.error = -ECONNREFUSED;
	}
	spin_unlock(&fuse_lock);
}
314 | |||
/* Send a request and wait for its reply; the wait is interruptible. */
void request_send(struct fuse_conn *fc, struct fuse_req *req)
{
	request_send_wait(fc, req, 1);
}
319 | |||
/*
 * Counterpart of request_send() for operations which can't legally
 * return -ERESTART{SYS,NOINTR}.  The wait can still be interrupted,
 * but only with SIGKILL.
 */
void request_send_nonint(struct fuse_conn *fc, struct fuse_req *req)
{
	request_send_wait(fc, req, 0);
}
329 | |||
330 | static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) | ||
331 | { | ||
332 | spin_lock(&fuse_lock); | ||
333 | if (fc->file) { | ||
334 | queue_request(fc, req); | ||
335 | spin_unlock(&fuse_lock); | ||
336 | } else { | ||
337 | req->out.h.error = -ENOTCONN; | ||
338 | request_end(fc, req); | ||
339 | } | ||
340 | } | ||
341 | |||
342 | void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) | ||
343 | { | ||
344 | req->isreply = 0; | ||
345 | request_send_nowait(fc, req); | ||
346 | } | ||
347 | |||
348 | void request_send_background(struct fuse_conn *fc, struct fuse_req *req) | ||
349 | { | ||
350 | req->isreply = 1; | ||
351 | background_request(req); | ||
352 | request_send_nowait(fc, req); | ||
353 | } | ||
354 | |||
355 | void fuse_send_init(struct fuse_conn *fc) | ||
356 | { | ||
357 | /* This is called from fuse_read_super() so there's guaranteed | ||
358 | to be a request available */ | ||
359 | struct fuse_req *req = do_get_request(fc); | ||
360 | struct fuse_init_in_out *arg = &req->misc.init_in_out; | ||
361 | arg->major = FUSE_KERNEL_VERSION; | ||
362 | arg->minor = FUSE_KERNEL_MINOR_VERSION; | ||
363 | req->in.h.opcode = FUSE_INIT; | ||
364 | req->in.numargs = 1; | ||
365 | req->in.args[0].size = sizeof(*arg); | ||
366 | req->in.args[0].value = arg; | ||
367 | req->out.numargs = 1; | ||
368 | req->out.args[0].size = sizeof(*arg); | ||
369 | req->out.args[0].value = arg; | ||
370 | request_send_background(fc, req); | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * Lock the request. Up to the next unlock_request() there mustn't be | ||
375 | * anything that could cause a page-fault. If the request was already | ||
376 | * interrupted bail out. | ||
377 | */ | ||
378 | static inline int lock_request(struct fuse_req *req) | ||
379 | { | ||
380 | int err = 0; | ||
381 | if (req) { | ||
382 | spin_lock(&fuse_lock); | ||
383 | if (req->interrupted) | ||
384 | err = -ENOENT; | ||
385 | else | ||
386 | req->locked = 1; | ||
387 | spin_unlock(&fuse_lock); | ||
388 | } | ||
389 | return err; | ||
390 | } | ||
391 | |||
392 | /* | ||
393 | * Unlock request. If it was interrupted during being locked, the | ||
394 | * requester thread is currently waiting for it to be unlocked, so | ||
395 | * wake it up. | ||
396 | */ | ||
397 | static inline void unlock_request(struct fuse_req *req) | ||
398 | { | ||
399 | if (req) { | ||
400 | spin_lock(&fuse_lock); | ||
401 | req->locked = 0; | ||
402 | if (req->interrupted) | ||
403 | wake_up(&req->waitq); | ||
404 | spin_unlock(&fuse_lock); | ||
405 | } | ||
406 | } | ||
407 | |||
/*
 * State of a copy between a request and a userspace iovec buffer.
 *
 *  write    - non-zero when copying into the userspace buffer; also
 *             passed as the 'write' argument of get_user_pages()
 *  req      - request locked/unlocked around the copy (may be NULL)
 *  iov      - next iovec segment not yet started
 *  nr_segs  - number of segments left at 'iov'
 *  seglen   - bytes left in the current segment
 *  addr     - userspace address of the next byte in the current segment
 *  pg       - userspace page currently pinned by fuse_copy_fill()
 *  mapaddr  - kernel mapping of 'pg', NULL when nothing is mapped
 *  buf      - current position inside the mapping
 *  len      - bytes still usable at 'buf'
 */
408 | struct fuse_copy_state { | ||
409 | int write; | ||
410 | struct fuse_req *req; | ||
411 | const struct iovec *iov; | ||
412 | unsigned long nr_segs; | ||
413 | unsigned long seglen; | ||
414 | unsigned long addr; | ||
415 | struct page *pg; | ||
416 | void *mapaddr; | ||
417 | void *buf; | ||
418 | unsigned len; | ||
419 | }; | ||
420 | |||
421 | static void fuse_copy_init(struct fuse_copy_state *cs, int write, | ||
422 | struct fuse_req *req, const struct iovec *iov, | ||
423 | unsigned long nr_segs) | ||
424 | { | ||
425 | memset(cs, 0, sizeof(*cs)); | ||
426 | cs->write = write; | ||
427 | cs->req = req; | ||
428 | cs->iov = iov; | ||
429 | cs->nr_segs = nr_segs; | ||
430 | } | ||
431 | |||
432 | /* Unmap and put previous page of userspace buffer */ | ||
433 | static inline void fuse_copy_finish(struct fuse_copy_state *cs) | ||
434 | { | ||
435 | if (cs->mapaddr) { | ||
436 | kunmap_atomic(cs->mapaddr, KM_USER0); | ||
437 | if (cs->write) { | ||
438 | flush_dcache_page(cs->pg); | ||
439 | set_page_dirty_lock(cs->pg); | ||
440 | } | ||
441 | put_page(cs->pg); | ||
442 | cs->mapaddr = NULL; | ||
443 | } | ||
444 | } | ||
445 | |||
446 | /* | ||
447 | * Get another pagefull of userspace buffer, and map it to kernel | ||
448 | * address space, and lock request | ||
449 | */ | ||
450 | static int fuse_copy_fill(struct fuse_copy_state *cs) | ||
451 | { | ||
452 | unsigned long offset; | ||
453 | int err; | ||
454 | |||
455 | unlock_request(cs->req); | ||
456 | fuse_copy_finish(cs); | ||
457 | if (!cs->seglen) { | ||
458 | BUG_ON(!cs->nr_segs); | ||
459 | cs->seglen = cs->iov[0].iov_len; | ||
460 | cs->addr = (unsigned long) cs->iov[0].iov_base; | ||
461 | cs->iov ++; | ||
462 | cs->nr_segs --; | ||
463 | } | ||
464 | down_read(¤t->mm->mmap_sem); | ||
465 | err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0, | ||
466 | &cs->pg, NULL); | ||
467 | up_read(¤t->mm->mmap_sem); | ||
468 | if (err < 0) | ||
469 | return err; | ||
470 | BUG_ON(err != 1); | ||
471 | offset = cs->addr % PAGE_SIZE; | ||
472 | cs->mapaddr = kmap_atomic(cs->pg, KM_USER0); | ||
473 | cs->buf = cs->mapaddr + offset; | ||
474 | cs->len = min(PAGE_SIZE - offset, cs->seglen); | ||
475 | cs->seglen -= cs->len; | ||
476 | cs->addr += cs->len; | ||
477 | |||
478 | return lock_request(cs->req); | ||
479 | } | ||
480 | |||
481 | /* Do as much copy to/from userspace buffer as we can */ | ||
482 | static inline int fuse_copy_do(struct fuse_copy_state *cs, void **val, | ||
483 | unsigned *size) | ||
484 | { | ||
485 | unsigned ncpy = min(*size, cs->len); | ||
486 | if (val) { | ||
487 | if (cs->write) | ||
488 | memcpy(cs->buf, *val, ncpy); | ||
489 | else | ||
490 | memcpy(*val, cs->buf, ncpy); | ||
491 | *val += ncpy; | ||
492 | } | ||
493 | *size -= ncpy; | ||
494 | cs->len -= ncpy; | ||
495 | cs->buf += ncpy; | ||
496 | return ncpy; | ||
497 | } | ||
498 | |||
499 | /* | ||
500 | * Copy a page in the request to/from the userspace buffer. Must be | ||
501 | * done atomically | ||
502 | */ | ||
503 | static inline int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, | ||
504 | unsigned offset, unsigned count, int zeroing) | ||
505 | { | ||
506 | if (page && zeroing && count < PAGE_SIZE) { | ||
507 | void *mapaddr = kmap_atomic(page, KM_USER1); | ||
508 | memset(mapaddr, 0, PAGE_SIZE); | ||
509 | kunmap_atomic(mapaddr, KM_USER1); | ||
510 | } | ||
511 | while (count) { | ||
512 | int err; | ||
513 | if (!cs->len && (err = fuse_copy_fill(cs))) | ||
514 | return err; | ||
515 | if (page) { | ||
516 | void *mapaddr = kmap_atomic(page, KM_USER1); | ||
517 | void *buf = mapaddr + offset; | ||
518 | offset += fuse_copy_do(cs, &buf, &count); | ||
519 | kunmap_atomic(mapaddr, KM_USER1); | ||
520 | } else | ||
521 | offset += fuse_copy_do(cs, NULL, &count); | ||
522 | } | ||
523 | if (page && !cs->write) | ||
524 | flush_dcache_page(page); | ||
525 | return 0; | ||
526 | } | ||
527 | |||
528 | /* Copy pages in the request to/from userspace buffer */ | ||
529 | static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, | ||
530 | int zeroing) | ||
531 | { | ||
532 | unsigned i; | ||
533 | struct fuse_req *req = cs->req; | ||
534 | unsigned offset = req->page_offset; | ||
535 | unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); | ||
536 | |||
537 | for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { | ||
538 | struct page *page = req->pages[i]; | ||
539 | int err = fuse_copy_page(cs, page, offset, count, zeroing); | ||
540 | if (err) | ||
541 | return err; | ||
542 | |||
543 | nbytes -= count; | ||
544 | count = min(nbytes, (unsigned) PAGE_SIZE); | ||
545 | offset = 0; | ||
546 | } | ||
547 | return 0; | ||
548 | } | ||
549 | |||
550 | /* Copy a single argument in the request to/from userspace buffer */ | ||
551 | static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) | ||
552 | { | ||
553 | while (size) { | ||
554 | int err; | ||
555 | if (!cs->len && (err = fuse_copy_fill(cs))) | ||
556 | return err; | ||
557 | fuse_copy_do(cs, &val, &size); | ||
558 | } | ||
559 | return 0; | ||
560 | } | ||
561 | |||
562 | /* Copy request arguments to/from userspace buffer */ | ||
563 | static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, | ||
564 | unsigned argpages, struct fuse_arg *args, | ||
565 | int zeroing) | ||
566 | { | ||
567 | int err = 0; | ||
568 | unsigned i; | ||
569 | |||
570 | for (i = 0; !err && i < numargs; i++) { | ||
571 | struct fuse_arg *arg = &args[i]; | ||
572 | if (i == numargs - 1 && argpages) | ||
573 | err = fuse_copy_pages(cs, arg->size, zeroing); | ||
574 | else | ||
575 | err = fuse_copy_one(cs, arg->value, arg->size); | ||
576 | } | ||
577 | return err; | ||
578 | } | ||
579 | |||
580 | /* Wait until a request is available on the pending list */ | ||
581 | static void request_wait(struct fuse_conn *fc) | ||
582 | { | ||
583 | DECLARE_WAITQUEUE(wait, current); | ||
584 | |||
585 | add_wait_queue_exclusive(&fc->waitq, &wait); | ||
586 | while (fc->sb && list_empty(&fc->pending)) { | ||
587 | set_current_state(TASK_INTERRUPTIBLE); | ||
588 | if (signal_pending(current)) | ||
589 | break; | ||
590 | |||
591 | spin_unlock(&fuse_lock); | ||
592 | schedule(); | ||
593 | spin_lock(&fuse_lock); | ||
594 | } | ||
595 | set_current_state(TASK_RUNNING); | ||
596 | remove_wait_queue(&fc->waitq, &wait); | ||
597 | } | ||
598 | |||
599 | /* | ||
600 | * Read a single request into the userspace filesystem's buffer. This | ||
601 | * function waits until a request is available, then removes it from | ||
602 | * the pending list and copies request data to userspace buffer. If | ||
603 | * no reply is needed (FORGET) or request has been interrupted or | ||
604 | * there was an error during the copying then it's finished by calling | ||
605 | * request_end(). Otherwise add it to the processing list, and set | ||
606 | * the 'sent' flag. | ||
607 | */ | ||
608 | static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, | ||
609 | unsigned long nr_segs, loff_t *off) | ||
610 | { | ||
611 | int err; | ||
612 | struct fuse_conn *fc; | ||
613 | struct fuse_req *req; | ||
614 | struct fuse_in *in; | ||
615 | struct fuse_copy_state cs; | ||
616 | unsigned reqsize; | ||
617 | |||
618 | spin_lock(&fuse_lock); | ||
619 | fc = file->private_data; | ||
620 | err = -EPERM; | ||
621 | if (!fc) | ||
622 | goto err_unlock; | ||
623 | request_wait(fc); | ||
624 | err = -ENODEV; | ||
625 | if (!fc->sb) | ||
626 | goto err_unlock; | ||
627 | err = -ERESTARTSYS; | ||
628 | if (list_empty(&fc->pending)) | ||
629 | goto err_unlock; | ||
630 | |||
631 | req = list_entry(fc->pending.next, struct fuse_req, list); | ||
632 | list_del_init(&req->list); | ||
633 | spin_unlock(&fuse_lock); | ||
634 | |||
635 | in = &req->in; | ||
636 | reqsize = req->in.h.len; | ||
637 | fuse_copy_init(&cs, 1, req, iov, nr_segs); | ||
638 | err = -EINVAL; | ||
639 | if (iov_length(iov, nr_segs) >= reqsize) { | ||
640 | err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); | ||
641 | if (!err) | ||
642 | err = fuse_copy_args(&cs, in->numargs, in->argpages, | ||
643 | (struct fuse_arg *) in->args, 0); | ||
644 | } | ||
645 | fuse_copy_finish(&cs); | ||
646 | |||
647 | spin_lock(&fuse_lock); | ||
648 | req->locked = 0; | ||
649 | if (!err && req->interrupted) | ||
650 | err = -ENOENT; | ||
651 | if (err) { | ||
652 | if (!req->interrupted) | ||
653 | req->out.h.error = -EIO; | ||
654 | request_end(fc, req); | ||
655 | return err; | ||
656 | } | ||
657 | if (!req->isreply) | ||
658 | request_end(fc, req); | ||
659 | else { | ||
660 | req->sent = 1; | ||
661 | list_add_tail(&req->list, &fc->processing); | ||
662 | spin_unlock(&fuse_lock); | ||
663 | } | ||
664 | return reqsize; | ||
665 | |||
666 | err_unlock: | ||
667 | spin_unlock(&fuse_lock); | ||
668 | return err; | ||
669 | } | ||
670 | |||
671 | static ssize_t fuse_dev_read(struct file *file, char __user *buf, | ||
672 | size_t nbytes, loff_t *off) | ||
673 | { | ||
674 | struct iovec iov; | ||
675 | iov.iov_len = nbytes; | ||
676 | iov.iov_base = buf; | ||
677 | return fuse_dev_readv(file, &iov, 1, off); | ||
678 | } | ||
679 | |||
680 | /* Look up request on processing list by unique ID */ | ||
681 | static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) | ||
682 | { | ||
683 | struct list_head *entry; | ||
684 | |||
685 | list_for_each(entry, &fc->processing) { | ||
686 | struct fuse_req *req; | ||
687 | req = list_entry(entry, struct fuse_req, list); | ||
688 | if (req->in.h.unique == unique) | ||
689 | return req; | ||
690 | } | ||
691 | return NULL; | ||
692 | } | ||
693 | |||
694 | static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, | ||
695 | unsigned nbytes) | ||
696 | { | ||
697 | unsigned reqsize = sizeof(struct fuse_out_header); | ||
698 | |||
699 | if (out->h.error) | ||
700 | return nbytes != reqsize ? -EINVAL : 0; | ||
701 | |||
702 | reqsize += len_args(out->numargs, out->args); | ||
703 | |||
704 | if (reqsize < nbytes || (reqsize > nbytes && !out->argvar)) | ||
705 | return -EINVAL; | ||
706 | else if (reqsize > nbytes) { | ||
707 | struct fuse_arg *lastarg = &out->args[out->numargs-1]; | ||
708 | unsigned diffsize = reqsize - nbytes; | ||
709 | if (diffsize > lastarg->size) | ||
710 | return -EINVAL; | ||
711 | lastarg->size -= diffsize; | ||
712 | } | ||
713 | return fuse_copy_args(cs, out->numargs, out->argpages, out->args, | ||
714 | out->page_zeroing); | ||
715 | } | ||
716 | |||
717 | /* | ||
718 | * Write a single reply to a request. First the header is copied from | ||
719 | * the write buffer. The request is then searched on the processing | ||
720 | * list by the unique ID found in the header. If found, then remove | ||
721 | * it from the list and copy the rest of the buffer to the request. | ||
722 | * The request is finished by calling request_end() | ||
723 | */ | ||
724 | static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, | ||
725 | unsigned long nr_segs, loff_t *off) | ||
726 | { | ||
727 | int err; | ||
728 | unsigned nbytes = iov_length(iov, nr_segs); | ||
729 | struct fuse_req *req; | ||
730 | struct fuse_out_header oh; | ||
731 | struct fuse_copy_state cs; | ||
732 | struct fuse_conn *fc = fuse_get_conn(file); | ||
733 | if (!fc) | ||
734 | return -ENODEV; | ||
735 | |||
736 | fuse_copy_init(&cs, 0, NULL, iov, nr_segs); | ||
737 | if (nbytes < sizeof(struct fuse_out_header)) | ||
738 | return -EINVAL; | ||
739 | |||
740 | err = fuse_copy_one(&cs, &oh, sizeof(oh)); | ||
741 | if (err) | ||
742 | goto err_finish; | ||
743 | err = -EINVAL; | ||
744 | if (!oh.unique || oh.error <= -1000 || oh.error > 0 || | ||
745 | oh.len != nbytes) | ||
746 | goto err_finish; | ||
747 | |||
748 | spin_lock(&fuse_lock); | ||
749 | req = request_find(fc, oh.unique); | ||
750 | err = -EINVAL; | ||
751 | if (!req) | ||
752 | goto err_unlock; | ||
753 | |||
754 | list_del_init(&req->list); | ||
755 | if (req->interrupted) { | ||
756 | request_end(fc, req); | ||
757 | fuse_copy_finish(&cs); | ||
758 | return -ENOENT; | ||
759 | } | ||
760 | req->out.h = oh; | ||
761 | req->locked = 1; | ||
762 | cs.req = req; | ||
763 | spin_unlock(&fuse_lock); | ||
764 | |||
765 | err = copy_out_args(&cs, &req->out, nbytes); | ||
766 | fuse_copy_finish(&cs); | ||
767 | |||
768 | spin_lock(&fuse_lock); | ||
769 | req->locked = 0; | ||
770 | if (!err) { | ||
771 | if (req->interrupted) | ||
772 | err = -ENOENT; | ||
773 | } else if (!req->interrupted) | ||
774 | req->out.h.error = -EIO; | ||
775 | request_end(fc, req); | ||
776 | |||
777 | return err ? err : nbytes; | ||
778 | |||
779 | err_unlock: | ||
780 | spin_unlock(&fuse_lock); | ||
781 | err_finish: | ||
782 | fuse_copy_finish(&cs); | ||
783 | return err; | ||
784 | } | ||
785 | |||
786 | static ssize_t fuse_dev_write(struct file *file, const char __user *buf, | ||
787 | size_t nbytes, loff_t *off) | ||
788 | { | ||
789 | struct iovec iov; | ||
790 | iov.iov_len = nbytes; | ||
791 | iov.iov_base = (char __user *) buf; | ||
792 | return fuse_dev_writev(file, &iov, 1, off); | ||
793 | } | ||
794 | |||
795 | static unsigned fuse_dev_poll(struct file *file, poll_table *wait) | ||
796 | { | ||
797 | struct fuse_conn *fc = fuse_get_conn(file); | ||
798 | unsigned mask = POLLOUT | POLLWRNORM; | ||
799 | |||
800 | if (!fc) | ||
801 | return -ENODEV; | ||
802 | |||
803 | poll_wait(file, &fc->waitq, wait); | ||
804 | |||
805 | spin_lock(&fuse_lock); | ||
806 | if (!list_empty(&fc->pending)) | ||
807 | mask |= POLLIN | POLLRDNORM; | ||
808 | spin_unlock(&fuse_lock); | ||
809 | |||
810 | return mask; | ||
811 | } | ||
812 | |||
813 | /* Abort all requests on the given list (pending or processing) */ | ||
814 | static void end_requests(struct fuse_conn *fc, struct list_head *head) | ||
815 | { | ||
816 | while (!list_empty(head)) { | ||
817 | struct fuse_req *req; | ||
818 | req = list_entry(head->next, struct fuse_req, list); | ||
819 | list_del_init(&req->list); | ||
820 | req->out.h.error = -ECONNABORTED; | ||
821 | request_end(fc, req); | ||
822 | spin_lock(&fuse_lock); | ||
823 | } | ||
824 | } | ||
825 | |||
826 | static int fuse_dev_release(struct inode *inode, struct file *file) | ||
827 | { | ||
828 | struct fuse_conn *fc; | ||
829 | |||
830 | spin_lock(&fuse_lock); | ||
831 | fc = file->private_data; | ||
832 | if (fc) { | ||
833 | fc->file = NULL; | ||
834 | end_requests(fc, &fc->pending); | ||
835 | end_requests(fc, &fc->processing); | ||
836 | fuse_release_conn(fc); | ||
837 | } | ||
838 | spin_unlock(&fuse_lock); | ||
839 | return 0; | ||
840 | } | ||
841 | |||
842 | struct file_operations fuse_dev_operations = { | ||
843 | .owner = THIS_MODULE, | ||
844 | .llseek = no_llseek, | ||
845 | .read = fuse_dev_read, | ||
846 | .readv = fuse_dev_readv, | ||
847 | .write = fuse_dev_write, | ||
848 | .writev = fuse_dev_writev, | ||
849 | .poll = fuse_dev_poll, | ||
850 | .release = fuse_dev_release, | ||
851 | }; | ||
852 | |||
853 | static struct miscdevice fuse_miscdevice = { | ||
854 | .minor = FUSE_MINOR, | ||
855 | .name = "fuse", | ||
856 | .fops = &fuse_dev_operations, | ||
857 | }; | ||
858 | |||
859 | int __init fuse_dev_init(void) | ||
860 | { | ||
861 | int err = -ENOMEM; | ||
862 | fuse_req_cachep = kmem_cache_create("fuse_request", | ||
863 | sizeof(struct fuse_req), | ||
864 | 0, 0, NULL, NULL); | ||
865 | if (!fuse_req_cachep) | ||
866 | goto out; | ||
867 | |||
868 | err = misc_register(&fuse_miscdevice); | ||
869 | if (err) | ||
870 | goto out_cache_clean; | ||
871 | |||
872 | return 0; | ||
873 | |||
874 | out_cache_clean: | ||
875 | kmem_cache_destroy(fuse_req_cachep); | ||
876 | out: | ||
877 | return err; | ||
878 | } | ||
879 | |||
880 | void fuse_dev_cleanup(void) | ||
881 | { | ||
882 | misc_deregister(&fuse_miscdevice); | ||
883 | kmem_cache_destroy(fuse_req_cachep); | ||
884 | } | ||
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index eed6e89ce01f..50ad6a0c39bf 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h | |||
@@ -15,6 +15,12 @@ | |||
15 | #include <linux/backing-dev.h> | 15 | #include <linux/backing-dev.h> |
16 | #include <asm/semaphore.h> | 16 | #include <asm/semaphore.h> |
17 | 17 | ||
18 | /** Max number of pages that can be used in a single read request */ | ||
19 | #define FUSE_MAX_PAGES_PER_REQ 32 | ||
20 | |||
21 | /** If more requests are outstanding, then the operation will block */ | ||
22 | #define FUSE_MAX_OUTSTANDING 10 | ||
23 | |||
18 | /** FUSE inode */ | 24 | /** FUSE inode */ |
19 | struct fuse_inode { | 25 | struct fuse_inode { |
20 | /** Inode data */ | 26 | /** Inode data */ |
@@ -28,6 +34,123 @@ struct fuse_inode { | |||
28 | unsigned long i_time; | 34 | unsigned long i_time; |
29 | }; | 35 | }; |
30 | 36 | ||
37 | /** One input argument of a request */ | ||
38 | struct fuse_in_arg { | ||
39 | unsigned size; | ||
40 | const void *value; | ||
41 | }; | ||
42 | |||
43 | /** The request input */ | ||
44 | struct fuse_in { | ||
45 | /** The request header */ | ||
46 | struct fuse_in_header h; | ||
47 | |||
48 | /** True if the data for the last argument is in req->pages */ | ||
49 | unsigned argpages:1; | ||
50 | |||
51 | /** Number of arguments */ | ||
52 | unsigned numargs; | ||
53 | |||
54 | /** Array of arguments */ | ||
55 | struct fuse_in_arg args[3]; | ||
56 | }; | ||
57 | |||
58 | /** One output argument of a request */ | ||
59 | struct fuse_arg { | ||
60 | unsigned size; | ||
61 | void *value; | ||
62 | }; | ||
63 | |||
64 | /** The request output */ | ||
65 | struct fuse_out { | ||
66 | /** Header returned from userspace */ | ||
67 | struct fuse_out_header h; | ||
68 | |||
69 | /** Last argument is variable length (can be shorter than | ||
70 | arg->size) */ | ||
71 | unsigned argvar:1; | ||
72 | |||
73 | /** Last argument is a list of pages to copy data to */ | ||
74 | unsigned argpages:1; | ||
75 | |||
76 | /** Zero partially or not copied pages */ | ||
77 | unsigned page_zeroing:1; | ||
78 | |||
79 | /** Number of arguments */ | ||
80 | unsigned numargs; | ||
81 | |||
82 | /** Array of arguments */ | ||
83 | struct fuse_arg args[3]; | ||
84 | }; | ||
85 | |||
86 | struct fuse_req; | ||
87 | struct fuse_conn; | ||
88 | |||
89 | /** | ||
90 | * A request to the client | ||
91 | */ | ||
92 | struct fuse_req { | ||
93 | /** This can be on either unused_list, pending or processing | ||
94 | lists in fuse_conn */ | ||
95 | struct list_head list; | ||
96 | |||
97 | /** refcount */ | ||
98 | atomic_t count; | ||
99 | |||
100 | /** True if the request has reply */ | ||
101 | unsigned isreply:1; | ||
102 | |||
103 | /** The request is preallocated */ | ||
104 | unsigned preallocated:1; | ||
105 | |||
106 | /** The request was interrupted */ | ||
107 | unsigned interrupted:1; | ||
108 | |||
109 | /** Request is sent in the background */ | ||
110 | unsigned background:1; | ||
111 | |||
112 | /** Data is being copied to/from the request */ | ||
113 | unsigned locked:1; | ||
114 | |||
115 | /** Request has been sent to userspace */ | ||
116 | unsigned sent:1; | ||
117 | |||
118 | /** The request is finished */ | ||
119 | unsigned finished:1; | ||
120 | |||
121 | /** The request input */ | ||
122 | struct fuse_in in; | ||
123 | |||
124 | /** The request output */ | ||
125 | struct fuse_out out; | ||
126 | |||
127 | /** Used to wake up the task waiting for completion of request */ | ||
128 | wait_queue_head_t waitq; | ||
129 | |||
130 | /** Data for asynchronous requests */ | ||
131 | union { | ||
132 | struct fuse_init_in_out init_in_out; | ||
133 | } misc; | ||
134 | |||
135 | /** page vector */ | ||
136 | struct page *pages[FUSE_MAX_PAGES_PER_REQ]; | ||
137 | |||
138 | /** number of pages in vector */ | ||
139 | unsigned num_pages; | ||
140 | |||
141 | /** offset of data on first page */ | ||
142 | unsigned page_offset; | ||
143 | |||
144 | /** Inode used in the request */ | ||
145 | struct inode *inode; | ||
146 | |||
147 | /** Second inode used in the request (or NULL) */ | ||
148 | struct inode *inode2; | ||
149 | |||
150 | /** File used in the request (or NULL) */ | ||
151 | struct file *file; | ||
152 | }; | ||
153 | |||
31 | /** | 154 | /** |
32 | * A Fuse connection. | 155 | * A Fuse connection. |
33 | * | 156 | * |
@@ -39,9 +162,37 @@ struct fuse_conn { | |||
39 | /** The superblock of the mounted filesystem */ | 162 | /** The superblock of the mounted filesystem */ |
40 | struct super_block *sb; | 163 | struct super_block *sb; |
41 | 164 | ||
165 | /** The opened client device */ | ||
166 | struct file *file; | ||
167 | |||
42 | /** The user id for this mount */ | 168 | /** The user id for this mount */ |
43 | uid_t user_id; | 169 | uid_t user_id; |
44 | 170 | ||
171 | /** Readers of the connection are waiting on this */ | ||
172 | wait_queue_head_t waitq; | ||
173 | |||
174 | /** The list of pending requests */ | ||
175 | struct list_head pending; | ||
176 | |||
177 | /** The list of requests being processed */ | ||
178 | struct list_head processing; | ||
179 | |||
180 | /** Controls the maximum number of outstanding requests */ | ||
181 | struct semaphore outstanding_sem; | ||
182 | |||
183 | /** This counts the number of outstanding requests if | ||
184 | outstanding_sem would go negative */ | ||
185 | unsigned outstanding_debt; | ||
186 | |||
187 | /** The list of unused requests */ | ||
188 | struct list_head unused_list; | ||
189 | |||
190 | /** The next unique request id */ | ||
191 | u64 reqctr; | ||
192 | |||
193 | /** Connection failed (version mismatch) */ | ||
194 | unsigned conn_error : 1; | ||
195 | |||
45 | /** Backing dev info */ | 196 | /** Backing dev info */ |
46 | struct backing_dev_info bdi; | 197 | struct backing_dev_info bdi; |
47 | }; | 198 | }; |
@@ -71,13 +222,20 @@ static inline u64 get_node_id(struct inode *inode) | |||
71 | return get_fuse_inode(inode)->nodeid; | 222 | return get_fuse_inode(inode)->nodeid; |
72 | } | 223 | } |
73 | 224 | ||
225 | /** Device operations */ | ||
226 | extern struct file_operations fuse_dev_operations; | ||
227 | |||
74 | /** | 228 | /** |
75 | * This is the single global spinlock which protects FUSE's structures | 229 | * This is the single global spinlock which protects FUSE's structures |
76 | * | 230 | * |
77 | * The following data is protected by this lock: | 231 | * The following data is protected by this lock: |
78 | * | 232 | * |
233 | * - the private_data field of the device file | ||
79 | * - the s_fs_info field of the super block | 234 | * - the s_fs_info field of the super block |
235 | * - unused_list, pending, processing lists in fuse_conn | ||
236 | * - the unique request ID counter reqctr in fuse_conn | ||
80 | * - the sb (super_block) field in fuse_conn | 237 | * - the sb (super_block) field in fuse_conn |
238 | * - the file (device file) field in fuse_conn | ||
81 | */ | 239 | */ |
82 | extern spinlock_t fuse_lock; | 240 | extern spinlock_t fuse_lock; |
83 | 241 | ||
@@ -87,3 +245,68 @@ extern spinlock_t fuse_lock; | |||
87 | */ | 245 | */ |
88 | void fuse_release_conn(struct fuse_conn *fc); | 246 | void fuse_release_conn(struct fuse_conn *fc); |
89 | 247 | ||
248 | /** | ||
249 | * Initialize the client device | ||
250 | */ | ||
251 | int fuse_dev_init(void); | ||
252 | |||
253 | /** | ||
254 | * Cleanup the client device | ||
255 | */ | ||
256 | void fuse_dev_cleanup(void); | ||
257 | |||
258 | /** | ||
259 | * Allocate a request | ||
260 | */ | ||
261 | struct fuse_req *fuse_request_alloc(void); | ||
262 | |||
263 | /** | ||
264 | * Free a request | ||
265 | */ | ||
266 | void fuse_request_free(struct fuse_req *req); | ||
267 | |||
268 | /** | ||
269 | * Reinitialize a request, the preallocated flag is left unmodified | ||
270 | */ | ||
271 | void fuse_reset_request(struct fuse_req *req); | ||
272 | |||
273 | /** | ||
274 | * Reserve a preallocated request | ||
275 | */ | ||
276 | struct fuse_req *fuse_get_request(struct fuse_conn *fc); | ||
277 | |||
278 | /** | ||
279 | * Reserve a preallocated request, only interruptible by SIGKILL | ||
280 | */ | ||
281 | struct fuse_req *fuse_get_request_nonint(struct fuse_conn *fc); | ||
282 | |||
283 | /** | ||
284 | * Decrement reference count of a request. If count goes to zero put | ||
285 | * on unused list (preallocated) or free reqest (not preallocated). | ||
286 | */ | ||
287 | void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); | ||
288 | |||
289 | /** | ||
290 | * Send a request (synchronous, interruptible) | ||
291 | */ | ||
292 | void request_send(struct fuse_conn *fc, struct fuse_req *req); | ||
293 | |||
294 | /** | ||
295 | * Send a request (synchronous, non-interruptible except by SIGKILL) | ||
296 | */ | ||
297 | void request_send_nonint(struct fuse_conn *fc, struct fuse_req *req); | ||
298 | |||
299 | /** | ||
300 | * Send a request with no reply | ||
301 | */ | ||
302 | void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); | ||
303 | |||
304 | /** | ||
305 | * Send a request in the background | ||
306 | */ | ||
307 | void request_send_background(struct fuse_conn *fc, struct fuse_req *req); | ||
308 | |||
309 | /** | ||
310 | * Send the INIT message | ||
311 | */ | ||
312 | void fuse_send_init(struct fuse_conn *fc); | ||
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ea6339c2b6a1..33fad334ba70 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c | |||
@@ -151,6 +151,8 @@ static void fuse_put_super(struct super_block *sb) | |||
151 | mount_count --; | 151 | mount_count --; |
152 | fc->sb = NULL; | 152 | fc->sb = NULL; |
153 | fc->user_id = 0; | 153 | fc->user_id = 0; |
154 | /* Flush all readers on this fs */ | ||
155 | wake_up_all(&fc->waitq); | ||
154 | fuse_release_conn(fc); | 156 | fuse_release_conn(fc); |
155 | *get_fuse_conn_super_p(sb) = NULL; | 157 | *get_fuse_conn_super_p(sb) = NULL; |
156 | spin_unlock(&fuse_lock); | 158 | spin_unlock(&fuse_lock); |
@@ -229,22 +231,51 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) | |||
229 | return 0; | 231 | return 0; |
230 | } | 232 | } |
231 | 233 | ||
232 | void fuse_release_conn(struct fuse_conn *fc) | 234 | static void free_conn(struct fuse_conn *fc) |
233 | { | 235 | { |
236 | while (!list_empty(&fc->unused_list)) { | ||
237 | struct fuse_req *req; | ||
238 | req = list_entry(fc->unused_list.next, struct fuse_req, list); | ||
239 | list_del(&req->list); | ||
240 | fuse_request_free(req); | ||
241 | } | ||
234 | kfree(fc); | 242 | kfree(fc); |
235 | } | 243 | } |
236 | 244 | ||
245 | /* Must be called with the fuse lock held */ | ||
246 | void fuse_release_conn(struct fuse_conn *fc) | ||
247 | { | ||
248 | if (!fc->sb && !fc->file) | ||
249 | free_conn(fc); | ||
250 | } | ||
251 | |||
237 | static struct fuse_conn *new_conn(void) | 252 | static struct fuse_conn *new_conn(void) |
238 | { | 253 | { |
239 | struct fuse_conn *fc; | 254 | struct fuse_conn *fc; |
240 | 255 | ||
241 | fc = kmalloc(sizeof(*fc), GFP_KERNEL); | 256 | fc = kmalloc(sizeof(*fc), GFP_KERNEL); |
242 | if (fc != NULL) { | 257 | if (fc != NULL) { |
258 | int i; | ||
243 | memset(fc, 0, sizeof(*fc)); | 259 | memset(fc, 0, sizeof(*fc)); |
244 | fc->sb = NULL; | 260 | fc->sb = NULL; |
261 | fc->file = NULL; | ||
245 | fc->user_id = 0; | 262 | fc->user_id = 0; |
263 | init_waitqueue_head(&fc->waitq); | ||
264 | INIT_LIST_HEAD(&fc->pending); | ||
265 | INIT_LIST_HEAD(&fc->processing); | ||
266 | INIT_LIST_HEAD(&fc->unused_list); | ||
267 | sema_init(&fc->outstanding_sem, 0); | ||
268 | for (i = 0; i < FUSE_MAX_OUTSTANDING; i++) { | ||
269 | struct fuse_req *req = fuse_request_alloc(); | ||
270 | if (!req) { | ||
271 | free_conn(fc); | ||
272 | return NULL; | ||
273 | } | ||
274 | list_add(&req->list, &fc->unused_list); | ||
275 | } | ||
246 | fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; | 276 | fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; |
247 | fc->bdi.unplug_io_fn = default_unplug_io_fn; | 277 | fc->bdi.unplug_io_fn = default_unplug_io_fn; |
278 | fc->reqctr = 0; | ||
248 | } | 279 | } |
249 | return fc; | 280 | return fc; |
250 | } | 281 | } |
@@ -253,11 +284,20 @@ static struct fuse_conn *get_conn(struct file *file, struct super_block *sb) | |||
253 | { | 284 | { |
254 | struct fuse_conn *fc; | 285 | struct fuse_conn *fc; |
255 | 286 | ||
287 | if (file->f_op != &fuse_dev_operations) | ||
288 | return ERR_PTR(-EINVAL); | ||
256 | fc = new_conn(); | 289 | fc = new_conn(); |
257 | if (fc == NULL) | 290 | if (fc == NULL) |
258 | return NULL; | 291 | return ERR_PTR(-ENOMEM); |
259 | spin_lock(&fuse_lock); | 292 | spin_lock(&fuse_lock); |
260 | fc->sb = sb; | 293 | if (file->private_data) { |
294 | free_conn(fc); | ||
295 | fc = ERR_PTR(-EINVAL); | ||
296 | } else { | ||
297 | file->private_data = fc; | ||
298 | fc->sb = sb; | ||
299 | fc->file = file; | ||
300 | } | ||
261 | spin_unlock(&fuse_lock); | 301 | spin_unlock(&fuse_lock); |
262 | return fc; | 302 | return fc; |
263 | } | 303 | } |
@@ -315,8 +355,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) | |||
315 | 355 | ||
316 | fc = get_conn(file, sb); | 356 | fc = get_conn(file, sb); |
317 | fput(file); | 357 | fput(file); |
318 | if (fc == NULL) | 358 | if (IS_ERR(fc)) |
319 | return -EINVAL; | 359 | return PTR_ERR(fc); |
320 | 360 | ||
321 | fc->user_id = d.user_id; | 361 | fc->user_id = d.user_id; |
322 | 362 | ||
@@ -336,6 +376,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) | |||
336 | iput(root); | 376 | iput(root); |
337 | goto err; | 377 | goto err; |
338 | } | 378 | } |
379 | fuse_send_init(fc); | ||
339 | return 0; | 380 | return 0; |
340 | 381 | ||
341 | err: | 382 | err: |
@@ -411,8 +452,14 @@ static int __init fuse_init(void) | |||
411 | if (res) | 452 | if (res) |
412 | goto err; | 453 | goto err; |
413 | 454 | ||
455 | res = fuse_dev_init(); | ||
456 | if (res) | ||
457 | goto err_fs_cleanup; | ||
458 | |||
414 | return 0; | 459 | return 0; |
415 | 460 | ||
461 | err_fs_cleanup: | ||
462 | fuse_fs_cleanup(); | ||
416 | err: | 463 | err: |
417 | return res; | 464 | return res; |
418 | } | 465 | } |
@@ -422,6 +469,7 @@ static void __exit fuse_exit(void) | |||
422 | printk(KERN_DEBUG "fuse exit\n"); | 469 | printk(KERN_DEBUG "fuse exit\n"); |
423 | 470 | ||
424 | fuse_fs_cleanup(); | 471 | fuse_fs_cleanup(); |
472 | fuse_dev_cleanup(); | ||
425 | } | 473 | } |
426 | 474 | ||
427 | module_init(fuse_init); | 475 | module_init(fuse_init); |
diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 2b1f4ae01e9d..a1aebd7104c4 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h | |||
@@ -11,7 +11,7 @@ | |||
11 | #include <asm/types.h> | 11 | #include <asm/types.h> |
12 | 12 | ||
13 | /** Version number of this interface */ | 13 | /** Version number of this interface */ |
14 | #define FUSE_KERNEL_VERSION 5 | 14 | #define FUSE_KERNEL_VERSION 6 |
15 | 15 | ||
16 | /** Minor version number of this interface */ | 16 | /** Minor version number of this interface */ |
17 | #define FUSE_KERNEL_MINOR_VERSION 1 | 17 | #define FUSE_KERNEL_MINOR_VERSION 1 |
@@ -19,6 +19,12 @@ | |||
19 | /** The node ID of the root inode */ | 19 | /** The node ID of the root inode */ |
20 | #define FUSE_ROOT_ID 1 | 20 | #define FUSE_ROOT_ID 1 |
21 | 21 | ||
22 | /** The major number of the fuse character device */ | ||
23 | #define FUSE_MAJOR 10 | ||
24 | |||
25 | /** The minor number of the fuse character device */ | ||
26 | #define FUSE_MINOR 229 | ||
27 | |||
22 | struct fuse_attr { | 28 | struct fuse_attr { |
23 | __u64 ino; | 29 | __u64 ino; |
24 | __u64 size; | 30 | __u64 size; |
@@ -36,3 +42,31 @@ struct fuse_attr { | |||
36 | __u32 rdev; | 42 | __u32 rdev; |
37 | }; | 43 | }; |
38 | 44 | ||
45 | enum fuse_opcode { | ||
46 | FUSE_INIT = 26 | ||
47 | }; | ||
48 | |||
49 | /* Conservative buffer size for the client */ | ||
50 | #define FUSE_MAX_IN 8192 | ||
51 | |||
52 | struct fuse_init_in_out { | ||
53 | __u32 major; | ||
54 | __u32 minor; | ||
55 | }; | ||
56 | |||
57 | struct fuse_in_header { | ||
58 | __u32 len; | ||
59 | __u32 opcode; | ||
60 | __u64 unique; | ||
61 | __u64 nodeid; | ||
62 | __u32 uid; | ||
63 | __u32 gid; | ||
64 | __u32 pid; | ||
65 | }; | ||
66 | |||
67 | struct fuse_out_header { | ||
68 | __u32 len; | ||
69 | __s32 error; | ||
70 | __u64 unique; | ||
71 | }; | ||
72 | |||