Diffstat (limited to 'fs/namespace.c')
-rw-r--r--  fs/namespace.c  316
1 files changed, 314 insertions, 2 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index 94f026ec990a..678f7ce060f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -17,6 +17,7 @@
 #include <linux/quotaops.h>
 #include <linux/acct.h>
 #include <linux/capability.h>
+#include <linux/cpumask.h>
 #include <linux/module.h>
 #include <linux/sysfs.h>
 #include <linux/seq_file.h>
@@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
         return tmp & (HASH_SIZE - 1);
 }
 
+#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
+
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
         struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
         INIT_LIST_HEAD(&mnt->mnt_share);
         INIT_LIST_HEAD(&mnt->mnt_slave_list);
         INIT_LIST_HEAD(&mnt->mnt_slave);
+        atomic_set(&mnt->__mnt_writers, 0);
         if (name) {
                 int size = strlen(name) + 1;
                 char *newname = kmalloc(size, GFP_KERNEL);
@@ -80,6 +84,263 @@ struct vfsmount *alloc_vfsmnt(const char *name)
         return mnt;
 }
 
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
+/*
+ * __mnt_is_readonly: check whether a mount is read-only
+ * @mnt: the mount to check for its write status
+ *
+ * This shouldn't be used directly outside of the VFS.
+ * It does not guarantee that the filesystem will stay
+ * r/w, just that it is right *now*. This can not and
+ * should not be used in place of IS_RDONLY(inode).
+ * mnt_want/drop_write() will _keep_ the filesystem
+ * r/w.
+ */
+int __mnt_is_readonly(struct vfsmount *mnt)
+{
+        if (mnt->mnt_flags & MNT_READONLY)
+                return 1;
+        if (mnt->mnt_sb->s_flags & MS_RDONLY)
+                return 1;
+        return 0;
+}
+EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+
+struct mnt_writer {
+        /*
+         * If holding multiple instances of this lock, they
+         * must be ordered by cpu number.
+         */
+        spinlock_t lock;
+        struct lock_class_key lock_class; /* compiles out with !lockdep */
+        unsigned long count;
+        struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+
+static int __init init_mnt_writers(void)
+{
+        int cpu;
+        for_each_possible_cpu(cpu) {
+                struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+                spin_lock_init(&writer->lock);
+                lockdep_set_class(&writer->lock, &writer->lock_class);
+                writer->count = 0;
+        }
+        return 0;
+}
+fs_initcall(init_mnt_writers);
+
+static void unlock_mnt_writers(void)
+{
+        int cpu;
+        struct mnt_writer *cpu_writer;
+
+        for_each_possible_cpu(cpu) {
+                cpu_writer = &per_cpu(mnt_writers, cpu);
+                spin_unlock(&cpu_writer->lock);
+        }
+}
+
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+        if (!cpu_writer->mnt)
+                return;
+        /*
+         * This is in case anyone ever leaves an invalid,
+         * old ->mnt and a count of 0.
+         */
+        if (!cpu_writer->count)
+                return;
+        atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+        cpu_writer->count = 0;
+}
+/*
+ * must hold cpu_writer->lock
+ */
+static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
+                                            struct vfsmount *mnt)
+{
+        if (cpu_writer->mnt == mnt)
+                return;
+        __clear_mnt_count(cpu_writer);
+        cpu_writer->mnt = mnt;
+}
+
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
+/**
+ * mnt_want_write - get write access to a mount
+ * @mnt: the mount on which to take a write
+ *
+ * This tells the low-level filesystem that a write is
+ * about to be performed to it, and makes sure that
+ * writes are allowed before returning success. When
+ * the write operation is finished, mnt_drop_write()
+ * must be called. This is effectively a refcount.
+ */
+int mnt_want_write(struct vfsmount *mnt)
+{
+        int ret = 0;
+        struct mnt_writer *cpu_writer;
+
+        cpu_writer = &get_cpu_var(mnt_writers);
+        spin_lock(&cpu_writer->lock);
+        if (__mnt_is_readonly(mnt)) {
+                ret = -EROFS;
+                goto out;
+        }
+        use_cpu_writer_for_mount(cpu_writer, mnt);
+        cpu_writer->count++;
+out:
+        spin_unlock(&cpu_writer->lock);
+        put_cpu_var(mnt_writers);
+        return ret;
+}
+EXPORT_SYMBOL_GPL(mnt_want_write);
+
+static void lock_mnt_writers(void)
+{
+        int cpu;
+        struct mnt_writer *cpu_writer;
+
+        for_each_possible_cpu(cpu) {
+                cpu_writer = &per_cpu(mnt_writers, cpu);
+                spin_lock(&cpu_writer->lock);
+                __clear_mnt_count(cpu_writer);
+                cpu_writer->mnt = NULL;
+        }
+}
+
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count. Make sure it
+ * does not get too far out of whack.
+ */
+static void handle_write_count_underflow(struct vfsmount *mnt)
+{
+        if (atomic_read(&mnt->__mnt_writers) >=
+            MNT_WRITER_UNDERFLOW_LIMIT)
+                return;
+        /*
+         * It isn't necessary to hold all of the locks
+         * at the same time, but doing it this way makes
+         * us share a lot more code.
+         */
+        lock_mnt_writers();
+        /*
+         * vfsmount_lock is for mnt_flags.
+         */
+        spin_lock(&vfsmount_lock);
+        /*
+         * If coalescing the per-cpu writer counts did not
+         * get us back to a positive writer count, we have
+         * a bug.
+         */
+        if ((atomic_read(&mnt->__mnt_writers) < 0) &&
+            !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
+                printk(KERN_DEBUG "leak detected on mount(%p) writers "
+                                "count: %d\n",
+                        mnt, atomic_read(&mnt->__mnt_writers));
+                WARN_ON(1);
+                /* use the flag to keep the dmesg spam down */
+                mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
+        }
+        spin_unlock(&vfsmount_lock);
+        unlock_mnt_writers();
+}
+
+/**
+ * mnt_drop_write - give up write access to a mount
+ * @mnt: the mount on which to give up write access
+ *
+ * Tells the low-level filesystem that we are done
+ * performing writes to it.  Must be matched with
+ * mnt_want_write() call above.
+ */
+void mnt_drop_write(struct vfsmount *mnt)
+{
+        int must_check_underflow = 0;
+        struct mnt_writer *cpu_writer;
+
+        cpu_writer = &get_cpu_var(mnt_writers);
+        spin_lock(&cpu_writer->lock);
+
+        use_cpu_writer_for_mount(cpu_writer, mnt);
+        if (cpu_writer->count > 0) {
+                cpu_writer->count--;
+        } else {
+                must_check_underflow = 1;
+                atomic_dec(&mnt->__mnt_writers);
+        }
+
+        spin_unlock(&cpu_writer->lock);
+        /*
+         * Logically, we could call this each time,
+         * but the __mnt_writers cacheline tends to
+         * be cold, and makes this expensive.
+         */
+        if (must_check_underflow)
+                handle_write_count_underflow(mnt);
+        /*
+         * This could be done right after the spinlock
+         * is taken because the spinlock keeps us on
+         * the cpu, and disables preemption.  However,
+         * putting it here bounds the amount that
+         * __mnt_writers can underflow.  Without it,
+         * we could theoretically wrap __mnt_writers.
+         */
+        put_cpu_var(mnt_writers);
+}
+EXPORT_SYMBOL_GPL(mnt_drop_write);
+
+static int mnt_make_readonly(struct vfsmount *mnt)
+{
+        int ret = 0;
+
+        lock_mnt_writers();
+        /*
+         * With all the locks held, this value is stable
+         */
+        if (atomic_read(&mnt->__mnt_writers) > 0) {
+                ret = -EBUSY;
+                goto out;
+        }
+        /*
+         * nobody can do a successful mnt_want_write() with all
+         * of the counts in MNT_DENIED_WRITE and the locks held.
+         */
+        spin_lock(&vfsmount_lock);
+        if (!ret)
+                mnt->mnt_flags |= MNT_READONLY;
+        spin_unlock(&vfsmount_lock);
+out:
+        unlock_mnt_writers();
+        return ret;
+}
+
+static void __mnt_unmake_readonly(struct vfsmount *mnt)
+{
+        spin_lock(&vfsmount_lock);
+        mnt->mnt_flags &= ~MNT_READONLY;
+        spin_unlock(&vfsmount_lock);
+}
+
 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
         mnt->mnt_sb = sb;
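[ The patch itself does not show any callers of the new helpers. As an illustration of the calling convention described in the mnt_want_write()/mnt_drop_write() kerneldoc above, a write path is expected to bracket its work roughly as in the sketch below; the wrapper name example_touch_inode() is hypothetical and mark_inode_dirty() is only a stand-in body, while the two helpers and the -EROFS return convention come from the code above. ]

#include <linux/fs.h>
#include <linux/mount.h>

static int example_touch_inode(struct vfsmount *mnt, struct inode *inode)
{
        int err;

        /* pin the mount r/w; fails with -EROFS if the mount or sb is read-only */
        err = mnt_want_write(mnt);
        if (err)
                return err;

        /* ... the actual modification happens while the write ref is held ... */
        mark_inode_dirty(inode);

        /* release the write ref so a later remount-r/o can succeed again */
        mnt_drop_write(mnt);
        return 0;
}

[ Each successful mnt_want_write() bumps a per-cpu count for the mount, and mnt_drop_write() undoes it, so mnt_make_readonly() can refuse with -EBUSY while any such reference is outstanding. ]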
@@ -271,7 +532,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
+        int cpu;
         struct super_block *sb = mnt->mnt_sb;
+        /*
+         * We don't have to hold all of the locks at the
+         * same time here because we know that we're the
+         * last reference to mnt and that no new writers
+         * can come in.
+         */
+        for_each_possible_cpu(cpu) {
+                struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
+                if (cpu_writer->mnt != mnt)
+                        continue;
+                spin_lock(&cpu_writer->lock);
+                atomic_add(cpu_writer->count, &mnt->__mnt_writers);
+                cpu_writer->count = 0;
+                /*
+                 * Might as well do this so that no one
+                 * ever sees the pointer and expects
+                 * it to be valid.
+                 */
+                cpu_writer->mnt = NULL;
+                spin_unlock(&cpu_writer->lock);
+        }
+        /*
+         * This probably indicates that somebody messed
+         * up a mnt_want/drop_write() pair.  If this
+         * happens, the filesystem was probably unable
+         * to make r/w->r/o transitions.
+         */
+        WARN_ON(atomic_read(&mnt->__mnt_writers));
         dput(mnt->mnt_root);
         free_vfsmnt(mnt);
         deactivate_super(sb);
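[ Both __mntput() above and handle_write_count_underflow() earlier rely on the same idea: per-cpu deltas that are only folded into the shared __mnt_writers atomic when the total is actually needed. The user-space C11 analogue below is purely illustrative of that fold step (single-threaded, no per-shard locks); nothing in it is kernel API. ]

#include <stdatomic.h>
#include <stdio.h>

#define NSHARDS 4

static long shard_delta[NSHARDS];       /* stands in for the per-cpu counts */
static atomic_long writers;             /* stands in for __mnt_writers */

static void fold_shards(void)
{
        /* fold every shard's delta into the shared total, then reset it */
        for (int i = 0; i < NSHARDS; i++) {
                atomic_fetch_add(&writers, shard_delta[i]);
                shard_delta[i] = 0;
        }
}

int main(void)
{
        shard_delta[0] += 2;    /* two writers started on shard 0 */
        shard_delta[3] -= 1;    /* one of them finished on shard 3 */
        fold_shards();
        printf("outstanding writers: %ld\n", atomic_load(&writers));
        return 0;
}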
@@ -417,7 +707,7 @@ static int show_vfsmnt(struct seq_file *m, void *v)
                 seq_putc(m, '.');
                 mangle(m, mnt->mnt_sb->s_subtype);
         }
-        seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw");
+        seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
         for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
                 if (mnt->mnt_sb->s_flags & fs_infop->flag)
                         seq_puts(m, fs_infop->str);
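[ With show_vfsmnt() now calling __mnt_is_readonly(), the ro/rw field in /proc/mounts reflects the per-mount MNT_READONLY flag and not just the superblock's MS_RDONLY. A small user-space sketch, not part of the patch, that lists each mount's effective writability: ]

#include <mntent.h>
#include <stdio.h>

int main(void)
{
        struct mntent *m;
        FILE *f = setmntent("/proc/mounts", "r");

        if (!f)
                return 1;
        /* hasmntopt() spots the "ro" token whether it came from the sb or the mount */
        while ((m = getmntent(f)) != NULL)
                printf("%-30s %s\n", m->mnt_dir,
                       hasmntopt(m, "ro") ? "read-only" : "read-write");
        endmntent(f);
        return 0;
}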
@@ -1019,6 +1309,23 @@ out:
         return err;
 }
 
+static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
+{
+        int error = 0;
+        int readonly_request = 0;
+
+        if (ms_flags & MS_RDONLY)
+                readonly_request = 1;
+        if (readonly_request == __mnt_is_readonly(mnt))
+                return 0;
+
+        if (readonly_request)
+                error = mnt_make_readonly(mnt);
+        else
+                __mnt_unmake_readonly(mnt);
+        return error;
+}
+
 /*
  * change filesystem flags. dir should be a physical root of filesystem.
  * If you've mounted a non-root directory somewhere and want to do remount
@@ -1041,7 +1348,10 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
                 return -EINVAL;
 
         down_write(&sb->s_umount);
-        err = do_remount_sb(sb, flags, data, 0);
+        if (flags & MS_BIND)
+                err = change_mount_flags(nd->path.mnt, flags);
+        else
+                err = do_remount_sb(sb, flags, data, 0);
         if (!err)
                 nd->path.mnt->mnt_flags = mnt_flags;
         up_write(&sb->s_umount);
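[ The MS_BIND branch in do_remount() is what lets user space flip an individual bind mount read-only via change_mount_flags() without touching the underlying superblock. A hedged example of the mount(2) call this enables; the path is a placeholder for an already-created bind mount: ]

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mount.h>

int main(void)
{
        /* "/mnt/ro-view" is a placeholder for an existing bind mount */
        if (mount(NULL, "/mnt/ro-view", NULL,
                  MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) == -1) {
                /* EBUSY here means mnt_make_readonly() saw writers in flight */
                fprintf(stderr, "remount: %s\n", strerror(errno));
                return 1;
        }
        return 0;
}

[ If mnt_want_write() holders exist when this runs, mnt_make_readonly() returns -EBUSY and the mount stays read-write. ]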
@@ -1425,6 +1735,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
                 mnt_flags |= MNT_NODIRATIME;
         if (flags & MS_RELATIME)
                 mnt_flags |= MNT_RELATIME;
+        if (flags & MS_RDONLY)
+                mnt_flags |= MNT_READONLY;
 
         flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
                    MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);