aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/filesystems
diff options
context:
space:
mode:
author: Christoph Hellwig <hch> 2010-12-16 06:04:54 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2010-12-30 13:00:50 -0500
commit: b83be6f20a0e468f715b14225c9f897538dfe5ad (patch)
tree: 30a1f540cdfdbe08245cbea29f170a21bb23b009 /Documentation/filesystems
parent: 4ef9e11d6867f88951e30db910fa015300e31871 (diff)
update Documentation/filesystems/Locking
Mostly inspired by all the recent BKL removal changes, but a lot of older updates also weren't properly recorded.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r-- Documentation/filesystems/Locking 214
1 files changed, 102 insertions, 112 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index b6426f15b4ae..7686e7684495 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -18,7 +18,6 @@ prototypes:
18 char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen); 18 char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen);
19 19
20locking rules: 20locking rules:
21 none have BKL
22 dcache_lock rename_lock ->d_lock may block 21 dcache_lock rename_lock ->d_lock may block
23d_revalidate: no no no yes 22d_revalidate: no no no yes
24d_hash no no no yes 23d_hash no no no yes
@@ -42,18 +41,23 @@ ata *);
42 int (*rename) (struct inode *, struct dentry *, 41 int (*rename) (struct inode *, struct dentry *,
43 struct inode *, struct dentry *); 42 struct inode *, struct dentry *);
44 int (*readlink) (struct dentry *, char __user *,int); 43 int (*readlink) (struct dentry *, char __user *,int);
45 int (*follow_link) (struct dentry *, struct nameidata *); 44 void * (*follow_link) (struct dentry *, struct nameidata *);
45 void (*put_link) (struct dentry *, struct nameidata *, void *);
46 void (*truncate) (struct inode *); 46 void (*truncate) (struct inode *);
47 int (*permission) (struct inode *, int, struct nameidata *); 47 int (*permission) (struct inode *, int, struct nameidata *);
48 int (*check_acl)(struct inode *, int);
48 int (*setattr) (struct dentry *, struct iattr *); 49 int (*setattr) (struct dentry *, struct iattr *);
49 int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); 50 int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
50 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); 51 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
51 ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); 52 ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
52 ssize_t (*listxattr) (struct dentry *, char *, size_t); 53 ssize_t (*listxattr) (struct dentry *, char *, size_t);
53 int (*removexattr) (struct dentry *, const char *); 54 int (*removexattr) (struct dentry *, const char *);
55 void (*truncate_range)(struct inode *, loff_t, loff_t);
56 long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len);
57 int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
54 58
55locking rules: 59locking rules:
56 all may block, none have BKL 60 all may block
57 i_mutex(inode) 61 i_mutex(inode)
58lookup: yes 62lookup: yes
59create: yes 63create: yes
@@ -66,19 +70,24 @@ rmdir: yes (both) (see below)
66rename: yes (all) (see below) 70rename: yes (all) (see below)
67readlink: no 71readlink: no
68follow_link: no 72follow_link: no
73put_link: no
69truncate: yes (see below) 74truncate: yes (see below)
70setattr: yes 75setattr: yes
71permission: no 76permission: no
77check_acl: no
72getattr: no 78getattr: no
73setxattr: yes 79setxattr: yes
74getxattr: no 80getxattr: no
75listxattr: no 81listxattr: no
76removexattr: yes 82removexattr: yes
83truncate_range: yes
84fallocate: no
85fiemap: no
77 Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on 86 Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
78victim. 87victim.
79 cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. 88 cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
80 ->truncate() is never called directly - it's a callback, not a 89 ->truncate() is never called directly - it's a callback, not a
81method. It's called by vmtruncate() - library function normally used by 90method. It's called by vmtruncate() - deprecated library function used by
82->setattr(). Locking information above applies to that call (i.e. is 91->setattr(). Locking information above applies to that call (i.e. is
83inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been 92inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been
84passed). 93passed).
@@ -91,7 +100,7 @@ prototypes:
91 struct inode *(*alloc_inode)(struct super_block *sb); 100 struct inode *(*alloc_inode)(struct super_block *sb);
92 void (*destroy_inode)(struct inode *); 101 void (*destroy_inode)(struct inode *);
93 void (*dirty_inode) (struct inode *); 102 void (*dirty_inode) (struct inode *);
94 int (*write_inode) (struct inode *, int); 103 int (*write_inode) (struct inode *, struct writeback_control *wbc);
95 int (*drop_inode) (struct inode *); 104 int (*drop_inode) (struct inode *);
96 void (*evict_inode) (struct inode *); 105 void (*evict_inode) (struct inode *);
97 void (*put_super) (struct super_block *); 106 void (*put_super) (struct super_block *);
@@ -105,10 +114,11 @@ prototypes:
105 int (*show_options)(struct seq_file *, struct vfsmount *); 114 int (*show_options)(struct seq_file *, struct vfsmount *);
106 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); 115 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
107 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); 116 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
117 int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
118 int (*trim_fs) (struct super_block *, struct fstrim_range *);
108 119
109locking rules: 120locking rules:
110 All may block [not true, see below] 121 All may block [not true, see below]
111 None have BKL
112 s_umount 122 s_umount
113alloc_inode: 123alloc_inode:
114destroy_inode: 124destroy_inode:
@@ -127,6 +137,8 @@ umount_begin: no
127show_options: no (namespace_sem) 137show_options: no (namespace_sem)
128quota_read: no (see below) 138quota_read: no (see below)
129quota_write: no (see below) 139quota_write: no (see below)
140bdev_try_to_free_page: no (see below)
141trim_fs: no
130 142
131->statfs() has s_umount (shared) when called by ustat(2) (native or 143->statfs() has s_umount (shared) when called by ustat(2) (native or
132compat), but that's an accident of bad API; s_umount is used to pin 144compat), but that's an accident of bad API; s_umount is used to pin
@@ -139,19 +151,25 @@ be the only ones operating on the quota file by the quota code (via
139dqio_sem) (unless an admin really wants to screw up something and 151dqio_sem) (unless an admin really wants to screw up something and
140writes to quota files with quotas on). For other details about locking 152writes to quota files with quotas on). For other details about locking
141see also dquot_operations section. 153see also dquot_operations section.
154->bdev_try_to_free_page is called from the ->releasepage handler of
155the block device inode. See there for more details.
142 156
143--------------------------- file_system_type --------------------------- 157--------------------------- file_system_type ---------------------------
144prototypes: 158prototypes:
145 int (*get_sb) (struct file_system_type *, int, 159 int (*get_sb) (struct file_system_type *, int,
146 const char *, void *, struct vfsmount *); 160 const char *, void *, struct vfsmount *);
161 struct dentry *(*mount) (struct file_system_type *, int,
162 const char *, void *);
147 void (*kill_sb) (struct super_block *); 163 void (*kill_sb) (struct super_block *);
148locking rules: 164locking rules:
149 may block BKL 165 may block
150get_sb yes no 166get_sb yes
151kill_sb yes no 167mount yes
168kill_sb yes
152 169
153->get_sb() returns error or 0 with locked superblock attached to the vfsmount 170->get_sb() returns error or 0 with locked superblock attached to the vfsmount
154(exclusive on ->s_umount). 171(exclusive on ->s_umount).
172->mount() returns ERR_PTR or the root dentry.
155->kill_sb() takes a write-locked superblock, does all shutdown work on it, 173->kill_sb() takes a write-locked superblock, does all shutdown work on it,
156unlocks and drops the reference. 174unlocks and drops the reference.
157 175
@@ -176,27 +194,35 @@ prototypes:
176 void (*freepage)(struct page *); 194 void (*freepage)(struct page *);
177 int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, 195 int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
178 loff_t offset, unsigned long nr_segs); 196 loff_t offset, unsigned long nr_segs);
179 int (*launder_page) (struct page *); 197 int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **,
198 unsigned long *);
199 int (*migratepage)(struct address_space *, struct page *, struct page *);
200 int (*launder_page)(struct page *);
201 int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long);
202 int (*error_remove_page)(struct address_space *, struct page *);
180 203
181locking rules: 204locking rules:
182 All except set_page_dirty and freepage may block 205 All except set_page_dirty and freepage may block
183 206
184 BKL PageLocked(page) i_mutex 207 PageLocked(page) i_mutex
185writepage: no yes, unlocks (see below) 208writepage: yes, unlocks (see below)
186readpage: no yes, unlocks 209readpage: yes, unlocks
187sync_page: no maybe 210sync_page: maybe
188writepages: no 211writepages:
189set_page_dirty no no 212set_page_dirty no
190readpages: no 213readpages:
191write_begin: no locks the page yes 214write_begin: locks the page yes
192write_end: no yes, unlocks yes 215write_end: yes, unlocks yes
193perform_write: no n/a yes 216bmap:
194bmap: no 217invalidatepage: yes
195invalidatepage: no yes 218releasepage: yes
196releasepage: no yes 219freepage: yes
197freepage: no yes 220direct_IO:
198direct_IO: no 221get_xip_mem: maybe
199launder_page: no yes 222migratepage: yes (both)
223launder_page: yes
224is_partially_uptodate: yes
225error_remove_page: yes
200 226
201 ->write_begin(), ->write_end(), ->sync_page() and ->readpage() 227 ->write_begin(), ->write_end(), ->sync_page() and ->readpage()
202may be called from the request handler (/dev/loop). 228may be called from the request handler (/dev/loop).
@@ -276,9 +302,8 @@ under spinlock (it cannot block) and is sometimes called with the page
276not locked. 302not locked.
277 303
278 ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some 304 ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some
279filesystems and by the swapper. The latter will eventually go away. All 305filesystems and by the swapper. The latter will eventually go away. Please,
280instances do not actually need the BKL. Please, keep it that way and don't 306keep it that way and don't breed new callers.
281breed new callers.
282 307
283 ->invalidatepage() is called when the filesystem must attempt to drop 308 ->invalidatepage() is called when the filesystem must attempt to drop
284some or all of the buffers from the page when it is being truncated. It 309some or all of the buffers from the page when it is being truncated. It
@@ -299,47 +324,37 @@ cleaned, or an error value if not. Note that in order to prevent the page
299getting mapped back in and redirtied, it needs to be kept locked 324getting mapped back in and redirtied, it needs to be kept locked
300across the entire operation. 325across the entire operation.
301 326
302 Note: currently almost all instances of address_space methods are
303using BKL for internal serialization and that's one of the worst sources
304of contention. Normally they are calling library functions (in fs/buffer.c)
305and pass foo_get_block() as a callback (on local block-based filesystems,
306indeed). BKL is not needed for library stuff and is usually taken by
307foo_get_block(). It's an overkill, since block bitmaps can be protected by
308internal fs locking and real critical areas are much smaller than the areas
309filesystems protect now.
310
311----------------------- file_lock_operations ------------------------------ 327----------------------- file_lock_operations ------------------------------
312prototypes: 328prototypes:
313 void (*fl_insert)(struct file_lock *); /* lock insertion callback */
314 void (*fl_remove)(struct file_lock *); /* lock removal callback */
315 void (*fl_copy_lock)(struct file_lock *, struct file_lock *); 329 void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
316 void (*fl_release_private)(struct file_lock *); 330 void (*fl_release_private)(struct file_lock *);
317 331
318 332
319locking rules: 333locking rules:
320 BKL may block 334 file_lock_lock may block
321fl_insert: yes no 335fl_copy_lock: yes no
322fl_remove: yes no 336fl_release_private: maybe no
323fl_copy_lock: yes no
324fl_release_private: yes yes
325 337
326----------------------- lock_manager_operations --------------------------- 338----------------------- lock_manager_operations ---------------------------
327prototypes: 339prototypes:
328 int (*fl_compare_owner)(struct file_lock *, struct file_lock *); 340 int (*fl_compare_owner)(struct file_lock *, struct file_lock *);
329 void (*fl_notify)(struct file_lock *); /* unblock callback */ 341 void (*fl_notify)(struct file_lock *); /* unblock callback */
342 int (*fl_grant)(struct file_lock *, struct file_lock *, int);
330 void (*fl_release_private)(struct file_lock *); 343 void (*fl_release_private)(struct file_lock *);
331 void (*fl_break)(struct file_lock *); /* break_lease callback */ 344 void (*fl_break)(struct file_lock *); /* break_lease callback */
345 int (*fl_mylease)(struct file_lock *, struct file_lock *);
346 int (*fl_change)(struct file_lock **, int);
332 347
333locking rules: 348locking rules:
334 BKL may block 349 file_lock_lock may block
335fl_compare_owner: yes no 350fl_compare_owner: yes no
336fl_notify: yes no 351fl_notify: yes no
337fl_release_private: yes yes 352fl_grant: no no
338fl_break: yes no 353fl_release_private: maybe no
339 354fl_break: yes no
340 Currently only NFSD and NLM provide instances of this class. None of 355fl_mylease: yes no
341them block. If you have out-of-tree instances - please, show up. Locking 356fl_change yes no
342in that area will change. 357
343--------------------------- buffer_head ----------------------------------- 358--------------------------- buffer_head -----------------------------------
344prototypes: 359prototypes:
345 void (*b_end_io)(struct buffer_head *bh, int uptodate); 360 void (*b_end_io)(struct buffer_head *bh, int uptodate);
@@ -364,17 +379,17 @@ prototypes:
364 void (*swap_slot_free_notify) (struct block_device *, unsigned long); 379 void (*swap_slot_free_notify) (struct block_device *, unsigned long);
365 380
366locking rules: 381locking rules:
367 BKL bd_mutex 382 bd_mutex
368open: no yes 383open: yes
369release: no yes 384release: yes
370ioctl: no no 385ioctl: no
371compat_ioctl: no no 386compat_ioctl: no
372direct_access: no no 387direct_access: no
373media_changed: no no 388media_changed: no
374unlock_native_capacity: no no 389unlock_native_capacity: no
375revalidate_disk: no no 390revalidate_disk: no
376getgeo: no no 391getgeo: no
377swap_slot_free_notify: no no (see below) 392swap_slot_free_notify: no (see below)
378 393
379media_changed, unlock_native_capacity and revalidate_disk are called only from 394media_changed, unlock_native_capacity and revalidate_disk are called only from
380check_disk_change(). 395check_disk_change().
@@ -413,34 +428,21 @@ prototypes:
413 unsigned long (*get_unmapped_area)(struct file *, unsigned long, 428 unsigned long (*get_unmapped_area)(struct file *, unsigned long,
414 unsigned long, unsigned long, unsigned long); 429 unsigned long, unsigned long, unsigned long);
415 int (*check_flags)(int); 430 int (*check_flags)(int);
431 int (*flock) (struct file *, int, struct file_lock *);
432 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *,
433 size_t, unsigned int);
434 ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *,
435 size_t, unsigned int);
436 int (*setlease)(struct file *, long, struct file_lock **);
416}; 437};
417 438
418locking rules: 439locking rules:
419 All may block. 440 All may block except for ->setlease.
420 BKL 441 No VFS locks held on entry except for ->fsync and ->setlease.
421llseek: no (see below) 442
422read: no 443->fsync() has i_mutex on inode.
423aio_read: no 444
424write: no 445->setlease has the file_list_lock held and must not sleep.
425aio_write: no
426readdir: no
427poll: no
428unlocked_ioctl: no
429compat_ioctl: no
430mmap: no
431open: no
432flush: no
433release: no
434fsync: no (see below)
435aio_fsync: no
436fasync: no
437lock: yes
438readv: no
439writev: no
440sendfile: no
441sendpage: no
442get_unmapped_area: no
443check_flags: no
444 446
445->llseek() locking has moved from llseek to the individual llseek 447->llseek() locking has moved from llseek to the individual llseek
446implementations. If your fs is not using generic_file_llseek, you 448implementations. If your fs is not using generic_file_llseek, you
@@ -450,17 +452,10 @@ mutex or just to use i_size_read() instead.
450Note: this does not protect the file->f_pos against concurrent modifications 452Note: this does not protect the file->f_pos against concurrent modifications
451since this is something the userspace has to take care of. 453since this is something the userspace has to take care of.
452 454
453Note: ext2_release() was *the* source of contention on fs-intensive 455->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags.
454loads and dropping BKL on ->release() helps to get rid of that (we still 456Most instances call fasync_helper(), which does that maintenance, so it's
455grab BKL for cases when we close a file that had been opened r/w, but that 457not normally something one needs to worry about. Return values > 0 will be
456can and should be done using the internal locking with smaller critical areas). 458mapped to zero in the VFS layer.
457Current worst offender is ext2_get_block()...
458
459->fasync() is called without BKL protection, and is responsible for
460maintaining the FASYNC bit in filp->f_flags. Most instances call
461fasync_helper(), which does that maintenance, so it's not normally
462something one needs to worry about. Return values > 0 will be mapped to
463zero in the VFS layer.
464 459
465->readdir() and ->ioctl() on directories must be changed. Ideally we would 460->readdir() and ->ioctl() on directories must be changed. Ideally we would
466move ->readdir() to inode_operations and use a separate method for directory 461move ->readdir() to inode_operations and use a separate method for directory
@@ -471,8 +466,6 @@ components. And there are other reasons why the current interface is a mess...
471->read on directories probably must go away - we should just enforce -EISDIR 466->read on directories probably must go away - we should just enforce -EISDIR
472in sys_read() and friends. 467in sys_read() and friends.
473 468
474->fsync() has i_mutex on inode.
475
476--------------------------- dquot_operations ------------------------------- 469--------------------------- dquot_operations -------------------------------
477prototypes: 470prototypes:
478 int (*write_dquot) (struct dquot *); 471 int (*write_dquot) (struct dquot *);
@@ -507,12 +500,12 @@ prototypes:
507 int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); 500 int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
508 501
509locking rules: 502locking rules:
510 BKL mmap_sem PageLocked(page) 503 mmap_sem PageLocked(page)
511open: no yes 504open: yes
512close: no yes 505close: yes
513fault: no yes can return with page locked 506fault: yes can return with page locked
514page_mkwrite: no yes can return with page locked 507page_mkwrite: yes can return with page locked
515access: no yes 508access: yes
516 509
517 ->fault() is called when a previously not present pte is about 510 ->fault() is called when a previously not present pte is about
518to be faulted in. The filesystem must find and return the page associated 511to be faulted in. The filesystem must find and return the page associated
@@ -539,6 +532,3 @@ VM_IO | VM_PFNMAP VMAs.
539 532
540(if you break something or notice that it is broken and do not fix it yourself 533(if you break something or notice that it is broken and do not fix it yourself
541- at least put it here) 534- at least put it here)
542
543ipc/shm.c::shm_delete() - may need BKL.
544->read() and ->write() in many drivers are (probably) missing BKL.