about summary refs log tree commit diff stats
path: root/Documentation/filesystems/Locking
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/filesystems/Locking')
-rw-r--r-- Documentation/filesystems/Locking 212
1 files changed, 100 insertions, 112 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index b6426f15b4ae..33fa3e5d38fd 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -18,7 +18,6 @@ prototypes:
18 char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen); 18 char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen);
19 19
20locking rules: 20locking rules:
21 none have BKL
22 dcache_lock rename_lock ->d_lock may block 21 dcache_lock rename_lock ->d_lock may block
23d_revalidate: no no no yes 22d_revalidate: no no no yes
24d_hash: no no no yes 23d_hash: no no no yes
@@ -42,18 +41,23 @@ ata *);
42 int (*rename) (struct inode *, struct dentry *, 41 int (*rename) (struct inode *, struct dentry *,
43 struct inode *, struct dentry *); 42 struct inode *, struct dentry *);
44 int (*readlink) (struct dentry *, char __user *,int); 43 int (*readlink) (struct dentry *, char __user *,int);
45 int (*follow_link) (struct dentry *, struct nameidata *); 44 void * (*follow_link) (struct dentry *, struct nameidata *);
45 void (*put_link) (struct dentry *, struct nameidata *, void *);
46 void (*truncate) (struct inode *); 46 void (*truncate) (struct inode *);
47 int (*permission) (struct inode *, int, struct nameidata *); 47 int (*permission) (struct inode *, int, struct nameidata *);
48 int (*check_acl)(struct inode *, int);
48 int (*setattr) (struct dentry *, struct iattr *); 49 int (*setattr) (struct dentry *, struct iattr *);
49 int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); 50 int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
50 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); 51 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
51 ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); 52 ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
52 ssize_t (*listxattr) (struct dentry *, char *, size_t); 53 ssize_t (*listxattr) (struct dentry *, char *, size_t);
53 int (*removexattr) (struct dentry *, const char *); 54 int (*removexattr) (struct dentry *, const char *);
55 void (*truncate_range)(struct inode *, loff_t, loff_t);
56 long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len);
57 int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
54 58
55locking rules: 59locking rules:
56 all may block, none have BKL 60 all may block
57 i_mutex(inode) 61 i_mutex(inode)
58lookup: yes 62lookup: yes
59create: yes 63create: yes
@@ -66,19 +70,24 @@ rmdir: yes (both) (see below)
66rename: yes (all) (see below) 70rename: yes (all) (see below)
67readlink: no 71readlink: no
68follow_link: no 72follow_link: no
73put_link: no
69truncate: yes (see below) 74truncate: yes (see below)
70setattr: yes 75setattr: yes
71permission: no 76permission: no
77check_acl: no
72getattr: no 78getattr: no
73setxattr: yes 79setxattr: yes
74getxattr: no 80getxattr: no
75listxattr: no 81listxattr: no
76removexattr: yes 82removexattr: yes
83truncate_range: yes
84fallocate: no
85fiemap: no
77 Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on 86 Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
78victim. 87victim.
79 cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. 88 cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
80 ->truncate() is never called directly - it's a callback, not a 89 ->truncate() is never called directly - it's a callback, not a
81method. It's called by vmtruncate() - library function normally used by 90method. It's called by vmtruncate() - deprecated library function used by
82->setattr(). Locking information above applies to that call (i.e. is 91->setattr(). Locking information above applies to that call (i.e. is
83inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been 92inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been
84passed). 93passed).
@@ -91,7 +100,7 @@ prototypes:
91 struct inode *(*alloc_inode)(struct super_block *sb); 100 struct inode *(*alloc_inode)(struct super_block *sb);
92 void (*destroy_inode)(struct inode *); 101 void (*destroy_inode)(struct inode *);
93 void (*dirty_inode) (struct inode *); 102 void (*dirty_inode) (struct inode *);
94 int (*write_inode) (struct inode *, int); 103 int (*write_inode) (struct inode *, struct writeback_control *wbc);
95 int (*drop_inode) (struct inode *); 104 int (*drop_inode) (struct inode *);
96 void (*evict_inode) (struct inode *); 105 void (*evict_inode) (struct inode *);
97 void (*put_super) (struct super_block *); 106 void (*put_super) (struct super_block *);
@@ -105,10 +114,10 @@ prototypes:
105 int (*show_options)(struct seq_file *, struct vfsmount *); 114 int (*show_options)(struct seq_file *, struct vfsmount *);
106 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); 115 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
107 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); 116 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
117 int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
108 118
109locking rules: 119locking rules:
110 All may block [not true, see below] 120 All may block [not true, see below]
111 None have BKL
112 s_umount 121 s_umount
113alloc_inode: 122alloc_inode:
114destroy_inode: 123destroy_inode:
@@ -127,6 +136,7 @@ umount_begin: no
127show_options: no (namespace_sem) 136show_options: no (namespace_sem)
128quota_read: no (see below) 137quota_read: no (see below)
129quota_write: no (see below) 138quota_write: no (see below)
139bdev_try_to_free_page: no (see below)
130 140
131->statfs() has s_umount (shared) when called by ustat(2) (native or 141->statfs() has s_umount (shared) when called by ustat(2) (native or
132compat), but that's an accident of bad API; s_umount is used to pin 142compat), but that's an accident of bad API; s_umount is used to pin
@@ -139,19 +149,25 @@ be the only ones operating on the quota file by the quota code (via
139dqio_sem) (unless an admin really wants to screw up something and 149dqio_sem) (unless an admin really wants to screw up something and
140writes to quota files with quotas on). For other details about locking 150writes to quota files with quotas on). For other details about locking
141see also dquot_operations section. 151see also dquot_operations section.
152->bdev_try_to_free_page is called from the ->releasepage handler of
153the block device inode. See there for more details.
142 154
143--------------------------- file_system_type --------------------------- 155--------------------------- file_system_type ---------------------------
144prototypes: 156prototypes:
145 int (*get_sb) (struct file_system_type *, int, 157 int (*get_sb) (struct file_system_type *, int,
146 const char *, void *, struct vfsmount *); 158 const char *, void *, struct vfsmount *);
159 struct dentry *(*mount) (struct file_system_type *, int,
160 const char *, void *);
147 void (*kill_sb) (struct super_block *); 161 void (*kill_sb) (struct super_block *);
148locking rules: 162locking rules:
149 may block BKL 163 may block
150get_sb yes no 164get_sb yes
151kill_sb yes no 165mount yes
166kill_sb yes
152 167
153->get_sb() returns error or 0 with locked superblock attached to the vfsmount 168->get_sb() returns error or 0 with locked superblock attached to the vfsmount
154(exclusive on ->s_umount). 169(exclusive on ->s_umount).
170->mount() returns ERR_PTR or the root dentry.
155->kill_sb() takes a write-locked superblock, does all shutdown work on it, 171->kill_sb() takes a write-locked superblock, does all shutdown work on it,
156unlocks and drops the reference. 172unlocks and drops the reference.
157 173
@@ -176,27 +192,35 @@ prototypes:
176 void (*freepage)(struct page *); 192 void (*freepage)(struct page *);
177 int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, 193 int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
178 loff_t offset, unsigned long nr_segs); 194 loff_t offset, unsigned long nr_segs);
179 int (*launder_page) (struct page *); 195 int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **,
196 unsigned long *);
197 int (*migratepage)(struct address_space *, struct page *, struct page *);
198 int (*launder_page)(struct page *);
199 int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long);
200 int (*error_remove_page)(struct address_space *, struct page *);
180 201
181locking rules: 202locking rules:
182 All except set_page_dirty and freepage may block 203 All except set_page_dirty and freepage may block
183 204
184 BKL PageLocked(page) i_mutex 205 PageLocked(page) i_mutex
185writepage: no yes, unlocks (see below) 206writepage: yes, unlocks (see below)
186readpage: no yes, unlocks 207readpage: yes, unlocks
187sync_page: no maybe 208sync_page: maybe
188writepages: no 209writepages:
189set_page_dirty: no no 210set_page_dirty: no
190readpages: no 211readpages:
191write_begin: no locks the page yes 212write_begin: locks the page yes
192write_end: no yes, unlocks yes 213write_end: yes, unlocks yes
193perform_write: no n/a yes 214bmap:
194bmap: no 215invalidatepage: yes
195invalidatepage: no yes 216releasepage: yes
196releasepage: no yes 217freepage: yes
197freepage: no yes 218direct_IO:
198direct_IO: no 219get_xip_mem: maybe
199launder_page: no yes 220migratepage: yes (both)
221launder_page: yes
222is_partially_uptodate: yes
223error_remove_page: yes
200 224
201 ->write_begin(), ->write_end(), ->sync_page() and ->readpage() 225 ->write_begin(), ->write_end(), ->sync_page() and ->readpage()
202may be called from the request handler (/dev/loop). 226may be called from the request handler (/dev/loop).
@@ -276,9 +300,8 @@ under spinlock (it cannot block) and is sometimes called with the page
276not locked. 300not locked.
277 301
278 ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some 302 ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some
279filesystems and by the swapper. The latter will eventually go away. All 303filesystems and by the swapper. The latter will eventually go away. Please,
280instances do not actually need the BKL. Please, keep it that way and don't 304keep it that way and don't breed new callers.
281breed new callers.
282 305
283 ->invalidatepage() is called when the filesystem must attempt to drop 306 ->invalidatepage() is called when the filesystem must attempt to drop
284some or all of the buffers from the page when it is being truncated. It 307some or all of the buffers from the page when it is being truncated. It
@@ -299,47 +322,37 @@ cleaned, or an error value if not. Note that in order to prevent the page
299getting mapped back in and redirtied, it needs to be kept locked 322getting mapped back in and redirtied, it needs to be kept locked
300across the entire operation. 323across the entire operation.
301 324
302 Note: currently almost all instances of address_space methods are
303using BKL for internal serialization and that's one of the worst sources
304of contention. Normally they are calling library functions (in fs/buffer.c)
305and pass foo_get_block() as a callback (on local block-based filesystems,
306indeed). BKL is not needed for library stuff and is usually taken by
307foo_get_block(). It's an overkill, since block bitmaps can be protected by
308internal fs locking and real critical areas are much smaller than the areas
309filesystems protect now.
310
311----------------------- file_lock_operations ------------------------------ 325----------------------- file_lock_operations ------------------------------
312prototypes: 326prototypes:
313 void (*fl_insert)(struct file_lock *); /* lock insertion callback */
314 void (*fl_remove)(struct file_lock *); /* lock removal callback */
315 void (*fl_copy_lock)(struct file_lock *, struct file_lock *); 327 void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
316 void (*fl_release_private)(struct file_lock *); 328 void (*fl_release_private)(struct file_lock *);
317 329
318 330
319locking rules: 331locking rules:
320 BKL may block 332 file_lock_lock may block
321fl_insert: yes no 333fl_copy_lock: yes no
322fl_remove: yes no 334fl_release_private: maybe no
323fl_copy_lock: yes no
324fl_release_private: yes yes
325 335
326----------------------- lock_manager_operations --------------------------- 336----------------------- lock_manager_operations ---------------------------
327prototypes: 337prototypes:
328 int (*fl_compare_owner)(struct file_lock *, struct file_lock *); 338 int (*fl_compare_owner)(struct file_lock *, struct file_lock *);
329 void (*fl_notify)(struct file_lock *); /* unblock callback */ 339 void (*fl_notify)(struct file_lock *); /* unblock callback */
340 int (*fl_grant)(struct file_lock *, struct file_lock *, int);
330 void (*fl_release_private)(struct file_lock *); 341 void (*fl_release_private)(struct file_lock *);
331 void (*fl_break)(struct file_lock *); /* break_lease callback */ 342 void (*fl_break)(struct file_lock *); /* break_lease callback */
343 int (*fl_mylease)(struct file_lock *, struct file_lock *);
344 int (*fl_change)(struct file_lock **, int);
332 345
333locking rules: 346locking rules:
334 BKL may block 347 file_lock_lock may block
335fl_compare_owner: yes no 348fl_compare_owner: yes no
336fl_notify: yes no 349fl_notify: yes no
337fl_release_private: yes yes 350fl_grant: no no
338fl_break: yes no 351fl_release_private: maybe no
339 352fl_break: yes no
340 Currently only NFSD and NLM provide instances of this class. None of 353fl_mylease: yes no
341them block. If you have out-of-tree instances - please, show up. Locking 354fl_change: yes no
342in that area will change. 355
343--------------------------- buffer_head ----------------------------------- 356--------------------------- buffer_head -----------------------------------
344prototypes: 357prototypes:
345 void (*b_end_io)(struct buffer_head *bh, int uptodate); 358 void (*b_end_io)(struct buffer_head *bh, int uptodate);
@@ -364,17 +377,17 @@ prototypes:
364 void (*swap_slot_free_notify) (struct block_device *, unsigned long); 377 void (*swap_slot_free_notify) (struct block_device *, unsigned long);
365 378
366locking rules: 379locking rules:
367 BKL bd_mutex 380 bd_mutex
368open: no yes 381open: yes
369release: no yes 382release: yes
370ioctl: no no 383ioctl: no
371compat_ioctl: no no 384compat_ioctl: no
372direct_access: no no 385direct_access: no
373media_changed: no no 386media_changed: no
374unlock_native_capacity: no no 387unlock_native_capacity: no
375revalidate_disk: no no 388revalidate_disk: no
376getgeo: no no 389getgeo: no
377swap_slot_free_notify: no no (see below) 390swap_slot_free_notify: no (see below)
378 391
379media_changed, unlock_native_capacity and revalidate_disk are called only from 392media_changed, unlock_native_capacity and revalidate_disk are called only from
380check_disk_change(). 393check_disk_change().
@@ -413,34 +426,21 @@ prototypes:
413 unsigned long (*get_unmapped_area)(struct file *, unsigned long, 426 unsigned long (*get_unmapped_area)(struct file *, unsigned long,
414 unsigned long, unsigned long, unsigned long); 427 unsigned long, unsigned long, unsigned long);
415 int (*check_flags)(int); 428 int (*check_flags)(int);
429 int (*flock) (struct file *, int, struct file_lock *);
430 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *,
431 size_t, unsigned int);
432 ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *,
433 size_t, unsigned int);
434 int (*setlease)(struct file *, long, struct file_lock **);
416}; 435};
417 436
418locking rules: 437locking rules:
419 All may block. 438 All may block except for ->setlease.
420 BKL 439 No VFS locks held on entry except for ->fsync and ->setlease.
421llseek: no (see below) 440
422read: no 441->fsync() has i_mutex on inode.
423aio_read: no 442
424write: no 443->setlease has the file_list_lock held and must not sleep.
425aio_write: no
426readdir: no
427poll: no
428unlocked_ioctl: no
429compat_ioctl: no
430mmap: no
431open: no
432flush: no
433release: no
434fsync: no (see below)
435aio_fsync: no
436fasync: no
437lock: yes
438readv: no
439writev: no
440sendfile: no
441sendpage: no
442get_unmapped_area: no
443check_flags: no
444 444
445->llseek() locking has moved from llseek to the individual llseek 445->llseek() locking has moved from llseek to the individual llseek
446implementations. If your fs is not using generic_file_llseek, you 446implementations. If your fs is not using generic_file_llseek, you
@@ -450,17 +450,10 @@ mutex or just to use i_size_read() instead.
450Note: this does not protect the file->f_pos against concurrent modifications 450Note: this does not protect the file->f_pos against concurrent modifications
451since this is something the userspace has to take care about. 451since this is something the userspace has to take care about.
452 452
453Note: ext2_release() was *the* source of contention on fs-intensive 453->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags.
454loads and dropping BKL on ->release() helps to get rid of that (we still 454Most instances call fasync_helper(), which does that maintenance, so it's
455grab BKL for cases when we close a file that had been opened r/w, but that 455not normally something one needs to worry about. Return values > 0 will be
456can and should be done using the internal locking with smaller critical areas). 456mapped to zero in the VFS layer.
457Current worst offender is ext2_get_block()...
458
459->fasync() is called without BKL protection, and is responsible for
460maintaining the FASYNC bit in filp->f_flags. Most instances call
461fasync_helper(), which does that maintenance, so it's not normally
462something one needs to worry about. Return values > 0 will be mapped to
463zero in the VFS layer.
464 457
465->readdir() and ->ioctl() on directories must be changed. Ideally we would 458->readdir() and ->ioctl() on directories must be changed. Ideally we would
466move ->readdir() to inode_operations and use a separate method for directory 459move ->readdir() to inode_operations and use a separate method for directory
@@ -471,8 +464,6 @@ components. And there are other reasons why the current interface is a mess...
471->read on directories probably must go away - we should just enforce -EISDIR 464->read on directories probably must go away - we should just enforce -EISDIR
472in sys_read() and friends. 465in sys_read() and friends.
473 466
474->fsync() has i_mutex on inode.
475
476--------------------------- dquot_operations ------------------------------- 467--------------------------- dquot_operations -------------------------------
477prototypes: 468prototypes:
478 int (*write_dquot) (struct dquot *); 469 int (*write_dquot) (struct dquot *);
@@ -507,12 +498,12 @@ prototypes:
507 int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); 498 int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
508 499
509locking rules: 500locking rules:
510 BKL mmap_sem PageLocked(page) 501 mmap_sem PageLocked(page)
511open: no yes 502open: yes
512close: no yes 503close: yes
513fault: no yes can return with page locked 504fault: yes can return with page locked
514page_mkwrite: no yes can return with page locked 505page_mkwrite: yes can return with page locked
515access: no yes 506access: yes
516 507
517 ->fault() is called when a previously not present pte is about 508 ->fault() is called when a previously not present pte is about
518to be faulted in. The filesystem must find and return the page associated 509to be faulted in. The filesystem must find and return the page associated
@@ -539,6 +530,3 @@ VM_IO | VM_PFNMAP VMAs.
539 530
540(if you break something or notice that it is broken and do not fix it yourself 531(if you break something or notice that it is broken and do not fix it yourself
541- at least put it here) 532- at least put it here)
542
543ipc/shm.c::shm_delete() - may need BKL.
544->read() and ->write() in many drivers are (probably) missing BKL.