diff options
Diffstat (limited to 'Documentation/filesystems/Locking')
-rw-r--r-- | Documentation/filesystems/Locking | 212 |
1 files changed, 100 insertions, 112 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index b6426f15b4ae..33fa3e5d38fd 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -18,7 +18,6 @@ prototypes: | |||
18 | char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen); | 18 | char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen); |
19 | 19 | ||
20 | locking rules: | 20 | locking rules: |
21 | none have BKL | ||
22 | dcache_lock rename_lock ->d_lock may block | 21 | dcache_lock rename_lock ->d_lock may block |
23 | d_revalidate: no no no yes | 22 | d_revalidate: no no no yes |
24 | d_hash no no no yes | 23 | d_hash no no no yes |
@@ -42,18 +41,23 @@ ata *); | |||
42 | int (*rename) (struct inode *, struct dentry *, | 41 | int (*rename) (struct inode *, struct dentry *, |
43 | struct inode *, struct dentry *); | 42 | struct inode *, struct dentry *); |
44 | int (*readlink) (struct dentry *, char __user *,int); | 43 | int (*readlink) (struct dentry *, char __user *,int); |
45 | int (*follow_link) (struct dentry *, struct nameidata *); | 44 | void * (*follow_link) (struct dentry *, struct nameidata *); |
45 | void (*put_link) (struct dentry *, struct nameidata *, void *); | ||
46 | void (*truncate) (struct inode *); | 46 | void (*truncate) (struct inode *); |
47 | int (*permission) (struct inode *, int, struct nameidata *); | 47 | int (*permission) (struct inode *, int, struct nameidata *); |
48 | int (*check_acl)(struct inode *, int); | ||
48 | int (*setattr) (struct dentry *, struct iattr *); | 49 | int (*setattr) (struct dentry *, struct iattr *); |
49 | int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); | 50 | int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); |
50 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); | 51 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); |
51 | ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); | 52 | ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); |
52 | ssize_t (*listxattr) (struct dentry *, char *, size_t); | 53 | ssize_t (*listxattr) (struct dentry *, char *, size_t); |
53 | int (*removexattr) (struct dentry *, const char *); | 54 | int (*removexattr) (struct dentry *, const char *); |
55 | void (*truncate_range)(struct inode *, loff_t, loff_t); | ||
56 | long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len); | ||
57 | int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); | ||
54 | 58 | ||
55 | locking rules: | 59 | locking rules: |
56 | all may block, none have BKL | 60 | all may block |
57 | i_mutex(inode) | 61 | i_mutex(inode) |
58 | lookup: yes | 62 | lookup: yes |
59 | create: yes | 63 | create: yes |
@@ -66,19 +70,24 @@ rmdir: yes (both) (see below) | |||
66 | rename: yes (all) (see below) | 70 | rename: yes (all) (see below) |
67 | readlink: no | 71 | readlink: no |
68 | follow_link: no | 72 | follow_link: no |
73 | put_link: no | ||
69 | truncate: yes (see below) | 74 | truncate: yes (see below) |
70 | setattr: yes | 75 | setattr: yes |
71 | permission: no | 76 | permission: no |
77 | check_acl: no | ||
72 | getattr: no | 78 | getattr: no |
73 | setxattr: yes | 79 | setxattr: yes |
74 | getxattr: no | 80 | getxattr: no |
75 | listxattr: no | 81 | listxattr: no |
76 | removexattr: yes | 82 | removexattr: yes |
83 | truncate_range: yes | ||
84 | fallocate: no | ||
85 | fiemap: no | ||
77 | Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on | 86 | Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on |
78 | victim. | 87 | victim. |
79 | cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. | 88 | cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. |
80 | ->truncate() is never called directly - it's a callback, not a | 89 | ->truncate() is never called directly - it's a callback, not a |
81 | method. It's called by vmtruncate() - library function normally used by | 90 | method. It's called by vmtruncate() - deprecated library function used by |
82 | ->setattr(). Locking information above applies to that call (i.e. is | 91 | ->setattr(). Locking information above applies to that call (i.e. is |
83 | inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been | 92 | inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been |
84 | passed). | 93 | passed). |
@@ -91,7 +100,7 @@ prototypes: | |||
91 | struct inode *(*alloc_inode)(struct super_block *sb); | 100 | struct inode *(*alloc_inode)(struct super_block *sb); |
92 | void (*destroy_inode)(struct inode *); | 101 | void (*destroy_inode)(struct inode *); |
93 | void (*dirty_inode) (struct inode *); | 102 | void (*dirty_inode) (struct inode *); |
94 | int (*write_inode) (struct inode *, int); | 103 | int (*write_inode) (struct inode *, struct writeback_control *wbc); |
95 | int (*drop_inode) (struct inode *); | 104 | int (*drop_inode) (struct inode *); |
96 | void (*evict_inode) (struct inode *); | 105 | void (*evict_inode) (struct inode *); |
97 | void (*put_super) (struct super_block *); | 106 | void (*put_super) (struct super_block *); |
@@ -105,10 +114,10 @@ prototypes: | |||
105 | int (*show_options)(struct seq_file *, struct vfsmount *); | 114 | int (*show_options)(struct seq_file *, struct vfsmount *); |
106 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); | 115 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); |
107 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); | 116 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); |
117 | int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); | ||
108 | 118 | ||
109 | locking rules: | 119 | locking rules: |
110 | All may block [not true, see below] | 120 | All may block [not true, see below] |
111 | None have BKL | ||
112 | s_umount | 121 | s_umount |
113 | alloc_inode: | 122 | alloc_inode: |
114 | destroy_inode: | 123 | destroy_inode: |
@@ -127,6 +136,7 @@ umount_begin: no | |||
127 | show_options: no (namespace_sem) | 136 | show_options: no (namespace_sem) |
128 | quota_read: no (see below) | 137 | quota_read: no (see below) |
129 | quota_write: no (see below) | 138 | quota_write: no (see below) |
139 | bdev_try_to_free_page: no (see below) | ||
130 | 140 | ||
131 | ->statfs() has s_umount (shared) when called by ustat(2) (native or | 141 | ->statfs() has s_umount (shared) when called by ustat(2) (native or |
132 | compat), but that's an accident of bad API; s_umount is used to pin | 142 | compat), but that's an accident of bad API; s_umount is used to pin |
@@ -139,19 +149,25 @@ be the only ones operating on the quota file by the quota code (via | |||
139 | dqio_sem) (unless an admin really wants to screw up something and | 149 | dqio_sem) (unless an admin really wants to screw up something and |
140 | writes to quota files with quotas on). For other details about locking | 150 | writes to quota files with quotas on). For other details about locking |
141 | see also dquot_operations section. | 151 | see also dquot_operations section. |
152 | ->bdev_try_to_free_page is called from the ->releasepage handler of | ||
153 | the block device inode. See there for more details. | ||
142 | 154 | ||
143 | --------------------------- file_system_type --------------------------- | 155 | --------------------------- file_system_type --------------------------- |
144 | prototypes: | 156 | prototypes: |
145 | int (*get_sb) (struct file_system_type *, int, | 157 | int (*get_sb) (struct file_system_type *, int, |
146 | const char *, void *, struct vfsmount *); | 158 | const char *, void *, struct vfsmount *); |
159 | struct dentry *(*mount) (struct file_system_type *, int, | ||
160 | const char *, void *); | ||
147 | void (*kill_sb) (struct super_block *); | 161 | void (*kill_sb) (struct super_block *); |
148 | locking rules: | 162 | locking rules: |
149 | may block BKL | 163 | may block |
150 | get_sb yes no | 164 | get_sb yes |
151 | kill_sb yes no | 165 | mount yes |
166 | kill_sb yes | ||
152 | 167 | ||
153 | ->get_sb() returns error or 0 with locked superblock attached to the vfsmount | 168 | ->get_sb() returns error or 0 with locked superblock attached to the vfsmount |
154 | (exclusive on ->s_umount). | 169 | (exclusive on ->s_umount). |
170 | ->mount() returns ERR_PTR or the root dentry. | ||
155 | ->kill_sb() takes a write-locked superblock, does all shutdown work on it, | 171 | ->kill_sb() takes a write-locked superblock, does all shutdown work on it, |
156 | unlocks and drops the reference. | 172 | unlocks and drops the reference. |
157 | 173 | ||
@@ -176,27 +192,35 @@ prototypes: | |||
176 | void (*freepage)(struct page *); | 192 | void (*freepage)(struct page *); |
177 | int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, | 193 | int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, |
178 | loff_t offset, unsigned long nr_segs); | 194 | loff_t offset, unsigned long nr_segs); |
179 | int (*launder_page) (struct page *); | 195 | int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, |
196 | unsigned long *); | ||
197 | int (*migratepage)(struct address_space *, struct page *, struct page *); | ||
198 | int (*launder_page)(struct page *); | ||
199 | int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long); | ||
200 | int (*error_remove_page)(struct address_space *, struct page *); | ||
180 | 201 | ||
181 | locking rules: | 202 | locking rules: |
182 | All except set_page_dirty and freepage may block | 203 | All except set_page_dirty and freepage may block |
183 | 204 | ||
184 | BKL PageLocked(page) i_mutex | 205 | PageLocked(page) i_mutex |
185 | writepage: no yes, unlocks (see below) | 206 | writepage: yes, unlocks (see below) |
186 | readpage: no yes, unlocks | 207 | readpage: yes, unlocks |
187 | sync_page: no maybe | 208 | sync_page: maybe |
188 | writepages: no | 209 | writepages: |
189 | set_page_dirty no no | 210 | set_page_dirty no |
190 | readpages: no | 211 | readpages: |
191 | write_begin: no locks the page yes | 212 | write_begin: locks the page yes |
192 | write_end: no yes, unlocks yes | 213 | write_end: yes, unlocks yes |
193 | perform_write: no n/a yes | 214 | bmap: |
194 | bmap: no | 215 | invalidatepage: yes |
195 | invalidatepage: no yes | 216 | releasepage: yes |
196 | releasepage: no yes | 217 | freepage: yes |
197 | freepage: no yes | 218 | direct_IO: |
198 | direct_IO: no | 219 | get_xip_mem: maybe |
199 | launder_page: no yes | 220 | migratepage: yes (both) |
221 | launder_page: yes | ||
222 | is_partially_uptodate: yes | ||
223 | error_remove_page: yes | ||
200 | 224 | ||
201 | ->write_begin(), ->write_end(), ->sync_page() and ->readpage() | 225 | ->write_begin(), ->write_end(), ->sync_page() and ->readpage() |
202 | may be called from the request handler (/dev/loop). | 226 | may be called from the request handler (/dev/loop). |
@@ -276,9 +300,8 @@ under spinlock (it cannot block) and is sometimes called with the page | |||
276 | not locked. | 300 | not locked. |
277 | 301 | ||
278 | ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some | 302 | ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some |
279 | filesystems and by the swapper. The latter will eventually go away. All | 303 | filesystems and by the swapper. The latter will eventually go away. Please, |
280 | instances do not actually need the BKL. Please, keep it that way and don't | 304 | keep it that way and don't breed new callers. |
281 | breed new callers. | ||
282 | 305 | ||
283 | ->invalidatepage() is called when the filesystem must attempt to drop | 306 | ->invalidatepage() is called when the filesystem must attempt to drop |
284 | some or all of the buffers from the page when it is being truncated. It | 307 | some or all of the buffers from the page when it is being truncated. It |
@@ -299,47 +322,37 @@ cleaned, or an error value if not. Note that in order to prevent the page | |||
299 | getting mapped back in and redirtied, it needs to be kept locked | 322 | getting mapped back in and redirtied, it needs to be kept locked |
300 | across the entire operation. | 323 | across the entire operation. |
301 | 324 | ||
302 | Note: currently almost all instances of address_space methods are | ||
303 | using BKL for internal serialization and that's one of the worst sources | ||
304 | of contention. Normally they are calling library functions (in fs/buffer.c) | ||
305 | and pass foo_get_block() as a callback (on local block-based filesystems, | ||
306 | indeed). BKL is not needed for library stuff and is usually taken by | ||
307 | foo_get_block(). It's an overkill, since block bitmaps can be protected by | ||
308 | internal fs locking and real critical areas are much smaller than the areas | ||
309 | filesystems protect now. | ||
310 | |||
311 | ----------------------- file_lock_operations ------------------------------ | 325 | ----------------------- file_lock_operations ------------------------------ |
312 | prototypes: | 326 | prototypes: |
313 | void (*fl_insert)(struct file_lock *); /* lock insertion callback */ | ||
314 | void (*fl_remove)(struct file_lock *); /* lock removal callback */ | ||
315 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); | 327 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); |
316 | void (*fl_release_private)(struct file_lock *); | 328 | void (*fl_release_private)(struct file_lock *); |
317 | 329 | ||
318 | 330 | ||
319 | locking rules: | 331 | locking rules: |
320 | BKL may block | 332 | file_lock_lock may block |
321 | fl_insert: yes no | 333 | fl_copy_lock: yes no |
322 | fl_remove: yes no | 334 | fl_release_private: maybe no |
323 | fl_copy_lock: yes no | ||
324 | fl_release_private: yes yes | ||
325 | 335 | ||
326 | ----------------------- lock_manager_operations --------------------------- | 336 | ----------------------- lock_manager_operations --------------------------- |
327 | prototypes: | 337 | prototypes: |
328 | int (*fl_compare_owner)(struct file_lock *, struct file_lock *); | 338 | int (*fl_compare_owner)(struct file_lock *, struct file_lock *); |
329 | void (*fl_notify)(struct file_lock *); /* unblock callback */ | 339 | void (*fl_notify)(struct file_lock *); /* unblock callback */ |
340 | int (*fl_grant)(struct file_lock *, struct file_lock *, int); | ||
330 | void (*fl_release_private)(struct file_lock *); | 341 | void (*fl_release_private)(struct file_lock *); |
331 | void (*fl_break)(struct file_lock *); /* break_lease callback */ | 342 | void (*fl_break)(struct file_lock *); /* break_lease callback */ |
343 | int (*fl_mylease)(struct file_lock *, struct file_lock *); | ||
344 | int (*fl_change)(struct file_lock **, int); | ||
332 | 345 | ||
333 | locking rules: | 346 | locking rules: |
334 | BKL may block | 347 | file_lock_lock may block |
335 | fl_compare_owner: yes no | 348 | fl_compare_owner: yes no |
336 | fl_notify: yes no | 349 | fl_notify: yes no |
337 | fl_release_private: yes yes | 350 | fl_grant: no no |
338 | fl_break: yes no | 351 | fl_release_private: maybe no |
339 | 352 | fl_break: yes no | |
340 | Currently only NFSD and NLM provide instances of this class. None of | 353 | fl_mylease: yes no |
341 | them block. If you have out-of-tree instances - please, show up. Locking | 354 | fl_change: yes no |
342 | in that area will change. | 355 | |
343 | --------------------------- buffer_head ----------------------------------- | 356 | --------------------------- buffer_head ----------------------------------- |
344 | prototypes: | 357 | prototypes: |
345 | void (*b_end_io)(struct buffer_head *bh, int uptodate); | 358 | void (*b_end_io)(struct buffer_head *bh, int uptodate); |
@@ -364,17 +377,17 @@ prototypes: | |||
364 | void (*swap_slot_free_notify) (struct block_device *, unsigned long); | 377 | void (*swap_slot_free_notify) (struct block_device *, unsigned long); |
365 | 378 | ||
366 | locking rules: | 379 | locking rules: |
367 | BKL bd_mutex | 380 | bd_mutex |
368 | open: no yes | 381 | open: yes |
369 | release: no yes | 382 | release: yes |
370 | ioctl: no no | 383 | ioctl: no |
371 | compat_ioctl: no no | 384 | compat_ioctl: no |
372 | direct_access: no no | 385 | direct_access: no |
373 | media_changed: no no | 386 | media_changed: no |
374 | unlock_native_capacity: no no | 387 | unlock_native_capacity: no |
375 | revalidate_disk: no no | 388 | revalidate_disk: no |
376 | getgeo: no no | 389 | getgeo: no |
377 | swap_slot_free_notify: no no (see below) | 390 | swap_slot_free_notify: no (see below) |
378 | 391 | ||
379 | media_changed, unlock_native_capacity and revalidate_disk are called only from | 392 | media_changed, unlock_native_capacity and revalidate_disk are called only from |
380 | check_disk_change(). | 393 | check_disk_change(). |
@@ -413,34 +426,21 @@ prototypes: | |||
413 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, | 426 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, |
414 | unsigned long, unsigned long, unsigned long); | 427 | unsigned long, unsigned long, unsigned long); |
415 | int (*check_flags)(int); | 428 | int (*check_flags)(int); |
429 | int (*flock) (struct file *, int, struct file_lock *); | ||
430 | ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, | ||
431 | size_t, unsigned int); | ||
432 | ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, | ||
433 | size_t, unsigned int); | ||
434 | int (*setlease)(struct file *, long, struct file_lock **); | ||
416 | }; | 435 | }; |
417 | 436 | ||
418 | locking rules: | 437 | locking rules: |
419 | All may block. | 438 | All may block except for ->setlease. |
420 | BKL | 439 | No VFS locks held on entry except for ->fsync and ->setlease. |
421 | llseek: no (see below) | 440 | |
422 | read: no | 441 | ->fsync() has i_mutex on inode. |
423 | aio_read: no | 442 | |
424 | write: no | 443 | ->setlease has the file_list_lock held and must not sleep. |
425 | aio_write: no | ||
426 | readdir: no | ||
427 | poll: no | ||
428 | unlocked_ioctl: no | ||
429 | compat_ioctl: no | ||
430 | mmap: no | ||
431 | open: no | ||
432 | flush: no | ||
433 | release: no | ||
434 | fsync: no (see below) | ||
435 | aio_fsync: no | ||
436 | fasync: no | ||
437 | lock: yes | ||
438 | readv: no | ||
439 | writev: no | ||
440 | sendfile: no | ||
441 | sendpage: no | ||
442 | get_unmapped_area: no | ||
443 | check_flags: no | ||
444 | 444 | ||
445 | ->llseek() locking has moved from llseek to the individual llseek | 445 | ->llseek() locking has moved from llseek to the individual llseek |
446 | implementations. If your fs is not using generic_file_llseek, you | 446 | implementations. If your fs is not using generic_file_llseek, you |
@@ -450,17 +450,10 @@ mutex or just to use i_size_read() instead. | |||
450 | Note: this does not protect the file->f_pos against concurrent modifications | 450 | Note: this does not protect the file->f_pos against concurrent modifications |
450 | since this is something the userspace has to take care of. | 451 | since this is something the userspace has to take care of. |
452 | 452 | ||
453 | Note: ext2_release() was *the* source of contention on fs-intensive | 453 | ->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags. |
454 | loads and dropping BKL on ->release() helps to get rid of that (we still | 454 | Most instances call fasync_helper(), which does that maintenance, so it's |
455 | grab BKL for cases when we close a file that had been opened r/w, but that | 455 | not normally something one needs to worry about. Return values > 0 will be |
456 | can and should be done using the internal locking with smaller critical areas). | 456 | mapped to zero in the VFS layer. |
457 | Current worst offender is ext2_get_block()... | ||
458 | |||
459 | ->fasync() is called without BKL protection, and is responsible for | ||
460 | maintaining the FASYNC bit in filp->f_flags. Most instances call | ||
461 | fasync_helper(), which does that maintenance, so it's not normally | ||
462 | something one needs to worry about. Return values > 0 will be mapped to | ||
463 | zero in the VFS layer. | ||
464 | 457 | ||
465 | ->readdir() and ->ioctl() on directories must be changed. Ideally we would | 458 | ->readdir() and ->ioctl() on directories must be changed. Ideally we would |
466 | move ->readdir() to inode_operations and use a separate method for directory | 459 | move ->readdir() to inode_operations and use a separate method for directory |
@@ -471,8 +464,6 @@ components. And there are other reasons why the current interface is a mess... | |||
471 | ->read on directories probably must go away - we should just enforce -EISDIR | 464 | ->read on directories probably must go away - we should just enforce -EISDIR |
472 | in sys_read() and friends. | 465 | in sys_read() and friends. |
473 | 466 | ||
474 | ->fsync() has i_mutex on inode. | ||
475 | |||
476 | --------------------------- dquot_operations ------------------------------- | 467 | --------------------------- dquot_operations ------------------------------- |
477 | prototypes: | 468 | prototypes: |
478 | int (*write_dquot) (struct dquot *); | 469 | int (*write_dquot) (struct dquot *); |
@@ -507,12 +498,12 @@ prototypes: | |||
507 | int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); | 498 | int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); |
508 | 499 | ||
509 | locking rules: | 500 | locking rules: |
510 | BKL mmap_sem PageLocked(page) | 501 | mmap_sem PageLocked(page) |
511 | open: no yes | 502 | open: yes |
512 | close: no yes | 503 | close: yes |
513 | fault: no yes can return with page locked | 504 | fault: yes can return with page locked |
514 | page_mkwrite: no yes can return with page locked | 505 | page_mkwrite: yes can return with page locked |
515 | access: no yes | 506 | access: yes |
516 | 507 | ||
517 | ->fault() is called when a previously not present pte is about | 508 | ->fault() is called when a previously not present pte is about |
518 | to be faulted in. The filesystem must find and return the page associated | 509 | to be faulted in. The filesystem must find and return the page associated |
@@ -539,6 +530,3 @@ VM_IO | VM_PFNMAP VMAs. | |||
539 | 530 | ||
540 | (if you break something or notice that it is broken and do not fix it yourself | 531 | (if you break something or notice that it is broken and do not fix it yourself |
541 | - at least put it here) | 532 | - at least put it here) |
542 | |||
543 | ipc/shm.c::shm_delete() - may need BKL. | ||
544 | ->read() and ->write() in many drivers are (probably) missing BKL. | ||