diff options
author | Jan Kara <jack@suse.cz> | 2012-06-12 10:20:34 -0400 |
---|---|---|
committer | Al Viro <viro@zeniv.linux.org.uk> | 2012-07-31 01:30:13 -0400 |
commit | 5accdf82ba25cacefd6c1867f1704beb4d244cdd (patch) | |
tree | 7125b01d9bf0f23d5c5eaed0cbafa9a1cbe544d5 /include/linux | |
parent | d87aae2f3c8e90bd0fe03f5309b4d066b712b8ec (diff) |
fs: Improve filesystem freezing handling
vfs_check_frozen() tests are racy since the filesystem can be frozen just after
the test is performed. Thus in write paths we can end up marking some pages or
inodes dirty even though the file system is already frozen. This creates
problems with the flusher thread hanging on a frozen filesystem.
Another problem is that exclusion between ->page_mkwrite() and filesystem
freezing has been handled by setting the page dirty and then verifying s_frozen.
This guaranteed that either the freezing code sees the faulted page, writes it,
and write-protects it again, or we see s_frozen set and bail out of the page fault.
This protects against a page being marked writeable while filesystem
freezing is running, but has the unpleasant artefact of leaving dirty (although
unmodified and write-protected) pages on a frozen filesystem, resulting in
problems with the flusher thread similar to the first problem.
This patch aims at providing exclusion between write paths and filesystem
freezing. We implement a writer-freeze read-write semaphore in the superblock.
Actually, there are three such semaphores because of lock ranking reasons - one
for page fault handlers (->page_mkwrite), one for all other writers, and one for
internal filesystem purposes (used e.g. to track running transactions). Write
paths which should block freezing (e.g. directory operations, ->aio_write(),
->page_mkwrite) hold reader side of the semaphore. Code freezing the filesystem
takes the writer side.
However, we don't really want to bounce the cachelines of the semaphores between
CPUs for each write happening. So we implement the reader side of the semaphore
as a per-cpu counter, and the writer side is implemented using the
s_writers.frozen superblock field.
[AV: microoptimize sb_start_write(); we want it fast in normal case]
BugLink: https://bugs.launchpad.net/bugs/897421
Tested-by: Kamal Mostafa <kamal@canonical.com>
Tested-by: Peter M. Petrakis <peter.petrakis@canonical.com>
Tested-by: Dann Frazier <dann.frazier@canonical.com>
Tested-by: Massimo Morana <massimo.morana@canonical.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'include/linux')
-rw-r--r-- | include/linux/fs.h | 150 |
1 files changed, 143 insertions, 7 deletions
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 80c819cbe272..aefed9426b03 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -412,6 +412,7 @@ struct inodes_stat_t { | |||
412 | #include <linux/shrinker.h> | 412 | #include <linux/shrinker.h> |
413 | #include <linux/migrate_mode.h> | 413 | #include <linux/migrate_mode.h> |
414 | #include <linux/uidgid.h> | 414 | #include <linux/uidgid.h> |
415 | #include <linux/lockdep.h> | ||
415 | 416 | ||
416 | #include <asm/byteorder.h> | 417 | #include <asm/byteorder.h> |
417 | 418 | ||
@@ -1439,6 +1440,8 @@ extern void f_delown(struct file *filp); | |||
1439 | extern pid_t f_getown(struct file *filp); | 1440 | extern pid_t f_getown(struct file *filp); |
1440 | extern int send_sigurg(struct fown_struct *fown); | 1441 | extern int send_sigurg(struct fown_struct *fown); |
1441 | 1442 | ||
1443 | struct mm_struct; | ||
1444 | |||
1442 | /* | 1445 | /* |
1443 | * Umount options | 1446 | * Umount options |
1444 | */ | 1447 | */ |
@@ -1452,6 +1455,32 @@ extern int send_sigurg(struct fown_struct *fown); | |||
1452 | extern struct list_head super_blocks; | 1455 | extern struct list_head super_blocks; |
1453 | extern spinlock_t sb_lock; | 1456 | extern spinlock_t sb_lock; |
1454 | 1457 | ||
1458 | /* Possible states of 'frozen' field */ | ||
1459 | enum { | ||
1460 | SB_UNFROZEN = 0, /* FS is unfrozen */ | ||
1461 | SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ | ||
1462 | SB_FREEZE_TRANS = 2, | ||
1463 | SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ | ||
1464 | SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop | ||
1465 | * internal threads if needed) */ | ||
1466 | SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ | ||
1467 | }; | ||
1468 | |||
1469 | #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) | ||
1470 | |||
1471 | struct sb_writers { | ||
1472 | /* Counters for counting writers at each level */ | ||
1473 | struct percpu_counter counter[SB_FREEZE_LEVELS]; | ||
1474 | wait_queue_head_t wait; /* queue for waiting for | ||
1475 | writers / faults to finish */ | ||
1476 | int frozen; /* Is sb frozen? */ | ||
1477 | wait_queue_head_t wait_unfrozen; /* queue for waiting for | ||
1478 | sb to be thawed */ | ||
1479 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
1480 | struct lockdep_map lock_map[SB_FREEZE_LEVELS]; | ||
1481 | #endif | ||
1482 | }; | ||
1483 | |||
1455 | struct super_block { | 1484 | struct super_block { |
1456 | struct list_head s_list; /* Keep this first */ | 1485 | struct list_head s_list; /* Keep this first */ |
1457 | dev_t s_dev; /* search index; _not_ kdev_t */ | 1486 | dev_t s_dev; /* search index; _not_ kdev_t */ |
@@ -1501,6 +1530,7 @@ struct super_block { | |||
1501 | 1530 | ||
1502 | int s_frozen; | 1531 | int s_frozen; |
1503 | wait_queue_head_t s_wait_unfrozen; | 1532 | wait_queue_head_t s_wait_unfrozen; |
1533 | struct sb_writers s_writers; | ||
1504 | 1534 | ||
1505 | char s_id[32]; /* Informational name */ | 1535 | char s_id[32]; /* Informational name */ |
1506 | u8 s_uuid[16]; /* UUID */ | 1536 | u8 s_uuid[16]; /* UUID */ |
@@ -1555,14 +1585,119 @@ extern struct timespec current_fs_time(struct super_block *sb); | |||
1555 | /* | 1585 | /* |
1556 | * Snapshotting support. | 1586 | * Snapshotting support. |
1557 | */ | 1587 | */ |
1558 | enum { | 1588 | /* Will go away when all users are converted */ |
1559 | SB_UNFROZEN = 0, | 1589 | #define vfs_check_frozen(sb, level) do { } while (0) |
1560 | SB_FREEZE_WRITE = 1, | 1590 | |
1561 | SB_FREEZE_TRANS = 2, | 1591 | void __sb_end_write(struct super_block *sb, int level); |
1562 | }; | 1592 | int __sb_start_write(struct super_block *sb, int level, bool wait); |
1593 | |||
1594 | /** | ||
1595 | * sb_end_write - drop write access to a superblock | ||
1596 | * @sb: the super we wrote to | ||
1597 | * | ||
1598 | * Decrement number of writers to the filesystem. Wake up possible waiters | ||
1599 | * wanting to freeze the filesystem. | ||
1600 | */ | ||
1601 | static inline void sb_end_write(struct super_block *sb) | ||
1602 | { | ||
1603 | __sb_end_write(sb, SB_FREEZE_WRITE); | ||
1604 | } | ||
1605 | |||
1606 | /** | ||
1607 | * sb_end_pagefault - drop write access to a superblock from a page fault | ||
1608 | * @sb: the super we wrote to | ||
1609 | * | ||
1610 | * Decrement number of processes handling write page fault to the filesystem. | ||
1611 | * Wake up possible waiters wanting to freeze the filesystem. | ||
1612 | */ | ||
1613 | static inline void sb_end_pagefault(struct super_block *sb) | ||
1614 | { | ||
1615 | __sb_end_write(sb, SB_FREEZE_PAGEFAULT); | ||
1616 | } | ||
1617 | |||
1618 | /** | ||
1619 | * sb_end_intwrite - drop write access to a superblock for internal fs purposes | ||
1620 | * @sb: the super we wrote to | ||
1621 | * | ||
1622 | * Decrement fs-internal number of writers to the filesystem. Wake up possible | ||
1623 | * waiters wanting to freeze the filesystem. | ||
1624 | */ | ||
1625 | static inline void sb_end_intwrite(struct super_block *sb) | ||
1626 | { | ||
1627 | __sb_end_write(sb, SB_FREEZE_FS); | ||
1628 | } | ||
1629 | |||
1630 | /** | ||
1631 | * sb_start_write - get write access to a superblock | ||
1632 | * @sb: the super we write to | ||
1633 | * | ||
1634 | * When a process wants to write data or metadata to a file system (i.e. dirty | ||
1635 | * a page or an inode), it should embed the operation in a sb_start_write() - | ||
1636 | * sb_end_write() pair to get exclusion against file system freezing. This | ||
1637 | * function increments number of writers preventing freezing. If the file | ||
1638 | * system is already frozen, the function waits until the file system is | ||
1639 | * thawed. | ||
1640 | * | ||
1641 | * Since freeze protection behaves as a lock, users have to preserve | ||
1642 | * ordering of freeze protection and other filesystem locks. Generally, | ||
1643 | * freeze protection should be the outermost lock. In particular, we have: | ||
1644 | * | ||
1645 | * sb_start_write | ||
1646 | * -> i_mutex (write path, truncate, directory ops, ...) | ||
1647 | * -> s_umount (freeze_super, thaw_super) | ||
1648 | */ | ||
1649 | static inline void sb_start_write(struct super_block *sb) | ||
1650 | { | ||
1651 | __sb_start_write(sb, SB_FREEZE_WRITE, true); | ||
1652 | } | ||
1653 | |||
1654 | static inline int sb_start_write_trylock(struct super_block *sb) | ||
1655 | { | ||
1656 | return __sb_start_write(sb, SB_FREEZE_WRITE, false); | ||
1657 | } | ||
1658 | |||
1659 | /** | ||
1660 | * sb_start_pagefault - get write access to a superblock from a page fault | ||
1661 | * @sb: the super we write to | ||
1662 | * | ||
1663 | * When a process starts handling write page fault, it should embed the | ||
1664 | * operation into sb_start_pagefault() - sb_end_pagefault() pair to get | ||
1665 | * exclusion against file system freezing. This is needed since the page fault | ||
1666 | * is going to dirty a page. This function increments number of running page | ||
1667 | * faults preventing freezing. If the file system is already frozen, the | ||
1668 | * function waits until the file system is thawed. | ||
1669 | * | ||
1670 | * Since page fault freeze protection behaves as a lock, users have to preserve | ||
1671 | * ordering of freeze protection and other filesystem locks. It is advised to | ||
1672 | * put sb_start_pagefault() close to mmap_sem in lock ordering. Page fault | ||
1673 | * handling code implies lock dependency: | ||
1674 | * | ||
1675 | * mmap_sem | ||
1676 | * -> sb_start_pagefault | ||
1677 | */ | ||
1678 | static inline void sb_start_pagefault(struct super_block *sb) | ||
1679 | { | ||
1680 | __sb_start_write(sb, SB_FREEZE_PAGEFAULT, true); | ||
1681 | } | ||
1682 | |||
1683 | /** | ||
1684 | * sb_start_intwrite - get write access to a superblock for internal fs purposes | ||
1685 | * @sb: the super we write to | ||
1686 | * | ||
1687 | * This is the third level of protection against filesystem freezing. It is | ||
1688 | * free for use by a filesystem. The only requirement is that it must rank | ||
1689 | * below sb_start_pagefault. | ||
1690 | * | ||
1691 | * For example filesystem can call sb_start_intwrite() when starting a | ||
1692 | * transaction which somewhat eases handling of freezing for internal sources | ||
1693 | * of filesystem changes (internal fs threads, discarding preallocation on file | ||
1694 | * close, etc.). | ||
1695 | */ | ||
1696 | static inline void sb_start_intwrite(struct super_block *sb) | ||
1697 | { | ||
1698 | __sb_start_write(sb, SB_FREEZE_FS, true); | ||
1699 | } | ||
1563 | 1700 | ||
1564 | #define vfs_check_frozen(sb, level) \ | ||
1565 | wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) | ||
1566 | 1701 | ||
1567 | extern bool inode_owner_or_capable(const struct inode *inode); | 1702 | extern bool inode_owner_or_capable(const struct inode *inode); |
1568 | 1703 | ||
@@ -1886,6 +2021,7 @@ struct file_system_type { | |||
1886 | struct lock_class_key s_lock_key; | 2021 | struct lock_class_key s_lock_key; |
1887 | struct lock_class_key s_umount_key; | 2022 | struct lock_class_key s_umount_key; |
1888 | struct lock_class_key s_vfs_rename_key; | 2023 | struct lock_class_key s_vfs_rename_key; |
2024 | struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; | ||
1889 | 2025 | ||
1890 | struct lock_class_key i_lock_key; | 2026 | struct lock_class_key i_lock_key; |
1891 | struct lock_class_key i_mutex_key; | 2027 | struct lock_class_key i_mutex_key; |