aboutsummaryrefslogtreecommitdiffstats
path: root/include/linux/fs.h
diff options
context:
space:
mode:
author Jan Kara <jack@suse.cz> 2012-06-12 10:20:34 -0400
committer Al Viro <viro@zeniv.linux.org.uk> 2012-07-31 01:30:13 -0400
commit 5accdf82ba25cacefd6c1867f1704beb4d244cdd (patch)
tree 7125b01d9bf0f23d5c5eaed0cbafa9a1cbe544d5 /include/linux/fs.h
parent d87aae2f3c8e90bd0fe03f5309b4d066b712b8ec (diff)
fs: Improve filesystem freezing handling
vfs_check_frozen() tests are racy since the filesystem can be frozen just after the test is performed. Thus in write paths we can end up marking some pages or inodes dirty even though the file system is already frozen. This creates problems with flusher thread hanging on frozen filesystem. Another problem is that exclusion between ->page_mkwrite() and filesystem freezing has been handled by setting page dirty and then verifying s_frozen. This guaranteed that either the freezing code sees the faulted page, writes it, and writeprotects it again or we see s_frozen set and bail out of page fault. This works to protect from page being marked writeable while filesystem freezing is running but has an unpleasant artefact of leaving dirty (although unmodified and writeprotected) pages on frozen filesystem resulting in similar problems with flusher thread as the first problem. This patch aims at providing exclusion between write paths and filesystem freezing. We implement a writer-freeze read-write semaphore in the superblock. Actually, there are three such semaphores because of lock ranking reasons - one for page fault handlers (->page_mkwrite), one for all other writers, and one for internal filesystem purposes (used e.g. to track running transactions). Write paths which should block freezing (e.g. directory operations, ->aio_write(), ->page_mkwrite) hold reader side of the semaphore. Code freezing the filesystem takes the writer side. Only that we don't really want to bounce cachelines of the semaphores between CPUs for each write happening. So we implement the reader side of the semaphore as a per-cpu counter and the writer side is implemented using s_writers.frozen superblock field. [AV: microoptimize sb_start_write(); we want it fast in normal case]
BugLink: https://bugs.launchpad.net/bugs/897421 Tested-by: Kamal Mostafa <kamal@canonical.com> Tested-by: Peter M. Petrakis <peter.petrakis@canonical.com> Tested-by: Dann Frazier <dann.frazier@canonical.com> Tested-by: Massimo Morana <massimo.morana@canonical.com> Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'include/linux/fs.h')
-rw-r--r-- include/linux/fs.h 150
1 files changed, 143 insertions, 7 deletions
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 80c819cbe272..aefed9426b03 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -412,6 +412,7 @@ struct inodes_stat_t {
412#include <linux/shrinker.h> 412#include <linux/shrinker.h>
413#include <linux/migrate_mode.h> 413#include <linux/migrate_mode.h>
414#include <linux/uidgid.h> 414#include <linux/uidgid.h>
415#include <linux/lockdep.h>
415 416
416#include <asm/byteorder.h> 417#include <asm/byteorder.h>
417 418
@@ -1439,6 +1440,8 @@ extern void f_delown(struct file *filp);
1439extern pid_t f_getown(struct file *filp); 1440extern pid_t f_getown(struct file *filp);
1440extern int send_sigurg(struct fown_struct *fown); 1441extern int send_sigurg(struct fown_struct *fown);
1441 1442
1443struct mm_struct;
1444
1442/* 1445/*
1443 * Umount options 1446 * Umount options
1444 */ 1447 */
@@ -1452,6 +1455,32 @@ extern int send_sigurg(struct fown_struct *fown);
1452extern struct list_head super_blocks; 1455extern struct list_head super_blocks;
1453extern spinlock_t sb_lock; 1456extern spinlock_t sb_lock;
1454 1457
1458/* Possible states of the 'frozen' field of struct sb_writers */
1459enum {
1460 SB_UNFROZEN = 0, /* FS is unfrozen */
1461 SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */
1462 SB_FREEZE_TRANS = 2, /* Same value as SB_FREEZE_PAGEFAULT; presumably kept for not-yet-converted users - TODO confirm */
1463 SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */
1464 SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop
1465 * internal threads if needed) */
1466 SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */
1467};
1468
1469#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) /* evaluates to 3; sizes the per-level arrays counter[], lock_map[] and s_writers_key[] */
1470
/* Freeze-protection state; embedded in struct super_block as s_writers. */
1471struct sb_writers {
1472 /* One writer counter per freeze level (SB_FREEZE_LEVELS entries) */
1473 struct percpu_counter counter[SB_FREEZE_LEVELS];
1474 wait_queue_head_t wait; /* queue for waiting for
1475 writers / faults to finish */
1476 int frozen; /* Freeze state of the sb: SB_UNFROZEN .. SB_FREEZE_COMPLETE */
1477 wait_queue_head_t wait_unfrozen; /* queue for waiting for
1478 sb to be thawed */
1479#ifdef CONFIG_DEBUG_LOCK_ALLOC
1480 struct lockdep_map lock_map[SB_FREEZE_LEVELS]; /* one lockdep map per level; present only when CONFIG_DEBUG_LOCK_ALLOC is set */
1481#endif
1482};
1483
1455struct super_block { 1484struct super_block {
1456 struct list_head s_list; /* Keep this first */ 1485 struct list_head s_list; /* Keep this first */
1457 dev_t s_dev; /* search index; _not_ kdev_t */ 1486 dev_t s_dev; /* search index; _not_ kdev_t */
@@ -1501,6 +1530,7 @@ struct super_block {
1501 1530
1502 int s_frozen; 1531 int s_frozen;
1503 wait_queue_head_t s_wait_unfrozen; 1532 wait_queue_head_t s_wait_unfrozen;
1533 struct sb_writers s_writers;
1504 1534
1505 char s_id[32]; /* Informational name */ 1535 char s_id[32]; /* Informational name */
1506 u8 s_uuid[16]; /* UUID */ 1536 u8 s_uuid[16]; /* UUID */
@@ -1555,14 +1585,119 @@ extern struct timespec current_fs_time(struct super_block *sb);
1555/* 1585/*
1556 * Snapshotting support. 1586 * Snapshotting support.
1557 */ 1587 */
1558enum { 1588/* Will go away when all users are converted */
1559 SB_UNFROZEN = 0, 1589#define vfs_check_frozen(sb, level) do { } while (0)
1560 SB_FREEZE_WRITE = 1, 1590
1561 SB_FREEZE_TRANS = 2, 1591void __sb_end_write(struct super_block *sb, int level);
1562}; 1592int __sb_start_write(struct super_block *sb, int level, bool wait);
1593
1594/**
1595 * sb_end_write - drop write access to a superblock
1596 * @sb: the super we wrote to
1597 *
1598 * Decrement number of writers to the filesystem. Wake up possible waiters
1599 * wanting to freeze the filesystem.
 *
 * Closes a section opened with sb_start_write(); passes SB_FREEZE_WRITE as
 * the level to __sb_end_write().
1600 */
1601static inline void sb_end_write(struct super_block *sb)
1602{
1603 __sb_end_write(sb, SB_FREEZE_WRITE);
1604}
1605
1606/**
1607 * sb_end_pagefault - drop write access to a superblock from a page fault
1608 * @sb: the super we wrote to
1609 *
1610 * Decrement number of processes handling write page fault to the filesystem.
1611 * Wake up possible waiters wanting to freeze the filesystem.
 *
 * Closes a section opened with sb_start_pagefault(); passes
 * SB_FREEZE_PAGEFAULT as the level to __sb_end_write().
1612 */
1613static inline void sb_end_pagefault(struct super_block *sb)
1614{
1615 __sb_end_write(sb, SB_FREEZE_PAGEFAULT);
1616}
1617
1618/**
1619 * sb_end_intwrite - drop write access to a superblock for internal fs purposes
1620 * @sb: the super we wrote to
1621 *
1622 * Decrement fs-internal number of writers to the filesystem. Wake up possible
1623 * waiters wanting to freeze the filesystem.
 *
 * Closes a section opened with sb_start_intwrite(); passes SB_FREEZE_FS as
 * the level to __sb_end_write().
1624 */
1625static inline void sb_end_intwrite(struct super_block *sb)
1626{
1627 __sb_end_write(sb, SB_FREEZE_FS);
1628}
1629
1630/**
1631 * sb_start_write - get write access to a superblock
1632 * @sb: the super we write to
1633 *
1634 * When a process wants to write data or metadata to a file system (i.e. dirty
1635 * a page or an inode), it should embed the operation in a sb_start_write() -
1636 * sb_end_write() pair to get exclusion against file system freezing. This
1637 * function increments number of writers preventing freezing. If the file
1638 * system is already frozen, the function waits until the file system is
1639 * thawed.
1640 *
1641 * Since freeze protection behaves as a lock, users have to preserve
1642 * ordering of freeze protection and other filesystem locks. Generally,
1643 * freeze protection should be the outermost lock. In particular, we have:
1644 *
1645 * sb_start_write
1646 * -> i_mutex (write path, truncate, directory ops, ...)
1647 * -> s_umount (freeze_super, thaw_super)
1648 */
1649static inline void sb_start_write(struct super_block *sb)
1650{
1651 __sb_start_write(sb, SB_FREEZE_WRITE, true); /* wait == true; the int result is not checked here */
1652}
1653
/*
 * sb_start_write_trylock - variant of sb_start_write() that passes wait == false
 * @sb: the super we write to
 *
 * Calls __sb_start_write() for the SB_FREEZE_WRITE level with wait == false
 * and hands its int result back to the caller.
 * NOTE(review): presumably a nonzero result means write access was obtained
 * and must later be dropped with sb_end_write() - confirm against
 * __sb_start_write().
 */
1654static inline int sb_start_write_trylock(struct super_block *sb)
1655{
1656 return __sb_start_write(sb, SB_FREEZE_WRITE, false);
1657}
1658
1659/**
1660 * sb_start_pagefault - get write access to a superblock from a page fault
1661 * @sb: the super we write to
1662 *
1663 * When a process starts handling write page fault, it should embed the
1664 * operation into sb_start_pagefault() - sb_end_pagefault() pair to get
1665 * exclusion against file system freezing. This is needed since the page fault
1666 * is going to dirty a page. This function increments number of running page
1667 * faults preventing freezing. If the file system is already frozen, the
1668 * function waits until the file system is thawed.
1669 *
1670 * Since page fault freeze protection behaves as a lock, users have to preserve
1671 * ordering of freeze protection and other filesystem locks. It is advised to
1672 * put sb_start_pagefault() close to mmap_sem in lock ordering. Page fault
1673 * handling code implies lock dependency:
1674 *
1675 * mmap_sem
1676 * -> sb_start_pagefault
1677 */
1678static inline void sb_start_pagefault(struct super_block *sb)
1679{
1680 __sb_start_write(sb, SB_FREEZE_PAGEFAULT, true); /* wait == true; the int result is not checked here */
1681}
1682
1683/**
1684 * sb_start_intwrite - get write access to a superblock for internal fs purposes
1685 * @sb: the super we write to
1686 *
1687 * This is the third level of protection against filesystem freezing. It is
1688 * free for use by a filesystem. The only requirement is that it must rank
1689 * below sb_start_pagefault.
1690 *
1691 * For example filesystem can call sb_start_intwrite() when starting a
1692 * transaction which somewhat eases handling of freezing for internal sources
1693 * of filesystem changes (internal fs threads, discarding preallocation on file
1694 * close, etc.).
 *
 * Should be paired with sb_end_intwrite(), which passes the same
 * SB_FREEZE_FS level to __sb_end_write().
1695 */
1696static inline void sb_start_intwrite(struct super_block *sb)
1697{
1698 __sb_start_write(sb, SB_FREEZE_FS, true); /* wait == true; the int result is not checked here */
1699}
1563 1700
1564#define vfs_check_frozen(sb, level) \
1565 wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
1566 1701
1567extern bool inode_owner_or_capable(const struct inode *inode); 1702extern bool inode_owner_or_capable(const struct inode *inode);
1568 1703
@@ -1886,6 +2021,7 @@ struct file_system_type {
1886 struct lock_class_key s_lock_key; 2021 struct lock_class_key s_lock_key;
1887 struct lock_class_key s_umount_key; 2022 struct lock_class_key s_umount_key;
1888 struct lock_class_key s_vfs_rename_key; 2023 struct lock_class_key s_vfs_rename_key;
2024 struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];
1889 2025
1890 struct lock_class_key i_lock_key; 2026 struct lock_class_key i_lock_key;
1891 struct lock_class_key i_mutex_key; 2027 struct lock_class_key i_mutex_key;