aboutsummaryrefslogtreecommitdiffstats
path: root/fs/dcache.c
diff options
context:
space:
mode:
authorNick Piggin <npiggin@kernel.dk>2011-01-07 01:49:52 -0500
committerNick Piggin <npiggin@kernel.dk>2011-01-07 01:50:27 -0500
commit31e6b01f4183ff419a6d1f86177cbf4662347cec (patch)
treee215ec9af88352c55e024f784f3d9f8eb13fab85 /fs/dcache.c
parent3c22cd5709e8143444a6d08682a87f4c57902df3 (diff)
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the ancestor dentry elements. This is called rcu-walk, as opposed to the current algorithm which is a refcount based walk, or ref-walk. This results in far fewer atomic operations on every path element, significantly improving path lookup performance. It also avoids cacheline bouncing on common dentries, significantly improving scalability. The overall design is like this: * LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk. * Take the RCU lock for the entire path walk, starting with the acquiring of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are not required for dentry persistence. * synchronize_rcu is called when unregistering a filesystem, so we can access d_ops and i_ops during rcu-walk. * Similarly take the vfsmount lock for the entire path walk. So now mnt refcounts are not required for persistence. Also we are free to perform mount lookups, and to assume dentry mount points and mount roots are stable up and down the path. * Have a per-dentry seqlock to protect the dentry name, parent, and inode, so we can load this tuple atomically, and also check whether any of its members have changed. * Dentry lookups (based on parent, candidate string tuple) recheck the parent sequence after the child is found in case anything changed in the parent during the path walk. * inode is also RCU protected so we can load d_inode and use the inode for limited things. * i_mode, i_uid, i_gid can be tested for exec permissions during path walk. * i_op can be loaded. When we reach the destination dentry, we lock it, recheck lookup sequence, and increment its refcount and mountpoint refcount. RCU and vfsmount locks are dropped. This is termed "dropping rcu-walk". If the dentry refcount does not match, we can not drop rcu-walk gracefully at the current point in the lookup, so instead return -ECHILD (for want of a better errno). 
This signals the path walking code to re-do the entire lookup with a ref-walk. Aside from the final dentry, there are other situations that may be encountered where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take a reference on the last good dentry) and continue with a ref-walk. Again, if we can not drop rcu-walk gracefully, we return -ECHILD and do the whole lookup using ref-walk. But it is very important that we can continue with ref-walk for most cases, particularly to avoid the overhead of double lookups, and to gain the scalability advantages on common path elements (like cwd and root). The cases where rcu-walk cannot continue are: * NULL dentry (ie. any uncached path element) * parent with d_inode->i_op->permission or ACLs * dentries with d_revalidate * Following links In future patches, permission checks and d_revalidate become rcu-walk aware. It may be possible eventually to make following links rcu-walk aware. Uncached path elements will always require dropping to ref-walk mode, at the very least because i_mutex needs to be grabbed, and objects allocated. Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Diffstat (limited to 'fs/dcache.c')
-rw-r--r--fs/dcache.c203
1 files changed, 181 insertions, 22 deletions
diff --git a/fs/dcache.c b/fs/dcache.c
index dc0551c9755d..187fea040108 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -152,9 +152,23 @@ static void d_free(struct dentry *dentry)
152 call_rcu(&dentry->d_u.d_rcu, __d_free); 152 call_rcu(&dentry->d_u.d_rcu, __d_free);
153} 153}
154 154
155/**
156 * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
157 * After this call, in-progress rcu-walk path lookup will fail. This
158 * should be called after unhashing, and after changing d_inode (if
159 * the dentry has not already been unhashed).
160 */
161static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
162{
163 assert_spin_locked(&dentry->d_lock);
164 /* Go through a barrier */
165 write_seqcount_barrier(&dentry->d_seq);
166}
167
155/* 168/*
156 * Release the dentry's inode, using the filesystem 169 * Release the dentry's inode, using the filesystem
157 * d_iput() operation if defined. 170 * d_iput() operation if defined. Dentry has no refcount
171 * and is unhashed.
158 */ 172 */
159static void dentry_iput(struct dentry * dentry) 173static void dentry_iput(struct dentry * dentry)
160 __releases(dentry->d_lock) 174 __releases(dentry->d_lock)
@@ -179,6 +193,28 @@ static void dentry_iput(struct dentry * dentry)
179} 193}
180 194
181/* 195/*
196 * Release the dentry's inode, using the filesystem
197 * d_iput() operation if defined. dentry remains in-use.
198 */
199static void dentry_unlink_inode(struct dentry * dentry)
200 __releases(dentry->d_lock)
201 __releases(dcache_inode_lock)
202{
203 struct inode *inode = dentry->d_inode;
204 dentry->d_inode = NULL;
205 list_del_init(&dentry->d_alias);
206 dentry_rcuwalk_barrier(dentry);
207 spin_unlock(&dentry->d_lock);
208 spin_unlock(&dcache_inode_lock);
209 if (!inode->i_nlink)
210 fsnotify_inoderemove(inode);
211 if (dentry->d_op && dentry->d_op->d_iput)
212 dentry->d_op->d_iput(dentry, inode);
213 else
214 iput(inode);
215}
216
217/*
182 * dentry_lru_(add|del|move_tail) must be called with d_lock held. 218 * dentry_lru_(add|del|move_tail) must be called with d_lock held.
183 */ 219 */
184static void dentry_lru_add(struct dentry *dentry) 220static void dentry_lru_add(struct dentry *dentry)
@@ -272,6 +308,7 @@ void __d_drop(struct dentry *dentry)
272 spin_lock(&dcache_hash_lock); 308 spin_lock(&dcache_hash_lock);
273 hlist_del_rcu(&dentry->d_hash); 309 hlist_del_rcu(&dentry->d_hash);
274 spin_unlock(&dcache_hash_lock); 310 spin_unlock(&dcache_hash_lock);
311 dentry_rcuwalk_barrier(dentry);
275 } 312 }
276} 313}
277EXPORT_SYMBOL(__d_drop); 314EXPORT_SYMBOL(__d_drop);
@@ -309,6 +346,7 @@ relock:
309 spin_unlock(&dcache_inode_lock); 346 spin_unlock(&dcache_inode_lock);
310 goto relock; 347 goto relock;
311 } 348 }
349
312 if (ref) 350 if (ref)
313 dentry->d_count--; 351 dentry->d_count--;
314 /* if dentry was on the d_lru list delete it from there */ 352 /* if dentry was on the d_lru list delete it from there */
@@ -1221,6 +1259,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
1221 dentry->d_count = 1; 1259 dentry->d_count = 1;
1222 dentry->d_flags = DCACHE_UNHASHED; 1260 dentry->d_flags = DCACHE_UNHASHED;
1223 spin_lock_init(&dentry->d_lock); 1261 spin_lock_init(&dentry->d_lock);
1262 seqcount_init(&dentry->d_seq);
1224 dentry->d_inode = NULL; 1263 dentry->d_inode = NULL;
1225 dentry->d_parent = NULL; 1264 dentry->d_parent = NULL;
1226 dentry->d_sb = NULL; 1265 dentry->d_sb = NULL;
@@ -1269,6 +1308,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
1269 if (inode) 1308 if (inode)
1270 list_add(&dentry->d_alias, &inode->i_dentry); 1309 list_add(&dentry->d_alias, &inode->i_dentry);
1271 dentry->d_inode = inode; 1310 dentry->d_inode = inode;
1311 dentry_rcuwalk_barrier(dentry);
1272 spin_unlock(&dentry->d_lock); 1312 spin_unlock(&dentry->d_lock);
1273 fsnotify_d_instantiate(dentry, inode); 1313 fsnotify_d_instantiate(dentry, inode);
1274} 1314}
@@ -1611,6 +1651,111 @@ err_out:
1611EXPORT_SYMBOL(d_add_ci); 1651EXPORT_SYMBOL(d_add_ci);
1612 1652
1613/** 1653/**
1654 * __d_lookup_rcu - search for a dentry (racy, store-free)
1655 * @parent: parent dentry
1656 * @name: qstr of name we wish to find
1657 * @seq: returns d_seq value at the point where the dentry was found
1658 * @inode: returns dentry->d_inode when the inode was found valid.
1659 * Returns: dentry, or NULL
1660 *
1661 * __d_lookup_rcu is the dcache lookup function for rcu-walk name
1662 * resolution (store-free path walking) design described in
1663 * Documentation/filesystems/path-lookup.txt.
1664 *
1665 * This is not to be used outside core vfs.
1666 *
1667 * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
1668 * held, and rcu_read_lock held. The returned dentry must not be stored into
1669 * without taking d_lock and checking d_seq sequence count against @seq
1670 * returned here.
1671 *
1672 * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
1673 * function.
1674 *
1675 * Alternatively, __d_lookup_rcu may be called again to look up the child of
1676 * the returned dentry, so long as its parent's seqlock is checked after the
1677 * child is looked up. Thus, an interlocking stepping of sequence lock checks
1678 * is formed, giving integrity down the path walk.
1679 */
1680struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
1681 unsigned *seq, struct inode **inode)
1682{
1683 unsigned int len = name->len;
1684 unsigned int hash = name->hash;
1685 const unsigned char *str = name->name;
1686 struct hlist_head *head = d_hash(parent, hash);
1687 struct hlist_node *node;
1688 struct dentry *dentry;
1689
1690 /*
1691 * Note: There is significant duplication with __d_lookup_rcu which is
1692 * required to prevent single threaded performance regressions
1693 * especially on architectures where smp_rmb (in seqcounts) are costly.
1694 * Keep the two functions in sync.
1695 */
1696
1697 /*
1698 * The hash list is protected using RCU.
1699 *
1700 * Carefully use d_seq when comparing a candidate dentry, to avoid
1701 * races with d_move().
1702 *
1703 * It is possible that concurrent renames can mess up our list
1704 * walk here and result in missing our dentry, resulting in the
1705 * false-negative result. d_lookup() protects against concurrent
1706 * renames using rename_lock seqlock.
1707 *
1708 * See Documentation/vfs/dcache-locking.txt for more details.
1709 */
1710 hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
1711 struct inode *i;
1712 const char *tname;
1713 int tlen;
1714
1715 if (dentry->d_name.hash != hash)
1716 continue;
1717
1718seqretry:
1719 *seq = read_seqcount_begin(&dentry->d_seq);
1720 if (dentry->d_parent != parent)
1721 continue;
1722 if (d_unhashed(dentry))
1723 continue;
1724 tlen = dentry->d_name.len;
1725 tname = dentry->d_name.name;
1726 i = dentry->d_inode;
1727 /*
1728 * This seqcount check is required to ensure name and
1729 * len are loaded atomically, so as not to walk off the
1730 * edge of memory when walking. If we could load this
1731 * atomically some other way, we could drop this check.
1732 */
1733 if (read_seqcount_retry(&dentry->d_seq, *seq))
1734 goto seqretry;
1735 if (parent->d_op && parent->d_op->d_compare) {
1736 if (parent->d_op->d_compare(parent, *inode,
1737 dentry, i,
1738 tlen, tname, name))
1739 continue;
1740 } else {
1741 if (tlen != len)
1742 continue;
1743 if (memcmp(tname, str, tlen))
1744 continue;
1745 }
1746 /*
1747 * No extra seqcount check is required after the name
1748 * compare. The caller must perform a seqcount check in
1749 * order to do anything useful with the returned dentry
1750 * anyway.
1751 */
1752 *inode = i;
1753 return dentry;
1754 }
1755 return NULL;
1756}
1757
1758/**
1614 * d_lookup - search for a dentry 1759 * d_lookup - search for a dentry
1615 * @parent: parent dentry 1760 * @parent: parent dentry
1616 * @name: qstr of name we wish to find 1761 * @name: qstr of name we wish to find
@@ -1621,9 +1766,9 @@ EXPORT_SYMBOL(d_add_ci);
1621 * dentry is returned. The caller must use dput to free the entry when it has 1766 * dentry is returned. The caller must use dput to free the entry when it has
1622 * finished using it. %NULL is returned if the dentry does not exist. 1767 * finished using it. %NULL is returned if the dentry does not exist.
1623 */ 1768 */
1624struct dentry * d_lookup(struct dentry * parent, struct qstr * name) 1769struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
1625{ 1770{
1626 struct dentry * dentry = NULL; 1771 struct dentry *dentry;
1627 unsigned seq; 1772 unsigned seq;
1628 1773
1629 do { 1774 do {
@@ -1636,7 +1781,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1636} 1781}
1637EXPORT_SYMBOL(d_lookup); 1782EXPORT_SYMBOL(d_lookup);
1638 1783
1639/* 1784/**
1640 * __d_lookup - search for a dentry (racy) 1785 * __d_lookup - search for a dentry (racy)
1641 * @parent: parent dentry 1786 * @parent: parent dentry
1642 * @name: qstr of name we wish to find 1787 * @name: qstr of name we wish to find
@@ -1651,17 +1796,24 @@ EXPORT_SYMBOL(d_lookup);
1651 * 1796 *
1652 * __d_lookup callers must be commented. 1797 * __d_lookup callers must be commented.
1653 */ 1798 */
1654struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1799struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
1655{ 1800{
1656 unsigned int len = name->len; 1801 unsigned int len = name->len;
1657 unsigned int hash = name->hash; 1802 unsigned int hash = name->hash;
1658 const unsigned char *str = name->name; 1803 const unsigned char *str = name->name;
1659 struct hlist_head *head = d_hash(parent,hash); 1804 struct hlist_head *head = d_hash(parent,hash);
1660 struct dentry *found = NULL;
1661 struct hlist_node *node; 1805 struct hlist_node *node;
1806 struct dentry *found = NULL;
1662 struct dentry *dentry; 1807 struct dentry *dentry;
1663 1808
1664 /* 1809 /*
1810 * Note: There is significant duplication with __d_lookup_rcu which is
1811 * required to prevent single threaded performance regressions
1812 * especially on architectures where smp_rmb (in seqcounts) are costly.
1813 * Keep the two functions in sync.
1814 */
1815
1816 /*
1665 * The hash list is protected using RCU. 1817 * The hash list is protected using RCU.
1666 * 1818 *
1667 * Take d_lock when comparing a candidate dentry, to avoid races 1819 * Take d_lock when comparing a candidate dentry, to avoid races
@@ -1677,24 +1829,15 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1677 rcu_read_lock(); 1829 rcu_read_lock();
1678 1830
1679 hlist_for_each_entry_rcu(dentry, node, head, d_hash) { 1831 hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
1680 struct qstr *qstr; 1832 const char *tname;
1833 int tlen;
1681 1834
1682 if (dentry->d_name.hash != hash) 1835 if (dentry->d_name.hash != hash)
1683 continue; 1836 continue;
1684 if (dentry->d_parent != parent)
1685 continue;
1686 1837
1687 spin_lock(&dentry->d_lock); 1838 spin_lock(&dentry->d_lock);
1688
1689 /*
1690 * Recheck the dentry after taking the lock - d_move may have
1691 * changed things. Don't bother checking the hash because
1692 * we're about to compare the whole name anyway.
1693 */
1694 if (dentry->d_parent != parent) 1839 if (dentry->d_parent != parent)
1695 goto next; 1840 goto next;
1696
1697 /* non-existing due to RCU? */
1698 if (d_unhashed(dentry)) 1841 if (d_unhashed(dentry))
1699 goto next; 1842 goto next;
1700 1843
@@ -1702,16 +1845,17 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1702 * It is safe to compare names since d_move() cannot 1845 * It is safe to compare names since d_move() cannot
1703 * change the qstr (protected by d_lock). 1846 * change the qstr (protected by d_lock).
1704 */ 1847 */
1705 qstr = &dentry->d_name; 1848 tlen = dentry->d_name.len;
1849 tname = dentry->d_name.name;
1706 if (parent->d_op && parent->d_op->d_compare) { 1850 if (parent->d_op && parent->d_op->d_compare) {
1707 if (parent->d_op->d_compare(parent, parent->d_inode, 1851 if (parent->d_op->d_compare(parent, parent->d_inode,
1708 dentry, dentry->d_inode, 1852 dentry, dentry->d_inode,
1709 qstr->len, qstr->name, name)) 1853 tlen, tname, name))
1710 goto next; 1854 goto next;
1711 } else { 1855 } else {
1712 if (qstr->len != len) 1856 if (tlen != len)
1713 goto next; 1857 goto next;
1714 if (memcmp(qstr->name, str, len)) 1858 if (memcmp(tname, str, tlen))
1715 goto next; 1859 goto next;
1716 } 1860 }
1717 1861
@@ -1821,7 +1965,7 @@ again:
1821 goto again; 1965 goto again;
1822 } 1966 }
1823 dentry->d_flags &= ~DCACHE_CANT_MOUNT; 1967 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
1824 dentry_iput(dentry); 1968 dentry_unlink_inode(dentry);
1825 fsnotify_nameremove(dentry, isdir); 1969 fsnotify_nameremove(dentry, isdir);
1826 return; 1970 return;
1827 } 1971 }
@@ -1884,7 +2028,9 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
1884 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ 2028 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
1885 2029
1886 spin_lock(&dentry->d_lock); 2030 spin_lock(&dentry->d_lock);
2031 write_seqcount_begin(&dentry->d_seq);
1887 memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); 2032 memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
2033 write_seqcount_end(&dentry->d_seq);
1888 spin_unlock(&dentry->d_lock); 2034 spin_unlock(&dentry->d_lock);
1889} 2035}
1890EXPORT_SYMBOL(dentry_update_name_case); 2036EXPORT_SYMBOL(dentry_update_name_case);
@@ -1997,6 +2143,9 @@ void d_move(struct dentry * dentry, struct dentry * target)
1997 2143
1998 dentry_lock_for_move(dentry, target); 2144 dentry_lock_for_move(dentry, target);
1999 2145
2146 write_seqcount_begin(&dentry->d_seq);
2147 write_seqcount_begin(&target->d_seq);
2148
2000 /* Move the dentry to the target hash queue, if on different bucket */ 2149 /* Move the dentry to the target hash queue, if on different bucket */
2001 spin_lock(&dcache_hash_lock); 2150 spin_lock(&dcache_hash_lock);
2002 if (!d_unhashed(dentry)) 2151 if (!d_unhashed(dentry))
@@ -2005,6 +2154,7 @@ void d_move(struct dentry * dentry, struct dentry * target)
2005 spin_unlock(&dcache_hash_lock); 2154 spin_unlock(&dcache_hash_lock);
2006 2155
2007 /* Unhash the target: dput() will then get rid of it */ 2156 /* Unhash the target: dput() will then get rid of it */
2157 /* __d_drop does write_seqcount_barrier, but they're OK to nest. */
2008 __d_drop(target); 2158 __d_drop(target);
2009 2159
2010 list_del(&dentry->d_u.d_child); 2160 list_del(&dentry->d_u.d_child);
@@ -2028,6 +2178,9 @@ void d_move(struct dentry * dentry, struct dentry * target)
2028 2178
2029 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); 2179 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2030 2180
2181 write_seqcount_end(&target->d_seq);
2182 write_seqcount_end(&dentry->d_seq);
2183
2031 dentry_unlock_parents_for_move(dentry, target); 2184 dentry_unlock_parents_for_move(dentry, target);
2032 spin_unlock(&target->d_lock); 2185 spin_unlock(&target->d_lock);
2033 fsnotify_d_move(dentry); 2186 fsnotify_d_move(dentry);
@@ -2110,6 +2263,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
2110 2263
2111 dentry_lock_for_move(anon, dentry); 2264 dentry_lock_for_move(anon, dentry);
2112 2265
2266 write_seqcount_begin(&dentry->d_seq);
2267 write_seqcount_begin(&anon->d_seq);
2268
2113 dparent = dentry->d_parent; 2269 dparent = dentry->d_parent;
2114 aparent = anon->d_parent; 2270 aparent = anon->d_parent;
2115 2271
@@ -2130,6 +2286,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
2130 else 2286 else
2131 INIT_LIST_HEAD(&anon->d_u.d_child); 2287 INIT_LIST_HEAD(&anon->d_u.d_child);
2132 2288
2289 write_seqcount_end(&dentry->d_seq);
2290 write_seqcount_end(&anon->d_seq);
2291
2133 dentry_unlock_parents_for_move(anon, dentry); 2292 dentry_unlock_parents_for_move(anon, dentry);
2134 spin_unlock(&dentry->d_lock); 2293 spin_unlock(&dentry->d_lock);
2135 2294