aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorNick Piggin <npiggin@kernel.dk>2011-01-07 01:49:52 -0500
committerNick Piggin <npiggin@kernel.dk>2011-01-07 01:50:27 -0500
commit31e6b01f4183ff419a6d1f86177cbf4662347cec (patch)
treee215ec9af88352c55e024f784f3d9f8eb13fab85 /fs
parent3c22cd5709e8143444a6d08682a87f4c57902df3 (diff)
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the ancestor dentry elements. This is called rcu-walk, as opposed to the current algorithm which is a refcount based walk, or ref-walk. This results in far fewer atomic operations on every path element, significantly improving path lookup performance. It also avoids cacheline bouncing on common dentries, significantly improving scalability. The overall design is like this: * LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk. * Take the RCU lock for the entire path walk, starting with the acquiring of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are not required for dentry persistence. * synchronize_rcu is called when unregistering a filesystem, so we can access d_ops and i_ops during rcu-walk. * Similarly take the vfsmount lock for the entire path walk. So now mnt refcounts are not required for persistence. Also we are free to perform mount lookups, and to assume dentry mount points and mount roots are stable up and down the path. * Have a per-dentry seqlock to protect the dentry name, parent, and inode, so we can load this tuple atomically, and also check whether any of its members have changed. * Dentry lookups (based on parent, candidate string tuple) recheck the parent sequence after the child is found in case anything changed in the parent during the path walk. * inode is also RCU protected so we can load d_inode and use the inode for limited things. * i_mode, i_uid, i_gid can be tested for exec permissions during path walk. * i_op can be loaded. When we reach the destination dentry, we lock it, recheck lookup sequence, and increment its refcount and mountpoint refcount. RCU and vfsmount locks are dropped. This is termed "dropping rcu-walk". If the dentry refcount does not match, we cannot drop rcu-walk gracefully at the current point in the lookup, so instead return -ECHILD (for want of a better errno). 
This signals the path walking code to re-do the entire lookup with a ref-walk. Aside from the final dentry, there are other situations that may be encountered where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take a reference on the last good dentry) and continue with a ref-walk. Again, if we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup using ref-walk. But it is very important that we can continue with ref-walk for most cases, particularly to avoid the overhead of double lookups, and to gain the scalability advantages on common path elements (like cwd and root). The cases where rcu-walk cannot continue are: * NULL dentry (ie. any uncached path element) * parent with d_inode->i_op->permission or ACLs * dentries with d_revalidate * Following links In future patches, permission checks and d_revalidate become rcu-walk aware. It may be possible eventually to make following links rcu-walk aware. Uncached path elements will always require dropping to ref-walk mode, at the very least because i_mutex needs to be grabbed, and objects allocated. Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Diffstat (limited to 'fs')
-rw-r--r--fs/dcache.c203
-rw-r--r--fs/filesystems.c3
-rw-r--r--fs/namei.c743
-rw-r--r--fs/proc/proc_sysctl.c4
4 files changed, 794 insertions, 159 deletions
diff --git a/fs/dcache.c b/fs/dcache.c
index dc0551c9755d..187fea040108 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -152,9 +152,23 @@ static void d_free(struct dentry *dentry)
152 call_rcu(&dentry->d_u.d_rcu, __d_free); 152 call_rcu(&dentry->d_u.d_rcu, __d_free);
153} 153}
154 154
155/**
156 * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
157 * After this call, in-progress rcu-walk path lookup will fail. This
158 * should be called after unhashing, and after changing d_inode (if
159 * the dentry has not already been unhashed).
160 */
161static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
162{
163 assert_spin_locked(&dentry->d_lock);
164 /* Go through a barrier */
165 write_seqcount_barrier(&dentry->d_seq);
166}
167
155/* 168/*
156 * Release the dentry's inode, using the filesystem 169 * Release the dentry's inode, using the filesystem
157 * d_iput() operation if defined. 170 * d_iput() operation if defined. Dentry has no refcount
171 * and is unhashed.
158 */ 172 */
159static void dentry_iput(struct dentry * dentry) 173static void dentry_iput(struct dentry * dentry)
160 __releases(dentry->d_lock) 174 __releases(dentry->d_lock)
@@ -179,6 +193,28 @@ static void dentry_iput(struct dentry * dentry)
179} 193}
180 194
181/* 195/*
196 * Release the dentry's inode, using the filesystem
197 * d_iput() operation if defined. dentry remains in-use.
198 */
199static void dentry_unlink_inode(struct dentry * dentry)
200 __releases(dentry->d_lock)
201 __releases(dcache_inode_lock)
202{
203 struct inode *inode = dentry->d_inode;
204 dentry->d_inode = NULL;
205 list_del_init(&dentry->d_alias);
206 dentry_rcuwalk_barrier(dentry);
207 spin_unlock(&dentry->d_lock);
208 spin_unlock(&dcache_inode_lock);
209 if (!inode->i_nlink)
210 fsnotify_inoderemove(inode);
211 if (dentry->d_op && dentry->d_op->d_iput)
212 dentry->d_op->d_iput(dentry, inode);
213 else
214 iput(inode);
215}
216
217/*
182 * dentry_lru_(add|del|move_tail) must be called with d_lock held. 218 * dentry_lru_(add|del|move_tail) must be called with d_lock held.
183 */ 219 */
184static void dentry_lru_add(struct dentry *dentry) 220static void dentry_lru_add(struct dentry *dentry)
@@ -272,6 +308,7 @@ void __d_drop(struct dentry *dentry)
272 spin_lock(&dcache_hash_lock); 308 spin_lock(&dcache_hash_lock);
273 hlist_del_rcu(&dentry->d_hash); 309 hlist_del_rcu(&dentry->d_hash);
274 spin_unlock(&dcache_hash_lock); 310 spin_unlock(&dcache_hash_lock);
311 dentry_rcuwalk_barrier(dentry);
275 } 312 }
276} 313}
277EXPORT_SYMBOL(__d_drop); 314EXPORT_SYMBOL(__d_drop);
@@ -309,6 +346,7 @@ relock:
309 spin_unlock(&dcache_inode_lock); 346 spin_unlock(&dcache_inode_lock);
310 goto relock; 347 goto relock;
311 } 348 }
349
312 if (ref) 350 if (ref)
313 dentry->d_count--; 351 dentry->d_count--;
314 /* if dentry was on the d_lru list delete it from there */ 352 /* if dentry was on the d_lru list delete it from there */
@@ -1221,6 +1259,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
1221 dentry->d_count = 1; 1259 dentry->d_count = 1;
1222 dentry->d_flags = DCACHE_UNHASHED; 1260 dentry->d_flags = DCACHE_UNHASHED;
1223 spin_lock_init(&dentry->d_lock); 1261 spin_lock_init(&dentry->d_lock);
1262 seqcount_init(&dentry->d_seq);
1224 dentry->d_inode = NULL; 1263 dentry->d_inode = NULL;
1225 dentry->d_parent = NULL; 1264 dentry->d_parent = NULL;
1226 dentry->d_sb = NULL; 1265 dentry->d_sb = NULL;
@@ -1269,6 +1308,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
1269 if (inode) 1308 if (inode)
1270 list_add(&dentry->d_alias, &inode->i_dentry); 1309 list_add(&dentry->d_alias, &inode->i_dentry);
1271 dentry->d_inode = inode; 1310 dentry->d_inode = inode;
1311 dentry_rcuwalk_barrier(dentry);
1272 spin_unlock(&dentry->d_lock); 1312 spin_unlock(&dentry->d_lock);
1273 fsnotify_d_instantiate(dentry, inode); 1313 fsnotify_d_instantiate(dentry, inode);
1274} 1314}
@@ -1611,6 +1651,111 @@ err_out:
1611EXPORT_SYMBOL(d_add_ci); 1651EXPORT_SYMBOL(d_add_ci);
1612 1652
1613/** 1653/**
1654 * __d_lookup_rcu - search for a dentry (racy, store-free)
1655 * @parent: parent dentry
1656 * @name: qstr of name we wish to find
1657 * @seq: returns d_seq value at the point where the dentry was found
1658 * @inode: returns dentry->d_inode when the inode was found valid.
1659 * Returns: dentry, or NULL
1660 *
1661 * __d_lookup_rcu is the dcache lookup function for rcu-walk name
1662 * resolution (store-free path walking) design described in
1663 * Documentation/filesystems/path-lookup.txt.
1664 *
1665 * This is not to be used outside core vfs.
1666 *
1667 * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
1668 * held, and rcu_read_lock held. The returned dentry must not be stored into
1669 * without taking d_lock and checking d_seq sequence count against @seq
1670 * returned here.
1671 *
1672 * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
1673 * function.
1674 *
1675 * Alternatively, __d_lookup_rcu may be called again to look up the child of
1676 * the returned dentry, so long as its parent's seqlock is checked after the
1677 * child is looked up. Thus, an interlocking stepping of sequence lock checks
1678 * is formed, giving integrity down the path walk.
1679 */
1680struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
1681 unsigned *seq, struct inode **inode)
1682{
1683 unsigned int len = name->len;
1684 unsigned int hash = name->hash;
1685 const unsigned char *str = name->name;
1686 struct hlist_head *head = d_hash(parent, hash);
1687 struct hlist_node *node;
1688 struct dentry *dentry;
1689
1690 /*
1691 * Note: There is significant duplication with __d_lookup which is
1692 * required to prevent single threaded performance regressions
1693 * especially on architectures where smp_rmb (in seqcounts) are costly.
1694 * Keep the two functions in sync.
1695 */
1696
1697 /*
1698 * The hash list is protected using RCU.
1699 *
1700 * Carefully use d_seq when comparing a candidate dentry, to avoid
1701 * races with d_move().
1702 *
1703 * It is possible that concurrent renames can mess up our list
1704 * walk here and result in missing our dentry, resulting in the
1705 * false-negative result. d_lookup() protects against concurrent
1706 * renames using rename_lock seqlock.
1707 *
1708 * See Documentation/vfs/dcache-locking.txt for more details.
1709 */
1710 hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
1711 struct inode *i;
1712 const char *tname;
1713 int tlen;
1714
1715 if (dentry->d_name.hash != hash)
1716 continue;
1717
1718seqretry:
1719 *seq = read_seqcount_begin(&dentry->d_seq);
1720 if (dentry->d_parent != parent)
1721 continue;
1722 if (d_unhashed(dentry))
1723 continue;
1724 tlen = dentry->d_name.len;
1725 tname = dentry->d_name.name;
1726 i = dentry->d_inode;
1727 /*
1728 * This seqcount check is required to ensure name and
1729 * len are loaded atomically, so as not to walk off the
1730 * edge of memory when walking. If we could load this
1731 * atomically some other way, we could drop this check.
1732 */
1733 if (read_seqcount_retry(&dentry->d_seq, *seq))
1734 goto seqretry;
1735 if (parent->d_op && parent->d_op->d_compare) {
1736 if (parent->d_op->d_compare(parent, *inode,
1737 dentry, i,
1738 tlen, tname, name))
1739 continue;
1740 } else {
1741 if (tlen != len)
1742 continue;
1743 if (memcmp(tname, str, tlen))
1744 continue;
1745 }
1746 /*
1747 * No extra seqcount check is required after the name
1748 * compare. The caller must perform a seqcount check in
1749 * order to do anything useful with the returned dentry
1750 * anyway.
1751 */
1752 *inode = i;
1753 return dentry;
1754 }
1755 return NULL;
1756}
1757
1758/**
1614 * d_lookup - search for a dentry 1759 * d_lookup - search for a dentry
1615 * @parent: parent dentry 1760 * @parent: parent dentry
1616 * @name: qstr of name we wish to find 1761 * @name: qstr of name we wish to find
@@ -1621,9 +1766,9 @@ EXPORT_SYMBOL(d_add_ci);
1621 * dentry is returned. The caller must use dput to free the entry when it has 1766 * dentry is returned. The caller must use dput to free the entry when it has
1622 * finished using it. %NULL is returned if the dentry does not exist. 1767 * finished using it. %NULL is returned if the dentry does not exist.
1623 */ 1768 */
1624struct dentry * d_lookup(struct dentry * parent, struct qstr * name) 1769struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
1625{ 1770{
1626 struct dentry * dentry = NULL; 1771 struct dentry *dentry;
1627 unsigned seq; 1772 unsigned seq;
1628 1773
1629 do { 1774 do {
@@ -1636,7 +1781,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1636} 1781}
1637EXPORT_SYMBOL(d_lookup); 1782EXPORT_SYMBOL(d_lookup);
1638 1783
1639/* 1784/**
1640 * __d_lookup - search for a dentry (racy) 1785 * __d_lookup - search for a dentry (racy)
1641 * @parent: parent dentry 1786 * @parent: parent dentry
1642 * @name: qstr of name we wish to find 1787 * @name: qstr of name we wish to find
@@ -1651,17 +1796,24 @@ EXPORT_SYMBOL(d_lookup);
1651 * 1796 *
1652 * __d_lookup callers must be commented. 1797 * __d_lookup callers must be commented.
1653 */ 1798 */
1654struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1799struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
1655{ 1800{
1656 unsigned int len = name->len; 1801 unsigned int len = name->len;
1657 unsigned int hash = name->hash; 1802 unsigned int hash = name->hash;
1658 const unsigned char *str = name->name; 1803 const unsigned char *str = name->name;
1659 struct hlist_head *head = d_hash(parent,hash); 1804 struct hlist_head *head = d_hash(parent,hash);
1660 struct dentry *found = NULL;
1661 struct hlist_node *node; 1805 struct hlist_node *node;
1806 struct dentry *found = NULL;
1662 struct dentry *dentry; 1807 struct dentry *dentry;
1663 1808
1664 /* 1809 /*
1810 * Note: There is significant duplication with __d_lookup_rcu which is
1811 * required to prevent single threaded performance regressions
1812 * especially on architectures where smp_rmb (in seqcounts) are costly.
1813 * Keep the two functions in sync.
1814 */
1815
1816 /*
1665 * The hash list is protected using RCU. 1817 * The hash list is protected using RCU.
1666 * 1818 *
1667 * Take d_lock when comparing a candidate dentry, to avoid races 1819 * Take d_lock when comparing a candidate dentry, to avoid races
@@ -1677,24 +1829,15 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1677 rcu_read_lock(); 1829 rcu_read_lock();
1678 1830
1679 hlist_for_each_entry_rcu(dentry, node, head, d_hash) { 1831 hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
1680 struct qstr *qstr; 1832 const char *tname;
1833 int tlen;
1681 1834
1682 if (dentry->d_name.hash != hash) 1835 if (dentry->d_name.hash != hash)
1683 continue; 1836 continue;
1684 if (dentry->d_parent != parent)
1685 continue;
1686 1837
1687 spin_lock(&dentry->d_lock); 1838 spin_lock(&dentry->d_lock);
1688
1689 /*
1690 * Recheck the dentry after taking the lock - d_move may have
1691 * changed things. Don't bother checking the hash because
1692 * we're about to compare the whole name anyway.
1693 */
1694 if (dentry->d_parent != parent) 1839 if (dentry->d_parent != parent)
1695 goto next; 1840 goto next;
1696
1697 /* non-existing due to RCU? */
1698 if (d_unhashed(dentry)) 1841 if (d_unhashed(dentry))
1699 goto next; 1842 goto next;
1700 1843
@@ -1702,16 +1845,17 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1702 * It is safe to compare names since d_move() cannot 1845 * It is safe to compare names since d_move() cannot
1703 * change the qstr (protected by d_lock). 1846 * change the qstr (protected by d_lock).
1704 */ 1847 */
1705 qstr = &dentry->d_name; 1848 tlen = dentry->d_name.len;
1849 tname = dentry->d_name.name;
1706 if (parent->d_op && parent->d_op->d_compare) { 1850 if (parent->d_op && parent->d_op->d_compare) {
1707 if (parent->d_op->d_compare(parent, parent->d_inode, 1851 if (parent->d_op->d_compare(parent, parent->d_inode,
1708 dentry, dentry->d_inode, 1852 dentry, dentry->d_inode,
1709 qstr->len, qstr->name, name)) 1853 tlen, tname, name))
1710 goto next; 1854 goto next;
1711 } else { 1855 } else {
1712 if (qstr->len != len) 1856 if (tlen != len)
1713 goto next; 1857 goto next;
1714 if (memcmp(qstr->name, str, len)) 1858 if (memcmp(tname, str, tlen))
1715 goto next; 1859 goto next;
1716 } 1860 }
1717 1861
@@ -1821,7 +1965,7 @@ again:
1821 goto again; 1965 goto again;
1822 } 1966 }
1823 dentry->d_flags &= ~DCACHE_CANT_MOUNT; 1967 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
1824 dentry_iput(dentry); 1968 dentry_unlink_inode(dentry);
1825 fsnotify_nameremove(dentry, isdir); 1969 fsnotify_nameremove(dentry, isdir);
1826 return; 1970 return;
1827 } 1971 }
@@ -1884,7 +2028,9 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
1884 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ 2028 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
1885 2029
1886 spin_lock(&dentry->d_lock); 2030 spin_lock(&dentry->d_lock);
2031 write_seqcount_begin(&dentry->d_seq);
1887 memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); 2032 memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
2033 write_seqcount_end(&dentry->d_seq);
1888 spin_unlock(&dentry->d_lock); 2034 spin_unlock(&dentry->d_lock);
1889} 2035}
1890EXPORT_SYMBOL(dentry_update_name_case); 2036EXPORT_SYMBOL(dentry_update_name_case);
@@ -1997,6 +2143,9 @@ void d_move(struct dentry * dentry, struct dentry * target)
1997 2143
1998 dentry_lock_for_move(dentry, target); 2144 dentry_lock_for_move(dentry, target);
1999 2145
2146 write_seqcount_begin(&dentry->d_seq);
2147 write_seqcount_begin(&target->d_seq);
2148
2000 /* Move the dentry to the target hash queue, if on different bucket */ 2149 /* Move the dentry to the target hash queue, if on different bucket */
2001 spin_lock(&dcache_hash_lock); 2150 spin_lock(&dcache_hash_lock);
2002 if (!d_unhashed(dentry)) 2151 if (!d_unhashed(dentry))
@@ -2005,6 +2154,7 @@ void d_move(struct dentry * dentry, struct dentry * target)
2005 spin_unlock(&dcache_hash_lock); 2154 spin_unlock(&dcache_hash_lock);
2006 2155
2007 /* Unhash the target: dput() will then get rid of it */ 2156 /* Unhash the target: dput() will then get rid of it */
2157 /* __d_drop does write_seqcount_barrier, but they're OK to nest. */
2008 __d_drop(target); 2158 __d_drop(target);
2009 2159
2010 list_del(&dentry->d_u.d_child); 2160 list_del(&dentry->d_u.d_child);
@@ -2028,6 +2178,9 @@ void d_move(struct dentry * dentry, struct dentry * target)
2028 2178
2029 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); 2179 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2030 2180
2181 write_seqcount_end(&target->d_seq);
2182 write_seqcount_end(&dentry->d_seq);
2183
2031 dentry_unlock_parents_for_move(dentry, target); 2184 dentry_unlock_parents_for_move(dentry, target);
2032 spin_unlock(&target->d_lock); 2185 spin_unlock(&target->d_lock);
2033 fsnotify_d_move(dentry); 2186 fsnotify_d_move(dentry);
@@ -2110,6 +2263,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
2110 2263
2111 dentry_lock_for_move(anon, dentry); 2264 dentry_lock_for_move(anon, dentry);
2112 2265
2266 write_seqcount_begin(&dentry->d_seq);
2267 write_seqcount_begin(&anon->d_seq);
2268
2113 dparent = dentry->d_parent; 2269 dparent = dentry->d_parent;
2114 aparent = anon->d_parent; 2270 aparent = anon->d_parent;
2115 2271
@@ -2130,6 +2286,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
2130 else 2286 else
2131 INIT_LIST_HEAD(&anon->d_u.d_child); 2287 INIT_LIST_HEAD(&anon->d_u.d_child);
2132 2288
2289 write_seqcount_end(&dentry->d_seq);
2290 write_seqcount_end(&anon->d_seq);
2291
2133 dentry_unlock_parents_for_move(anon, dentry); 2292 dentry_unlock_parents_for_move(anon, dentry);
2134 spin_unlock(&dentry->d_lock); 2293 spin_unlock(&dentry->d_lock);
2135 2294
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 68ba492d8eef..751d6b255a12 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -115,6 +115,9 @@ int unregister_filesystem(struct file_system_type * fs)
115 tmp = &(*tmp)->next; 115 tmp = &(*tmp)->next;
116 } 116 }
117 write_unlock(&file_systems_lock); 117 write_unlock(&file_systems_lock);
118
119 synchronize_rcu();
120
118 return -EINVAL; 121 return -EINVAL;
119} 122}
120 123
diff --git a/fs/namei.c b/fs/namei.c
index 5642bc2be418..8d3f15b3a541 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname);
169/* 169/*
170 * This does basic POSIX ACL permission checking 170 * This does basic POSIX ACL permission checking
171 */ 171 */
172static int acl_permission_check(struct inode *inode, int mask, 172static inline int __acl_permission_check(struct inode *inode, int mask,
173 int (*check_acl)(struct inode *inode, int mask)) 173 int (*check_acl)(struct inode *inode, int mask), int rcu)
174{ 174{
175 umode_t mode = inode->i_mode; 175 umode_t mode = inode->i_mode;
176 176
@@ -180,9 +180,13 @@ static int acl_permission_check(struct inode *inode, int mask,
180 mode >>= 6; 180 mode >>= 6;
181 else { 181 else {
182 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { 182 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
183 int error = check_acl(inode, mask); 183 if (rcu) {
184 if (error != -EAGAIN) 184 return -ECHILD;
185 return error; 185 } else {
186 int error = check_acl(inode, mask);
187 if (error != -EAGAIN)
188 return error;
189 }
186 } 190 }
187 191
188 if (in_group_p(inode->i_gid)) 192 if (in_group_p(inode->i_gid))
@@ -197,6 +201,12 @@ static int acl_permission_check(struct inode *inode, int mask,
197 return -EACCES; 201 return -EACCES;
198} 202}
199 203
204static inline int acl_permission_check(struct inode *inode, int mask,
205 int (*check_acl)(struct inode *inode, int mask))
206{
207 return __acl_permission_check(inode, mask, check_acl, 0);
208}
209
200/** 210/**
201 * generic_permission - check for access rights on a Posix-like filesystem 211 * generic_permission - check for access rights on a Posix-like filesystem
202 * @inode: inode to check access rights for 212 * @inode: inode to check access rights for
@@ -375,6 +385,173 @@ void path_put(struct path *path)
375EXPORT_SYMBOL(path_put); 385EXPORT_SYMBOL(path_put);
376 386
377/** 387/**
388 * nameidata_drop_rcu - drop this nameidata out of rcu-walk
389 * @nd: nameidata pathwalk data to drop
390 * @Returns: 0 on success, -ECHILD on failure
391 *
392 * Path walking has 2 modes, rcu-walk and ref-walk (see
393 * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
394 * to drop out of rcu-walk mode and take normal reference counts on dentries
395 * and vfsmounts to transition to ref-walk mode. __drop_rcu* functions take
396 * refcounts at the last known good point before rcu-walk got stuck, so
397 * ref-walk may continue from there. If this is not successful (eg. a seqcount
398 * has changed), then failure is returned and path walk restarts from the
399 * beginning in ref-walk mode.
400 *
401 * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
402 * ref-walk. Must be called from rcu-walk context.
403 */
404static int nameidata_drop_rcu(struct nameidata *nd)
405{
406 struct fs_struct *fs = current->fs;
407 struct dentry *dentry = nd->path.dentry;
408
409 BUG_ON(!(nd->flags & LOOKUP_RCU));
410 if (nd->root.mnt) {
411 spin_lock(&fs->lock);
412 if (nd->root.mnt != fs->root.mnt ||
413 nd->root.dentry != fs->root.dentry)
414 goto err_root;
415 }
416 spin_lock(&dentry->d_lock);
417 if (!__d_rcu_to_refcount(dentry, nd->seq))
418 goto err;
419 BUG_ON(nd->inode != dentry->d_inode);
420 spin_unlock(&dentry->d_lock);
421 if (nd->root.mnt) {
422 path_get(&nd->root);
423 spin_unlock(&fs->lock);
424 }
425 mntget(nd->path.mnt);
426
427 rcu_read_unlock();
428 br_read_unlock(vfsmount_lock);
429 nd->flags &= ~LOOKUP_RCU;
430 return 0;
431err:
432 spin_unlock(&dentry->d_lock);
433err_root:
434 if (nd->root.mnt)
435 spin_unlock(&fs->lock);
436 return -ECHILD;
437}
438
439/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
440static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
441{
442 if (nd->flags & LOOKUP_RCU)
443 return nameidata_drop_rcu(nd);
444 return 0;
445}
446
447/**
448 * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
449 * @nd: nameidata pathwalk data to drop
450 * @dentry: dentry to drop
451 * @Returns: 0 on success, -ECHILD on failure
452 *
453 * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
454 * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
455 * @nd. Must be called from rcu-walk context.
456 */
457static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
458{
459 struct fs_struct *fs = current->fs;
460 struct dentry *parent = nd->path.dentry;
461
462 BUG_ON(!(nd->flags & LOOKUP_RCU));
463 if (nd->root.mnt) {
464 spin_lock(&fs->lock);
465 if (nd->root.mnt != fs->root.mnt ||
466 nd->root.dentry != fs->root.dentry)
467 goto err_root;
468 }
469 spin_lock(&parent->d_lock);
470 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
471 if (!__d_rcu_to_refcount(dentry, nd->seq))
472 goto err;
473 /*
474 * If the sequence check on the child dentry passed, then the child has
475 * not been removed from its parent. This means the parent dentry must
476 * be valid and able to take a reference at this point.
477 */
478 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
479 BUG_ON(!parent->d_count);
480 parent->d_count++;
481 spin_unlock(&dentry->d_lock);
482 spin_unlock(&parent->d_lock);
483 if (nd->root.mnt) {
484 path_get(&nd->root);
485 spin_unlock(&fs->lock);
486 }
487 mntget(nd->path.mnt);
488
489 rcu_read_unlock();
490 br_read_unlock(vfsmount_lock);
491 nd->flags &= ~LOOKUP_RCU;
492 return 0;
493err:
494 spin_unlock(&dentry->d_lock);
495 spin_unlock(&parent->d_lock);
496err_root:
497 if (nd->root.mnt)
498 spin_unlock(&fs->lock);
499 return -ECHILD;
500}
501
502/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
503static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
504{
505 if (nd->flags & LOOKUP_RCU)
506 return nameidata_dentry_drop_rcu(nd, dentry);
507 return 0;
508}
509
510/**
511 * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
512 * @nd: nameidata pathwalk data to drop
513 * @Returns: 0 on success, -ECHILD on failure
514 *
515 * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
516 * nd->path should be the final element of the lookup, so nd->root is discarded.
517 * Must be called from rcu-walk context.
518 */
519static int nameidata_drop_rcu_last(struct nameidata *nd)
520{
521 struct dentry *dentry = nd->path.dentry;
522
523 BUG_ON(!(nd->flags & LOOKUP_RCU));
524 nd->flags &= ~LOOKUP_RCU;
525 nd->root.mnt = NULL;
526 spin_lock(&dentry->d_lock);
527 if (!__d_rcu_to_refcount(dentry, nd->seq))
528 goto err_unlock;
529 BUG_ON(nd->inode != dentry->d_inode);
530 spin_unlock(&dentry->d_lock);
531
532 mntget(nd->path.mnt);
533
534 rcu_read_unlock();
535 br_read_unlock(vfsmount_lock);
536
537 return 0;
538
539err_unlock:
540 spin_unlock(&dentry->d_lock);
541 rcu_read_unlock();
542 br_read_unlock(vfsmount_lock);
543 return -ECHILD;
544}
545
546/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
547static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
548{
549 if (likely(nd->flags & LOOKUP_RCU))
550 return nameidata_drop_rcu_last(nd);
551 return 0;
552}
553
554/**
378 * release_open_intent - free up open intent resources 555 * release_open_intent - free up open intent resources
379 * @nd: pointer to nameidata 556 * @nd: pointer to nameidata
380 */ 557 */
@@ -459,26 +636,40 @@ force_reval_path(struct path *path, struct nameidata *nd)
459 * short-cut DAC fails, then call ->permission() to do more 636 * short-cut DAC fails, then call ->permission() to do more
460 * complete permission check. 637 * complete permission check.
461 */ 638 */
462static int exec_permission(struct inode *inode) 639static inline int __exec_permission(struct inode *inode, int rcu)
463{ 640{
464 int ret; 641 int ret;
465 642
466 if (inode->i_op->permission) { 643 if (inode->i_op->permission) {
644 if (rcu)
645 return -ECHILD;
467 ret = inode->i_op->permission(inode, MAY_EXEC); 646 ret = inode->i_op->permission(inode, MAY_EXEC);
468 if (!ret) 647 if (!ret)
469 goto ok; 648 goto ok;
470 return ret; 649 return ret;
471 } 650 }
472 ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); 651 ret = __acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl, rcu);
473 if (!ret) 652 if (!ret)
474 goto ok; 653 goto ok;
654 if (rcu && ret == -ECHILD)
655 return ret;
475 656
476 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) 657 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
477 goto ok; 658 goto ok;
478 659
479 return ret; 660 return ret;
480ok: 661ok:
481 return security_inode_permission(inode, MAY_EXEC); 662 return security_inode_exec_permission(inode, rcu);
663}
664
665static int exec_permission(struct inode *inode)
666{
667 return __exec_permission(inode, 0);
668}
669
670static int exec_permission_rcu(struct inode *inode)
671{
672 return __exec_permission(inode, 1);
482} 673}
483 674
484static __always_inline void set_root(struct nameidata *nd) 675static __always_inline void set_root(struct nameidata *nd)
@@ -489,8 +680,20 @@ static __always_inline void set_root(struct nameidata *nd)
489 680
490static int link_path_walk(const char *, struct nameidata *); 681static int link_path_walk(const char *, struct nameidata *);
491 682
683static __always_inline void set_root_rcu(struct nameidata *nd)
684{
685 if (!nd->root.mnt) {
686 struct fs_struct *fs = current->fs;
687 spin_lock(&fs->lock);
688 nd->root = fs->root;
689 spin_unlock(&fs->lock);
690 }
691}
692
492static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 693static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
493{ 694{
695 int ret;
696
494 if (IS_ERR(link)) 697 if (IS_ERR(link))
495 goto fail; 698 goto fail;
496 699
@@ -500,8 +703,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
500 nd->path = nd->root; 703 nd->path = nd->root;
501 path_get(&nd->root); 704 path_get(&nd->root);
502 } 705 }
706 nd->inode = nd->path.dentry->d_inode;
503 707
504 return link_path_walk(link, nd); 708 ret = link_path_walk(link, nd);
709 return ret;
505fail: 710fail:
506 path_put(&nd->path); 711 path_put(&nd->path);
507 return PTR_ERR(link); 712 return PTR_ERR(link);
@@ -516,11 +721,12 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
516 721
517static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 722static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
518{ 723{
519 dput(nd->path.dentry); 724 if (!(nd->flags & LOOKUP_RCU)) {
520 if (nd->path.mnt != path->mnt) { 725 dput(nd->path.dentry);
521 mntput(nd->path.mnt); 726 if (nd->path.mnt != path->mnt)
522 nd->path.mnt = path->mnt; 727 mntput(nd->path.mnt);
523 } 728 }
729 nd->path.mnt = path->mnt;
524 nd->path.dentry = path->dentry; 730 nd->path.dentry = path->dentry;
525} 731}
526 732
@@ -535,9 +741,11 @@ __do_follow_link(struct path *path, struct nameidata *nd, void **p)
535 741
536 if (path->mnt != nd->path.mnt) { 742 if (path->mnt != nd->path.mnt) {
537 path_to_nameidata(path, nd); 743 path_to_nameidata(path, nd);
744 nd->inode = nd->path.dentry->d_inode;
538 dget(dentry); 745 dget(dentry);
539 } 746 }
540 mntget(path->mnt); 747 mntget(path->mnt);
748
541 nd->last_type = LAST_BIND; 749 nd->last_type = LAST_BIND;
542 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 750 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
543 error = PTR_ERR(*p); 751 error = PTR_ERR(*p);
@@ -591,6 +799,20 @@ loop:
591 return err; 799 return err;
592} 800}
593 801
802static int follow_up_rcu(struct path *path)
803{
804 struct vfsmount *parent;
805 struct dentry *mountpoint;
806
807 parent = path->mnt->mnt_parent;
808 if (parent == path->mnt)
809 return 0;
810 mountpoint = path->mnt->mnt_mountpoint;
811 path->dentry = mountpoint;
812 path->mnt = parent;
813 return 1;
814}
815
594int follow_up(struct path *path) 816int follow_up(struct path *path)
595{ 817{
596 struct vfsmount *parent; 818 struct vfsmount *parent;
@@ -615,6 +837,21 @@ int follow_up(struct path *path)
615/* 837/*
616 * serialization is taken care of in namespace.c 838 * serialization is taken care of in namespace.c
617 */ 839 */
840static void __follow_mount_rcu(struct nameidata *nd, struct path *path,
841 struct inode **inode)
842{
843 while (d_mountpoint(path->dentry)) {
844 struct vfsmount *mounted;
845 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
846 if (!mounted)
847 return;
848 path->mnt = mounted;
849 path->dentry = mounted->mnt_root;
850 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
851 *inode = path->dentry->d_inode;
852 }
853}
854
618static int __follow_mount(struct path *path) 855static int __follow_mount(struct path *path)
619{ 856{
620 int res = 0; 857 int res = 0;
@@ -660,7 +897,42 @@ int follow_down(struct path *path)
660 return 0; 897 return 0;
661} 898}
662 899
663static __always_inline void follow_dotdot(struct nameidata *nd) 900static int follow_dotdot_rcu(struct nameidata *nd)
901{
902 struct inode *inode = nd->inode;
903
904 set_root_rcu(nd);
905
906 while(1) {
907 if (nd->path.dentry == nd->root.dentry &&
908 nd->path.mnt == nd->root.mnt) {
909 break;
910 }
911 if (nd->path.dentry != nd->path.mnt->mnt_root) {
912 struct dentry *old = nd->path.dentry;
913 struct dentry *parent = old->d_parent;
914 unsigned seq;
915
916 seq = read_seqcount_begin(&parent->d_seq);
917 if (read_seqcount_retry(&old->d_seq, nd->seq))
918 return -ECHILD;
919 inode = parent->d_inode;
920 nd->path.dentry = parent;
921 nd->seq = seq;
922 break;
923 }
924 if (!follow_up_rcu(&nd->path))
925 break;
926 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
927 inode = nd->path.dentry->d_inode;
928 }
929 __follow_mount_rcu(nd, &nd->path, &inode);
930 nd->inode = inode;
931
932 return 0;
933}
934
935static void follow_dotdot(struct nameidata *nd)
664{ 936{
665 set_root(nd); 937 set_root(nd);
666 938
@@ -681,6 +953,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
681 break; 953 break;
682 } 954 }
683 follow_mount(&nd->path); 955 follow_mount(&nd->path);
956 nd->inode = nd->path.dentry->d_inode;
684} 957}
685 958
686/* 959/*
@@ -718,18 +991,17 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent,
718 * It _is_ time-critical. 991 * It _is_ time-critical.
719 */ 992 */
720static int do_lookup(struct nameidata *nd, struct qstr *name, 993static int do_lookup(struct nameidata *nd, struct qstr *name,
721 struct path *path) 994 struct path *path, struct inode **inode)
722{ 995{
723 struct vfsmount *mnt = nd->path.mnt; 996 struct vfsmount *mnt = nd->path.mnt;
724 struct dentry *dentry, *parent; 997 struct dentry *dentry, *parent = nd->path.dentry;
725 struct inode *dir; 998 struct inode *dir;
726 /* 999 /*
727 * See if the low-level filesystem might want 1000 * See if the low-level filesystem might want
728 * to use its own hash.. 1001 * to use its own hash..
729 */ 1002 */
730 if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { 1003 if (parent->d_op && parent->d_op->d_hash) {
731 int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, 1004 int err = parent->d_op->d_hash(parent, nd->inode, name);
732 nd->path.dentry->d_inode, name);
733 if (err < 0) 1005 if (err < 0)
734 return err; 1006 return err;
735 } 1007 }
@@ -739,21 +1011,48 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
739 * of a false negative due to a concurrent rename, we're going to 1011 * of a false negative due to a concurrent rename, we're going to
740 * do the non-racy lookup, below. 1012 * do the non-racy lookup, below.
741 */ 1013 */
742 dentry = __d_lookup(nd->path.dentry, name); 1014 if (nd->flags & LOOKUP_RCU) {
743 if (!dentry) 1015 unsigned seq;
744 goto need_lookup; 1016
1017 *inode = nd->inode;
1018 dentry = __d_lookup_rcu(parent, name, &seq, inode);
1019 if (!dentry) {
1020 if (nameidata_drop_rcu(nd))
1021 return -ECHILD;
1022 goto need_lookup;
1023 }
1024 /* Memory barrier in read_seqcount_begin of child is enough */
1025 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1026 return -ECHILD;
1027
1028 nd->seq = seq;
1029 if (dentry->d_op && dentry->d_op->d_revalidate) {
1030 /* We commonly drop rcu-walk here */
1031 if (nameidata_dentry_drop_rcu(nd, dentry))
1032 return -ECHILD;
1033 goto need_revalidate;
1034 }
1035 path->mnt = mnt;
1036 path->dentry = dentry;
1037 __follow_mount_rcu(nd, path, inode);
1038 } else {
1039 dentry = __d_lookup(parent, name);
1040 if (!dentry)
1041 goto need_lookup;
745found: 1042found:
746 if (dentry->d_op && dentry->d_op->d_revalidate) 1043 if (dentry->d_op && dentry->d_op->d_revalidate)
747 goto need_revalidate; 1044 goto need_revalidate;
748done: 1045done:
749 path->mnt = mnt; 1046 path->mnt = mnt;
750 path->dentry = dentry; 1047 path->dentry = dentry;
751 __follow_mount(path); 1048 __follow_mount(path);
1049 *inode = path->dentry->d_inode;
1050 }
752 return 0; 1051 return 0;
753 1052
754need_lookup: 1053need_lookup:
755 parent = nd->path.dentry;
756 dir = parent->d_inode; 1054 dir = parent->d_inode;
1055 BUG_ON(nd->inode != dir);
757 1056
758 mutex_lock(&dir->i_mutex); 1057 mutex_lock(&dir->i_mutex);
759 /* 1058 /*
@@ -815,7 +1114,6 @@ static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
815static int link_path_walk(const char *name, struct nameidata *nd) 1114static int link_path_walk(const char *name, struct nameidata *nd)
816{ 1115{
817 struct path next; 1116 struct path next;
818 struct inode *inode;
819 int err; 1117 int err;
820 unsigned int lookup_flags = nd->flags; 1118 unsigned int lookup_flags = nd->flags;
821 1119
@@ -824,18 +1122,28 @@ static int link_path_walk(const char *name, struct nameidata *nd)
824 if (!*name) 1122 if (!*name)
825 goto return_reval; 1123 goto return_reval;
826 1124
827 inode = nd->path.dentry->d_inode;
828 if (nd->depth) 1125 if (nd->depth)
829 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); 1126 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
830 1127
831 /* At this point we know we have a real path component. */ 1128 /* At this point we know we have a real path component. */
832 for(;;) { 1129 for(;;) {
1130 struct inode *inode;
833 unsigned long hash; 1131 unsigned long hash;
834 struct qstr this; 1132 struct qstr this;
835 unsigned int c; 1133 unsigned int c;
836 1134
837 nd->flags |= LOOKUP_CONTINUE; 1135 nd->flags |= LOOKUP_CONTINUE;
838 err = exec_permission(inode); 1136 if (nd->flags & LOOKUP_RCU) {
1137 err = exec_permission_rcu(nd->inode);
1138 if (err == -ECHILD) {
1139 if (nameidata_drop_rcu(nd))
1140 return -ECHILD;
1141 goto exec_again;
1142 }
1143 } else {
1144exec_again:
1145 err = exec_permission(nd->inode);
1146 }
839 if (err) 1147 if (err)
840 break; 1148 break;
841 1149
@@ -866,37 +1174,44 @@ static int link_path_walk(const char *name, struct nameidata *nd)
866 if (this.name[0] == '.') switch (this.len) { 1174 if (this.name[0] == '.') switch (this.len) {
867 default: 1175 default:
868 break; 1176 break;
869 case 2: 1177 case 2:
870 if (this.name[1] != '.') 1178 if (this.name[1] != '.')
871 break; 1179 break;
872 follow_dotdot(nd); 1180 if (nd->flags & LOOKUP_RCU) {
873 inode = nd->path.dentry->d_inode; 1181 if (follow_dotdot_rcu(nd))
1182 return -ECHILD;
1183 } else
1184 follow_dotdot(nd);
874 /* fallthrough */ 1185 /* fallthrough */
875 case 1: 1186 case 1:
876 continue; 1187 continue;
877 } 1188 }
878 /* This does the actual lookups.. */ 1189 /* This does the actual lookups.. */
879 err = do_lookup(nd, &this, &next); 1190 err = do_lookup(nd, &this, &next, &inode);
880 if (err) 1191 if (err)
881 break; 1192 break;
882
883 err = -ENOENT; 1193 err = -ENOENT;
884 inode = next.dentry->d_inode;
885 if (!inode) 1194 if (!inode)
886 goto out_dput; 1195 goto out_dput;
887 1196
888 if (inode->i_op->follow_link) { 1197 if (inode->i_op->follow_link) {
1198 /* We commonly drop rcu-walk here */
1199 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1200 return -ECHILD;
1201 BUG_ON(inode != next.dentry->d_inode);
889 err = do_follow_link(&next, nd); 1202 err = do_follow_link(&next, nd);
890 if (err) 1203 if (err)
891 goto return_err; 1204 goto return_err;
1205 nd->inode = nd->path.dentry->d_inode;
892 err = -ENOENT; 1206 err = -ENOENT;
893 inode = nd->path.dentry->d_inode; 1207 if (!nd->inode)
894 if (!inode)
895 break; 1208 break;
896 } else 1209 } else {
897 path_to_nameidata(&next, nd); 1210 path_to_nameidata(&next, nd);
1211 nd->inode = inode;
1212 }
898 err = -ENOTDIR; 1213 err = -ENOTDIR;
899 if (!inode->i_op->lookup) 1214 if (!nd->inode->i_op->lookup)
900 break; 1215 break;
901 continue; 1216 continue;
902 /* here ends the main loop */ 1217 /* here ends the main loop */
@@ -911,32 +1226,39 @@ last_component:
911 if (this.name[0] == '.') switch (this.len) { 1226 if (this.name[0] == '.') switch (this.len) {
912 default: 1227 default:
913 break; 1228 break;
914 case 2: 1229 case 2:
915 if (this.name[1] != '.') 1230 if (this.name[1] != '.')
916 break; 1231 break;
917 follow_dotdot(nd); 1232 if (nd->flags & LOOKUP_RCU) {
918 inode = nd->path.dentry->d_inode; 1233 if (follow_dotdot_rcu(nd))
1234 return -ECHILD;
1235 } else
1236 follow_dotdot(nd);
919 /* fallthrough */ 1237 /* fallthrough */
920 case 1: 1238 case 1:
921 goto return_reval; 1239 goto return_reval;
922 } 1240 }
923 err = do_lookup(nd, &this, &next); 1241 err = do_lookup(nd, &this, &next, &inode);
924 if (err) 1242 if (err)
925 break; 1243 break;
926 inode = next.dentry->d_inode;
927 if (follow_on_final(inode, lookup_flags)) { 1244 if (follow_on_final(inode, lookup_flags)) {
1245 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1246 return -ECHILD;
1247 BUG_ON(inode != next.dentry->d_inode);
928 err = do_follow_link(&next, nd); 1248 err = do_follow_link(&next, nd);
929 if (err) 1249 if (err)
930 goto return_err; 1250 goto return_err;
931 inode = nd->path.dentry->d_inode; 1251 nd->inode = nd->path.dentry->d_inode;
932 } else 1252 } else {
933 path_to_nameidata(&next, nd); 1253 path_to_nameidata(&next, nd);
1254 nd->inode = inode;
1255 }
934 err = -ENOENT; 1256 err = -ENOENT;
935 if (!inode) 1257 if (!nd->inode)
936 break; 1258 break;
937 if (lookup_flags & LOOKUP_DIRECTORY) { 1259 if (lookup_flags & LOOKUP_DIRECTORY) {
938 err = -ENOTDIR; 1260 err = -ENOTDIR;
939 if (!inode->i_op->lookup) 1261 if (!nd->inode->i_op->lookup)
940 break; 1262 break;
941 } 1263 }
942 goto return_base; 1264 goto return_base;
@@ -958,6 +1280,8 @@ return_reval:
958 */ 1280 */
959 if (nd->path.dentry && nd->path.dentry->d_sb && 1281 if (nd->path.dentry && nd->path.dentry->d_sb &&
960 (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { 1282 (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
1283 if (nameidata_drop_rcu_maybe(nd))
1284 return -ECHILD;
961 err = -ESTALE; 1285 err = -ESTALE;
962 /* Note: we do not d_invalidate() */ 1286 /* Note: we do not d_invalidate() */
963 if (!nd->path.dentry->d_op->d_revalidate( 1287 if (!nd->path.dentry->d_op->d_revalidate(
@@ -965,16 +1289,34 @@ return_reval:
965 break; 1289 break;
966 } 1290 }
967return_base: 1291return_base:
1292 if (nameidata_drop_rcu_last_maybe(nd))
1293 return -ECHILD;
968 return 0; 1294 return 0;
969out_dput: 1295out_dput:
970 path_put_conditional(&next, nd); 1296 if (!(nd->flags & LOOKUP_RCU))
1297 path_put_conditional(&next, nd);
971 break; 1298 break;
972 } 1299 }
973 path_put(&nd->path); 1300 if (!(nd->flags & LOOKUP_RCU))
1301 path_put(&nd->path);
974return_err: 1302return_err:
975 return err; 1303 return err;
976} 1304}
977 1305
1306static inline int path_walk_rcu(const char *name, struct nameidata *nd)
1307{
1308 current->total_link_count = 0;
1309
1310 return link_path_walk(name, nd);
1311}
1312
1313static inline int path_walk_simple(const char *name, struct nameidata *nd)
1314{
1315 current->total_link_count = 0;
1316
1317 return link_path_walk(name, nd);
1318}
1319
978static int path_walk(const char *name, struct nameidata *nd) 1320static int path_walk(const char *name, struct nameidata *nd)
979{ 1321{
980 struct path save = nd->path; 1322 struct path save = nd->path;
@@ -1000,6 +1342,88 @@ static int path_walk(const char *name, struct nameidata *nd)
1000 return result; 1342 return result;
1001} 1343}
1002 1344
1345static void path_finish_rcu(struct nameidata *nd)
1346{
1347 if (nd->flags & LOOKUP_RCU) {
1348 /* RCU dangling. Cancel it. */
1349 nd->flags &= ~LOOKUP_RCU;
1350 nd->root.mnt = NULL;
1351 rcu_read_unlock();
1352 br_read_unlock(vfsmount_lock);
1353 }
1354 if (nd->file)
1355 fput(nd->file);
1356}
1357
1358static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1359{
1360 int retval = 0;
1361 int fput_needed;
1362 struct file *file;
1363
1364 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1365 nd->flags = flags | LOOKUP_RCU;
1366 nd->depth = 0;
1367 nd->root.mnt = NULL;
1368 nd->file = NULL;
1369
1370 if (*name=='/') {
1371 struct fs_struct *fs = current->fs;
1372
1373 br_read_lock(vfsmount_lock);
1374 rcu_read_lock();
1375
1376 spin_lock(&fs->lock);
1377 nd->root = fs->root;
1378 nd->path = nd->root;
1379 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1380 spin_unlock(&fs->lock);
1381
1382 } else if (dfd == AT_FDCWD) {
1383 struct fs_struct *fs = current->fs;
1384
1385 br_read_lock(vfsmount_lock);
1386 rcu_read_lock();
1387
1388 spin_lock(&fs->lock);
1389 nd->path = fs->pwd;
1390 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1391 spin_unlock(&fs->lock);
1392 } else {
1393 struct dentry *dentry;
1394
1395 file = fget_light(dfd, &fput_needed);
1396 retval = -EBADF;
1397 if (!file)
1398 goto out_fail;
1399
1400 dentry = file->f_path.dentry;
1401
1402 retval = -ENOTDIR;
1403 if (!S_ISDIR(dentry->d_inode->i_mode))
1404 goto fput_fail;
1405
1406 retval = file_permission(file, MAY_EXEC);
1407 if (retval)
1408 goto fput_fail;
1409
1410 nd->path = file->f_path;
1411 if (fput_needed)
1412 nd->file = file;
1413
1414 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1415 br_read_lock(vfsmount_lock);
1416 rcu_read_lock();
1417 }
1418 nd->inode = nd->path.dentry->d_inode;
1419 return 0;
1420
1421fput_fail:
1422 fput_light(file, fput_needed);
1423out_fail:
1424 return retval;
1425}
1426
1003static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) 1427static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1004{ 1428{
1005 int retval = 0; 1429 int retval = 0;
@@ -1040,6 +1464,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
1040 1464
1041 fput_light(file, fput_needed); 1465 fput_light(file, fput_needed);
1042 } 1466 }
1467 nd->inode = nd->path.dentry->d_inode;
1043 return 0; 1468 return 0;
1044 1469
1045fput_fail: 1470fput_fail:
@@ -1052,16 +1477,53 @@ out_fail:
1052static int do_path_lookup(int dfd, const char *name, 1477static int do_path_lookup(int dfd, const char *name,
1053 unsigned int flags, struct nameidata *nd) 1478 unsigned int flags, struct nameidata *nd)
1054{ 1479{
1055 int retval = path_init(dfd, name, flags, nd); 1480 int retval;
1056 if (!retval) 1481
1057 retval = path_walk(name, nd); 1482 /*
1058 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1483 * Path walking is largely split up into 2 different synchronisation
1059 nd->path.dentry->d_inode)) 1484 * schemes, rcu-walk and ref-walk (explained in
1060 audit_inode(name, nd->path.dentry); 1485 * Documentation/filesystems/path-lookup.txt). These share much of the
1486 * path walk code, but some things particularly setup, cleanup, and
1487 * following mounts are sufficiently divergent that functions are
1488 * duplicated. Typically there is a function foo(), and its RCU
1489 * analogue, foo_rcu().
1490 *
1491 * -ECHILD is the error number of choice (just to avoid clashes) that
1492 * is returned if some aspect of an rcu-walk fails. Such an error must
1493 * be handled by restarting a traditional ref-walk (which will always
1494 * be able to complete).
1495 */
1496 retval = path_init_rcu(dfd, name, flags, nd);
1497 if (unlikely(retval))
1498 return retval;
1499 retval = path_walk_rcu(name, nd);
1500 path_finish_rcu(nd);
1061 if (nd->root.mnt) { 1501 if (nd->root.mnt) {
1062 path_put(&nd->root); 1502 path_put(&nd->root);
1063 nd->root.mnt = NULL; 1503 nd->root.mnt = NULL;
1064 } 1504 }
1505
1506 if (unlikely(retval == -ECHILD || retval == -ESTALE)) {
1507 /* slower, locked walk */
1508 if (retval == -ESTALE)
1509 flags |= LOOKUP_REVAL;
1510 retval = path_init(dfd, name, flags, nd);
1511 if (unlikely(retval))
1512 return retval;
1513 retval = path_walk(name, nd);
1514 if (nd->root.mnt) {
1515 path_put(&nd->root);
1516 nd->root.mnt = NULL;
1517 }
1518 }
1519
1520 if (likely(!retval)) {
1521 if (unlikely(!audit_dummy_context())) {
1522 if (nd->path.dentry && nd->inode)
1523 audit_inode(name, nd->path.dentry);
1524 }
1525 }
1526
1065 return retval; 1527 return retval;
1066} 1528}
1067 1529
@@ -1104,10 +1566,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1104 path_get(&nd->path); 1566 path_get(&nd->path);
1105 nd->root = nd->path; 1567 nd->root = nd->path;
1106 path_get(&nd->root); 1568 path_get(&nd->root);
1569 nd->inode = nd->path.dentry->d_inode;
1107 1570
1108 retval = path_walk(name, nd); 1571 retval = path_walk(name, nd);
1109 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1572 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1110 nd->path.dentry->d_inode)) 1573 nd->inode))
1111 audit_inode(name, nd->path.dentry); 1574 audit_inode(name, nd->path.dentry);
1112 1575
1113 path_put(&nd->root); 1576 path_put(&nd->root);
@@ -1488,6 +1951,7 @@ out_unlock:
1488 mutex_unlock(&dir->d_inode->i_mutex); 1951 mutex_unlock(&dir->d_inode->i_mutex);
1489 dput(nd->path.dentry); 1952 dput(nd->path.dentry);
1490 nd->path.dentry = path->dentry; 1953 nd->path.dentry = path->dentry;
1954
1491 if (error) 1955 if (error)
1492 return error; 1956 return error;
1493 /* Don't check for write permission, don't truncate */ 1957 /* Don't check for write permission, don't truncate */
@@ -1582,6 +2046,9 @@ exit:
1582 return ERR_PTR(error); 2046 return ERR_PTR(error);
1583} 2047}
1584 2048
2049/*
2050 * Handle O_CREAT case for do_filp_open
2051 */
1585static struct file *do_last(struct nameidata *nd, struct path *path, 2052static struct file *do_last(struct nameidata *nd, struct path *path,
1586 int open_flag, int acc_mode, 2053 int open_flag, int acc_mode,
1587 int mode, const char *pathname) 2054 int mode, const char *pathname)
@@ -1603,42 +2070,16 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1603 } 2070 }
1604 /* fallthrough */ 2071 /* fallthrough */
1605 case LAST_ROOT: 2072 case LAST_ROOT:
1606 if (open_flag & O_CREAT) 2073 goto exit;
1607 goto exit;
1608 /* fallthrough */
1609 case LAST_BIND: 2074 case LAST_BIND:
1610 audit_inode(pathname, dir); 2075 audit_inode(pathname, dir);
1611 goto ok; 2076 goto ok;
1612 } 2077 }
1613 2078
1614 /* trailing slashes? */ 2079 /* trailing slashes? */
1615 if (nd->last.name[nd->last.len]) { 2080 if (nd->last.name[nd->last.len])
1616 if (open_flag & O_CREAT) 2081 goto exit;
1617 goto exit;
1618 nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
1619 }
1620
1621 /* just plain open? */
1622 if (!(open_flag & O_CREAT)) {
1623 error = do_lookup(nd, &nd->last, path);
1624 if (error)
1625 goto exit;
1626 error = -ENOENT;
1627 if (!path->dentry->d_inode)
1628 goto exit_dput;
1629 if (path->dentry->d_inode->i_op->follow_link)
1630 return NULL;
1631 error = -ENOTDIR;
1632 if (nd->flags & LOOKUP_DIRECTORY) {
1633 if (!path->dentry->d_inode->i_op->lookup)
1634 goto exit_dput;
1635 }
1636 path_to_nameidata(path, nd);
1637 audit_inode(pathname, nd->path.dentry);
1638 goto ok;
1639 }
1640 2082
1641 /* OK, it's O_CREAT */
1642 mutex_lock(&dir->d_inode->i_mutex); 2083 mutex_lock(&dir->d_inode->i_mutex);
1643 2084
1644 path->dentry = lookup_hash(nd); 2085 path->dentry = lookup_hash(nd);
@@ -1709,8 +2150,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1709 return NULL; 2150 return NULL;
1710 2151
1711 path_to_nameidata(path, nd); 2152 path_to_nameidata(path, nd);
2153 nd->inode = path->dentry->d_inode;
1712 error = -EISDIR; 2154 error = -EISDIR;
1713 if (S_ISDIR(path->dentry->d_inode->i_mode)) 2155 if (S_ISDIR(nd->inode->i_mode))
1714 goto exit; 2156 goto exit;
1715ok: 2157ok:
1716 filp = finish_open(nd, open_flag, acc_mode); 2158 filp = finish_open(nd, open_flag, acc_mode);
@@ -1741,7 +2183,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
1741 struct path path; 2183 struct path path;
1742 int count = 0; 2184 int count = 0;
1743 int flag = open_to_namei_flags(open_flag); 2185 int flag = open_to_namei_flags(open_flag);
1744 int force_reval = 0; 2186 int flags;
1745 2187
1746 if (!(open_flag & O_CREAT)) 2188 if (!(open_flag & O_CREAT))
1747 mode = 0; 2189 mode = 0;
@@ -1770,54 +2212,84 @@ struct file *do_filp_open(int dfd, const char *pathname,
1770 if (open_flag & O_APPEND) 2212 if (open_flag & O_APPEND)
1771 acc_mode |= MAY_APPEND; 2213 acc_mode |= MAY_APPEND;
1772 2214
1773 /* find the parent */ 2215 flags = LOOKUP_OPEN;
1774reval: 2216 if (open_flag & O_CREAT) {
1775 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); 2217 flags |= LOOKUP_CREATE;
2218 if (open_flag & O_EXCL)
2219 flags |= LOOKUP_EXCL;
2220 }
2221 if (open_flag & O_DIRECTORY)
2222 flags |= LOOKUP_DIRECTORY;
2223 if (!(open_flag & O_NOFOLLOW))
2224 flags |= LOOKUP_FOLLOW;
2225
2226 filp = get_empty_filp();
2227 if (!filp)
2228 return ERR_PTR(-ENFILE);
2229
2230 filp->f_flags = open_flag;
2231 nd.intent.open.file = filp;
2232 nd.intent.open.flags = flag;
2233 nd.intent.open.create_mode = mode;
2234
2235 if (open_flag & O_CREAT)
2236 goto creat;
2237
2238 /* !O_CREAT, simple open */
2239 error = do_path_lookup(dfd, pathname, flags, &nd);
2240 if (unlikely(error))
2241 goto out_filp;
2242 error = -ELOOP;
2243 if (!(nd.flags & LOOKUP_FOLLOW)) {
2244 if (nd.inode->i_op->follow_link)
2245 goto out_path;
2246 }
2247 error = -ENOTDIR;
2248 if (nd.flags & LOOKUP_DIRECTORY) {
2249 if (!nd.inode->i_op->lookup)
2250 goto out_path;
2251 }
2252 audit_inode(pathname, nd.path.dentry);
2253 filp = finish_open(&nd, open_flag, acc_mode);
2254 return filp;
2255
2256creat:
2257 /* OK, have to create the file. Find the parent. */
2258 error = path_init_rcu(dfd, pathname,
2259 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
1776 if (error) 2260 if (error)
1777 return ERR_PTR(error); 2261 goto out_filp;
1778 if (force_reval) 2262 error = path_walk_rcu(pathname, &nd);
1779 nd.flags |= LOOKUP_REVAL; 2263 path_finish_rcu(&nd);
2264 if (unlikely(error == -ECHILD || error == -ESTALE)) {
2265 /* slower, locked walk */
2266 if (error == -ESTALE) {
2267reval:
2268 flags |= LOOKUP_REVAL;
2269 }
2270 error = path_init(dfd, pathname,
2271 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2272 if (error)
2273 goto out_filp;
1780 2274
1781 current->total_link_count = 0; 2275 error = path_walk_simple(pathname, &nd);
1782 error = link_path_walk(pathname, &nd);
1783 if (error) {
1784 filp = ERR_PTR(error);
1785 goto out;
1786 } 2276 }
1787 if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT)) 2277 if (unlikely(error))
2278 goto out_filp;
2279 if (unlikely(!audit_dummy_context()))
1788 audit_inode(pathname, nd.path.dentry); 2280 audit_inode(pathname, nd.path.dentry);
1789 2281
1790 /* 2282 /*
1791 * We have the parent and last component. 2283 * We have the parent and last component.
1792 */ 2284 */
1793 2285 nd.flags = flags;
1794 error = -ENFILE;
1795 filp = get_empty_filp();
1796 if (filp == NULL)
1797 goto exit_parent;
1798 nd.intent.open.file = filp;
1799 filp->f_flags = open_flag;
1800 nd.intent.open.flags = flag;
1801 nd.intent.open.create_mode = mode;
1802 nd.flags &= ~LOOKUP_PARENT;
1803 nd.flags |= LOOKUP_OPEN;
1804 if (open_flag & O_CREAT) {
1805 nd.flags |= LOOKUP_CREATE;
1806 if (open_flag & O_EXCL)
1807 nd.flags |= LOOKUP_EXCL;
1808 }
1809 if (open_flag & O_DIRECTORY)
1810 nd.flags |= LOOKUP_DIRECTORY;
1811 if (!(open_flag & O_NOFOLLOW))
1812 nd.flags |= LOOKUP_FOLLOW;
1813 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2286 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1814 while (unlikely(!filp)) { /* trailing symlink */ 2287 while (unlikely(!filp)) { /* trailing symlink */
1815 struct path holder; 2288 struct path holder;
1816 struct inode *inode = path.dentry->d_inode;
1817 void *cookie; 2289 void *cookie;
1818 error = -ELOOP; 2290 error = -ELOOP;
1819 /* S_ISDIR part is a temporary automount kludge */ 2291 /* S_ISDIR part is a temporary automount kludge */
1820 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode)) 2292 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(nd.inode->i_mode))
1821 goto exit_dput; 2293 goto exit_dput;
1822 if (count++ == 32) 2294 if (count++ == 32)
1823 goto exit_dput; 2295 goto exit_dput;
@@ -1838,36 +2310,33 @@ reval:
1838 goto exit_dput; 2310 goto exit_dput;
1839 error = __do_follow_link(&path, &nd, &cookie); 2311 error = __do_follow_link(&path, &nd, &cookie);
1840 if (unlikely(error)) { 2312 if (unlikely(error)) {
2313 if (!IS_ERR(cookie) && nd.inode->i_op->put_link)
2314 nd.inode->i_op->put_link(path.dentry, &nd, cookie);
1841 /* nd.path had been dropped */ 2315 /* nd.path had been dropped */
1842 if (!IS_ERR(cookie) && inode->i_op->put_link) 2316 nd.path = path;
1843 inode->i_op->put_link(path.dentry, &nd, cookie); 2317 goto out_path;
1844 path_put(&path);
1845 release_open_intent(&nd);
1846 filp = ERR_PTR(error);
1847 goto out;
1848 } 2318 }
1849 holder = path; 2319 holder = path;
1850 nd.flags &= ~LOOKUP_PARENT; 2320 nd.flags &= ~LOOKUP_PARENT;
1851 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2321 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1852 if (inode->i_op->put_link) 2322 if (nd.inode->i_op->put_link)
1853 inode->i_op->put_link(holder.dentry, &nd, cookie); 2323 nd.inode->i_op->put_link(holder.dentry, &nd, cookie);
1854 path_put(&holder); 2324 path_put(&holder);
1855 } 2325 }
1856out: 2326out:
1857 if (nd.root.mnt) 2327 if (nd.root.mnt)
1858 path_put(&nd.root); 2328 path_put(&nd.root);
1859 if (filp == ERR_PTR(-ESTALE) && !force_reval) { 2329 if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
1860 force_reval = 1;
1861 goto reval; 2330 goto reval;
1862 }
1863 return filp; 2331 return filp;
1864 2332
1865exit_dput: 2333exit_dput:
1866 path_put_conditional(&path, &nd); 2334 path_put_conditional(&path, &nd);
2335out_path:
2336 path_put(&nd.path);
2337out_filp:
1867 if (!IS_ERR(nd.intent.open.file)) 2338 if (!IS_ERR(nd.intent.open.file))
1868 release_open_intent(&nd); 2339 release_open_intent(&nd);
1869exit_parent:
1870 path_put(&nd.path);
1871 filp = ERR_PTR(error); 2340 filp = ERR_PTR(error);
1872 goto out; 2341 goto out;
1873} 2342}
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index ae4b0fd9033f..998e3a715bcc 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -402,6 +402,10 @@ static int proc_sys_compare(const struct dentry *parent,
402 const struct dentry *dentry, const struct inode *inode, 402 const struct dentry *dentry, const struct inode *inode,
403 unsigned int len, const char *str, const struct qstr *name) 403 unsigned int len, const char *str, const struct qstr *name)
404{ 404{
405 /* Although proc doesn't have negative dentries, rcu-walk means
406 * that inode here can be NULL */
407 if (!inode)
408 return 0;
405 if (name->len != len) 409 if (name->len != len)
406 return 1; 410 return 1;
407 if (memcmp(name->name, str, len)) 411 if (memcmp(name->name, str, len))