about summary refs log tree commit diff stats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r-- fs/dcache.c 203
-rw-r--r-- fs/filesystems.c 3
-rw-r--r-- fs/namei.c 743
-rw-r--r-- fs/proc/proc_sysctl.c 4
4 files changed, 794 insertions, 159 deletions
diff --git a/fs/dcache.c b/fs/dcache.c
index dc0551c9755d..187fea040108 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -152,9 +152,23 @@ static void d_free(struct dentry *dentry)
152 call_rcu(&dentry->d_u.d_rcu, __d_free); 152 call_rcu(&dentry->d_u.d_rcu, __d_free);
153} 153}
154 154
155/**
156 * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
157 * After this call, in-progress rcu-walk path lookup will fail. This
158 * should be called after unhashing, and after changing d_inode (if
159 * the dentry has not already been unhashed).
160 */
161static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
162{
163 assert_spin_locked(&dentry->d_lock);
164 /* Go through a barrier */
165 write_seqcount_barrier(&dentry->d_seq);
166}
167
155/* 168/*
156 * Release the dentry's inode, using the filesystem 169 * Release the dentry's inode, using the filesystem
157 * d_iput() operation if defined. 170 * d_iput() operation if defined. Dentry has no refcount
171 * and is unhashed.
158 */ 172 */
159static void dentry_iput(struct dentry * dentry) 173static void dentry_iput(struct dentry * dentry)
160 __releases(dentry->d_lock) 174 __releases(dentry->d_lock)
@@ -179,6 +193,28 @@ static void dentry_iput(struct dentry * dentry)
179} 193}
180 194
181/* 195/*
196 * Release the dentry's inode, using the filesystem
197 * d_iput() operation if defined. dentry remains in-use.
198 */
199static void dentry_unlink_inode(struct dentry * dentry)
200 __releases(dentry->d_lock)
201 __releases(dcache_inode_lock)
202{
203 struct inode *inode = dentry->d_inode;
204 dentry->d_inode = NULL;
205 list_del_init(&dentry->d_alias);
206 dentry_rcuwalk_barrier(dentry);
207 spin_unlock(&dentry->d_lock);
208 spin_unlock(&dcache_inode_lock);
209 if (!inode->i_nlink)
210 fsnotify_inoderemove(inode);
211 if (dentry->d_op && dentry->d_op->d_iput)
212 dentry->d_op->d_iput(dentry, inode);
213 else
214 iput(inode);
215}
216
217/*
182 * dentry_lru_(add|del|move_tail) must be called with d_lock held. 218 * dentry_lru_(add|del|move_tail) must be called with d_lock held.
183 */ 219 */
184static void dentry_lru_add(struct dentry *dentry) 220static void dentry_lru_add(struct dentry *dentry)
@@ -272,6 +308,7 @@ void __d_drop(struct dentry *dentry)
272 spin_lock(&dcache_hash_lock); 308 spin_lock(&dcache_hash_lock);
273 hlist_del_rcu(&dentry->d_hash); 309 hlist_del_rcu(&dentry->d_hash);
274 spin_unlock(&dcache_hash_lock); 310 spin_unlock(&dcache_hash_lock);
311 dentry_rcuwalk_barrier(dentry);
275 } 312 }
276} 313}
277EXPORT_SYMBOL(__d_drop); 314EXPORT_SYMBOL(__d_drop);
@@ -309,6 +346,7 @@ relock:
309 spin_unlock(&dcache_inode_lock); 346 spin_unlock(&dcache_inode_lock);
310 goto relock; 347 goto relock;
311 } 348 }
349
312 if (ref) 350 if (ref)
313 dentry->d_count--; 351 dentry->d_count--;
314 /* if dentry was on the d_lru list delete it from there */ 352 /* if dentry was on the d_lru list delete it from there */
@@ -1221,6 +1259,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
1221 dentry->d_count = 1; 1259 dentry->d_count = 1;
1222 dentry->d_flags = DCACHE_UNHASHED; 1260 dentry->d_flags = DCACHE_UNHASHED;
1223 spin_lock_init(&dentry->d_lock); 1261 spin_lock_init(&dentry->d_lock);
1262 seqcount_init(&dentry->d_seq);
1224 dentry->d_inode = NULL; 1263 dentry->d_inode = NULL;
1225 dentry->d_parent = NULL; 1264 dentry->d_parent = NULL;
1226 dentry->d_sb = NULL; 1265 dentry->d_sb = NULL;
@@ -1269,6 +1308,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
1269 if (inode) 1308 if (inode)
1270 list_add(&dentry->d_alias, &inode->i_dentry); 1309 list_add(&dentry->d_alias, &inode->i_dentry);
1271 dentry->d_inode = inode; 1310 dentry->d_inode = inode;
1311 dentry_rcuwalk_barrier(dentry);
1272 spin_unlock(&dentry->d_lock); 1312 spin_unlock(&dentry->d_lock);
1273 fsnotify_d_instantiate(dentry, inode); 1313 fsnotify_d_instantiate(dentry, inode);
1274} 1314}
@@ -1611,6 +1651,111 @@ err_out:
1611EXPORT_SYMBOL(d_add_ci); 1651EXPORT_SYMBOL(d_add_ci);
1612 1652
1613/** 1653/**
1654 * __d_lookup_rcu - search for a dentry (racy, store-free)
1655 * @parent: parent dentry
1656 * @name: qstr of name we wish to find
1657 * @seq: returns d_seq value at the point where the dentry was found
1658 * @inode: returns dentry->d_inode when the inode was found valid.
1659 * Returns: dentry, or NULL
1660 *
1661 * __d_lookup_rcu is the dcache lookup function for rcu-walk name
1662 * resolution (store-free path walking) design described in
1663 * Documentation/filesystems/path-lookup.txt.
1664 *
1665 * This is not to be used outside core vfs.
1666 *
1667 * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
1668 * held, and rcu_read_lock held. The returned dentry must not be stored into
1669 * without taking d_lock and checking d_seq sequence count against @seq
1670 * returned here.
1671 *
1672 * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
1673 * function.
1674 *
1675 * Alternatively, __d_lookup_rcu may be called again to look up the child of
1676 * the returned dentry, so long as its parent's seqcount is checked after the
1677 * child is looked up. Thus, an interlocking stepping of sequence lock checks
1678 * is formed, giving integrity down the path walk.
1679 */
1680struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
1681 unsigned *seq, struct inode **inode)
1682{
1683 unsigned int len = name->len;
1684 unsigned int hash = name->hash;
1685 const unsigned char *str = name->name;
1686 struct hlist_head *head = d_hash(parent, hash);
1687 struct hlist_node *node;
1688 struct dentry *dentry;
1689
1690 /*
1691 * Note: There is significant duplication with __d_lookup which is
1692 * required to prevent single threaded performance regressions
1693 * especially on architectures where smp_rmb (in seqcounts) are costly.
1694 * Keep the two functions in sync.
1695 */
1696
1697 /*
1698 * The hash list is protected using RCU.
1699 *
1700 * Carefully use d_seq when comparing a candidate dentry, to avoid
1701 * races with d_move().
1702 *
1703 * It is possible that concurrent renames can mess up our list
1704 * walk here and result in missing our dentry, resulting in the
1705 * false-negative result. d_lookup() protects against concurrent
1706 * renames using rename_lock seqlock.
1707 *
1708 * See Documentation/vfs/dcache-locking.txt for more details.
1709 */
1710 hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
1711 struct inode *i;
1712 const char *tname;
1713 int tlen;
1714
1715 if (dentry->d_name.hash != hash)
1716 continue;
1717
1718seqretry:
1719 *seq = read_seqcount_begin(&dentry->d_seq);
1720 if (dentry->d_parent != parent)
1721 continue;
1722 if (d_unhashed(dentry))
1723 continue;
1724 tlen = dentry->d_name.len;
1725 tname = dentry->d_name.name;
1726 i = dentry->d_inode;
1727 /*
1728 * This seqcount check is required to ensure name and
1729 * len are loaded atomically, so as not to walk off the
1730 * edge of memory when walking. If we could load this
1731 * atomically some other way, we could drop this check.
1732 */
1733 if (read_seqcount_retry(&dentry->d_seq, *seq))
1734 goto seqretry;
1735 if (parent->d_op && parent->d_op->d_compare) {
1736 if (parent->d_op->d_compare(parent, *inode,
1737 dentry, i,
1738 tlen, tname, name))
1739 continue;
1740 } else {
1741 if (tlen != len)
1742 continue;
1743 if (memcmp(tname, str, tlen))
1744 continue;
1745 }
1746 /*
1747 * No extra seqcount check is required after the name
1748 * compare. The caller must perform a seqcount check in
1749 * order to do anything useful with the returned dentry
1750 * anyway.
1751 */
1752 *inode = i;
1753 return dentry;
1754 }
1755 return NULL;
1756}
1757
1758/**
1614 * d_lookup - search for a dentry 1759 * d_lookup - search for a dentry
1615 * @parent: parent dentry 1760 * @parent: parent dentry
1616 * @name: qstr of name we wish to find 1761 * @name: qstr of name we wish to find
@@ -1621,9 +1766,9 @@ EXPORT_SYMBOL(d_add_ci);
1621 * dentry is returned. The caller must use dput to free the entry when it has 1766 * dentry is returned. The caller must use dput to free the entry when it has
1622 * finished using it. %NULL is returned if the dentry does not exist. 1767 * finished using it. %NULL is returned if the dentry does not exist.
1623 */ 1768 */
1624struct dentry * d_lookup(struct dentry * parent, struct qstr * name) 1769struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
1625{ 1770{
1626 struct dentry * dentry = NULL; 1771 struct dentry *dentry;
1627 unsigned seq; 1772 unsigned seq;
1628 1773
1629 do { 1774 do {
@@ -1636,7 +1781,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1636} 1781}
1637EXPORT_SYMBOL(d_lookup); 1782EXPORT_SYMBOL(d_lookup);
1638 1783
1639/* 1784/**
1640 * __d_lookup - search for a dentry (racy) 1785 * __d_lookup - search for a dentry (racy)
1641 * @parent: parent dentry 1786 * @parent: parent dentry
1642 * @name: qstr of name we wish to find 1787 * @name: qstr of name we wish to find
@@ -1651,17 +1796,24 @@ EXPORT_SYMBOL(d_lookup);
1651 * 1796 *
1652 * __d_lookup callers must be commented. 1797 * __d_lookup callers must be commented.
1653 */ 1798 */
1654struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1799struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
1655{ 1800{
1656 unsigned int len = name->len; 1801 unsigned int len = name->len;
1657 unsigned int hash = name->hash; 1802 unsigned int hash = name->hash;
1658 const unsigned char *str = name->name; 1803 const unsigned char *str = name->name;
1659 struct hlist_head *head = d_hash(parent,hash); 1804 struct hlist_head *head = d_hash(parent,hash);
1660 struct dentry *found = NULL;
1661 struct hlist_node *node; 1805 struct hlist_node *node;
1806 struct dentry *found = NULL;
1662 struct dentry *dentry; 1807 struct dentry *dentry;
1663 1808
1664 /* 1809 /*
1810 * Note: There is significant duplication with __d_lookup_rcu which is
1811 * required to prevent single threaded performance regressions
1812 * especially on architectures where smp_rmb (in seqcounts) are costly.
1813 * Keep the two functions in sync.
1814 */
1815
1816 /*
1665 * The hash list is protected using RCU. 1817 * The hash list is protected using RCU.
1666 * 1818 *
1667 * Take d_lock when comparing a candidate dentry, to avoid races 1819 * Take d_lock when comparing a candidate dentry, to avoid races
@@ -1677,24 +1829,15 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1677 rcu_read_lock(); 1829 rcu_read_lock();
1678 1830
1679 hlist_for_each_entry_rcu(dentry, node, head, d_hash) { 1831 hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
1680 struct qstr *qstr; 1832 const char *tname;
1833 int tlen;
1681 1834
1682 if (dentry->d_name.hash != hash) 1835 if (dentry->d_name.hash != hash)
1683 continue; 1836 continue;
1684 if (dentry->d_parent != parent)
1685 continue;
1686 1837
1687 spin_lock(&dentry->d_lock); 1838 spin_lock(&dentry->d_lock);
1688
1689 /*
1690 * Recheck the dentry after taking the lock - d_move may have
1691 * changed things. Don't bother checking the hash because
1692 * we're about to compare the whole name anyway.
1693 */
1694 if (dentry->d_parent != parent) 1839 if (dentry->d_parent != parent)
1695 goto next; 1840 goto next;
1696
1697 /* non-existing due to RCU? */
1698 if (d_unhashed(dentry)) 1841 if (d_unhashed(dentry))
1699 goto next; 1842 goto next;
1700 1843
@@ -1702,16 +1845,17 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1702 * It is safe to compare names since d_move() cannot 1845 * It is safe to compare names since d_move() cannot
1703 * change the qstr (protected by d_lock). 1846 * change the qstr (protected by d_lock).
1704 */ 1847 */
1705 qstr = &dentry->d_name; 1848 tlen = dentry->d_name.len;
1849 tname = dentry->d_name.name;
1706 if (parent->d_op && parent->d_op->d_compare) { 1850 if (parent->d_op && parent->d_op->d_compare) {
1707 if (parent->d_op->d_compare(parent, parent->d_inode, 1851 if (parent->d_op->d_compare(parent, parent->d_inode,
1708 dentry, dentry->d_inode, 1852 dentry, dentry->d_inode,
1709 qstr->len, qstr->name, name)) 1853 tlen, tname, name))
1710 goto next; 1854 goto next;
1711 } else { 1855 } else {
1712 if (qstr->len != len) 1856 if (tlen != len)
1713 goto next; 1857 goto next;
1714 if (memcmp(qstr->name, str, len)) 1858 if (memcmp(tname, str, tlen))
1715 goto next; 1859 goto next;
1716 } 1860 }
1717 1861
@@ -1821,7 +1965,7 @@ again:
1821 goto again; 1965 goto again;
1822 } 1966 }
1823 dentry->d_flags &= ~DCACHE_CANT_MOUNT; 1967 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
1824 dentry_iput(dentry); 1968 dentry_unlink_inode(dentry);
1825 fsnotify_nameremove(dentry, isdir); 1969 fsnotify_nameremove(dentry, isdir);
1826 return; 1970 return;
1827 } 1971 }
@@ -1884,7 +2028,9 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
1884 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ 2028 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
1885 2029
1886 spin_lock(&dentry->d_lock); 2030 spin_lock(&dentry->d_lock);
2031 write_seqcount_begin(&dentry->d_seq);
1887 memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); 2032 memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
2033 write_seqcount_end(&dentry->d_seq);
1888 spin_unlock(&dentry->d_lock); 2034 spin_unlock(&dentry->d_lock);
1889} 2035}
1890EXPORT_SYMBOL(dentry_update_name_case); 2036EXPORT_SYMBOL(dentry_update_name_case);
@@ -1997,6 +2143,9 @@ void d_move(struct dentry * dentry, struct dentry * target)
1997 2143
1998 dentry_lock_for_move(dentry, target); 2144 dentry_lock_for_move(dentry, target);
1999 2145
2146 write_seqcount_begin(&dentry->d_seq);
2147 write_seqcount_begin(&target->d_seq);
2148
2000 /* Move the dentry to the target hash queue, if on different bucket */ 2149 /* Move the dentry to the target hash queue, if on different bucket */
2001 spin_lock(&dcache_hash_lock); 2150 spin_lock(&dcache_hash_lock);
2002 if (!d_unhashed(dentry)) 2151 if (!d_unhashed(dentry))
@@ -2005,6 +2154,7 @@ void d_move(struct dentry * dentry, struct dentry * target)
2005 spin_unlock(&dcache_hash_lock); 2154 spin_unlock(&dcache_hash_lock);
2006 2155
2007 /* Unhash the target: dput() will then get rid of it */ 2156 /* Unhash the target: dput() will then get rid of it */
2157 /* __d_drop does write_seqcount_barrier, but they're OK to nest. */
2008 __d_drop(target); 2158 __d_drop(target);
2009 2159
2010 list_del(&dentry->d_u.d_child); 2160 list_del(&dentry->d_u.d_child);
@@ -2028,6 +2178,9 @@ void d_move(struct dentry * dentry, struct dentry * target)
2028 2178
2029 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); 2179 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2030 2180
2181 write_seqcount_end(&target->d_seq);
2182 write_seqcount_end(&dentry->d_seq);
2183
2031 dentry_unlock_parents_for_move(dentry, target); 2184 dentry_unlock_parents_for_move(dentry, target);
2032 spin_unlock(&target->d_lock); 2185 spin_unlock(&target->d_lock);
2033 fsnotify_d_move(dentry); 2186 fsnotify_d_move(dentry);
@@ -2110,6 +2263,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
2110 2263
2111 dentry_lock_for_move(anon, dentry); 2264 dentry_lock_for_move(anon, dentry);
2112 2265
2266 write_seqcount_begin(&dentry->d_seq);
2267 write_seqcount_begin(&anon->d_seq);
2268
2113 dparent = dentry->d_parent; 2269 dparent = dentry->d_parent;
2114 aparent = anon->d_parent; 2270 aparent = anon->d_parent;
2115 2271
@@ -2130,6 +2286,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
2130 else 2286 else
2131 INIT_LIST_HEAD(&anon->d_u.d_child); 2287 INIT_LIST_HEAD(&anon->d_u.d_child);
2132 2288
2289 write_seqcount_end(&dentry->d_seq);
2290 write_seqcount_end(&anon->d_seq);
2291
2133 dentry_unlock_parents_for_move(anon, dentry); 2292 dentry_unlock_parents_for_move(anon, dentry);
2134 spin_unlock(&dentry->d_lock); 2293 spin_unlock(&dentry->d_lock);
2135 2294
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 68ba492d8eef..751d6b255a12 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -115,6 +115,9 @@ int unregister_filesystem(struct file_system_type * fs)
115 tmp = &(*tmp)->next; 115 tmp = &(*tmp)->next;
116 } 116 }
117 write_unlock(&file_systems_lock); 117 write_unlock(&file_systems_lock);
118
119 synchronize_rcu();
120
118 return -EINVAL; 121 return -EINVAL;
119} 122}
120 123
diff --git a/fs/namei.c b/fs/namei.c
index 5642bc2be418..8d3f15b3a541 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname);
169/* 169/*
170 * This does basic POSIX ACL permission checking 170 * This does basic POSIX ACL permission checking
171 */ 171 */
172static int acl_permission_check(struct inode *inode, int mask, 172static inline int __acl_permission_check(struct inode *inode, int mask,
173 int (*check_acl)(struct inode *inode, int mask)) 173 int (*check_acl)(struct inode *inode, int mask), int rcu)
174{ 174{
175 umode_t mode = inode->i_mode; 175 umode_t mode = inode->i_mode;
176 176
@@ -180,9 +180,13 @@ static int acl_permission_check(struct inode *inode, int mask,
180 mode >>= 6; 180 mode >>= 6;
181 else { 181 else {
182 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { 182 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
183 int error = check_acl(inode, mask); 183 if (rcu) {
184 if (error != -EAGAIN) 184 return -ECHILD;
185 return error; 185 } else {
186 int error = check_acl(inode, mask);
187 if (error != -EAGAIN)
188 return error;
189 }
186 } 190 }
187 191
188 if (in_group_p(inode->i_gid)) 192 if (in_group_p(inode->i_gid))
@@ -197,6 +201,12 @@ static int acl_permission_check(struct inode *inode, int mask,
197 return -EACCES; 201 return -EACCES;
198} 202}
199 203
204static inline int acl_permission_check(struct inode *inode, int mask,
205 int (*check_acl)(struct inode *inode, int mask))
206{
207 return __acl_permission_check(inode, mask, check_acl, 0);
208}
209
200/** 210/**
201 * generic_permission - check for access rights on a Posix-like filesystem 211 * generic_permission - check for access rights on a Posix-like filesystem
202 * @inode: inode to check access rights for 212 * @inode: inode to check access rights for
@@ -375,6 +385,173 @@ void path_put(struct path *path)
375EXPORT_SYMBOL(path_put); 385EXPORT_SYMBOL(path_put);
376 386
377/** 387/**
388 * nameidata_drop_rcu - drop this nameidata out of rcu-walk
389 * @nd: nameidata pathwalk data to drop
390 * @Returns: 0 on success, -ECHILD on failure
391 *
392 * Path walking has 2 modes, rcu-walk and ref-walk (see
393 * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
394 * to drop out of rcu-walk mode and take normal reference counts on dentries
395 * and vfsmounts to transition to ref-walk mode. __drop_rcu* functions take
396 * refcounts at the last known good point before rcu-walk got stuck, so
397 * ref-walk may continue from there. If this is not successful (eg. a seqcount
398 * has changed), then failure is returned and path walk restarts from the
399 * beginning in ref-walk mode.
400 *
401 * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
402 * ref-walk. Must be called from rcu-walk context.
403 */
404static int nameidata_drop_rcu(struct nameidata *nd)
405{
406 struct fs_struct *fs = current->fs;
407 struct dentry *dentry = nd->path.dentry;
408
409 BUG_ON(!(nd->flags & LOOKUP_RCU));
410 if (nd->root.mnt) {
411 spin_lock(&fs->lock);
412 if (nd->root.mnt != fs->root.mnt ||
413 nd->root.dentry != fs->root.dentry)
414 goto err_root;
415 }
416 spin_lock(&dentry->d_lock);
417 if (!__d_rcu_to_refcount(dentry, nd->seq))
418 goto err;
419 BUG_ON(nd->inode != dentry->d_inode);
420 spin_unlock(&dentry->d_lock);
421 if (nd->root.mnt) {
422 path_get(&nd->root);
423 spin_unlock(&fs->lock);
424 }
425 mntget(nd->path.mnt);
426
427 rcu_read_unlock();
428 br_read_unlock(vfsmount_lock);
429 nd->flags &= ~LOOKUP_RCU;
430 return 0;
431err:
432 spin_unlock(&dentry->d_lock);
433err_root:
434 if (nd->root.mnt)
435 spin_unlock(&fs->lock);
436 return -ECHILD;
437}
438
439/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
440static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
441{
442 if (nd->flags & LOOKUP_RCU)
443 return nameidata_drop_rcu(nd);
444 return 0;
445}
446
447/**
448 * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
449 * @nd: nameidata pathwalk data to drop
450 * @dentry: dentry to drop
451 * @Returns: 0 on success, -ECHILD on failure
452 *
453 * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
454 * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
455 * @nd. Must be called from rcu-walk context.
456 */
457static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
458{
459 struct fs_struct *fs = current->fs;
460 struct dentry *parent = nd->path.dentry;
461
462 BUG_ON(!(nd->flags & LOOKUP_RCU));
463 if (nd->root.mnt) {
464 spin_lock(&fs->lock);
465 if (nd->root.mnt != fs->root.mnt ||
466 nd->root.dentry != fs->root.dentry)
467 goto err_root;
468 }
469 spin_lock(&parent->d_lock);
470 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
471 if (!__d_rcu_to_refcount(dentry, nd->seq))
472 goto err;
473 /*
474 * If the sequence check on the child dentry passed, then the child has
475 * not been removed from its parent. This means the parent dentry must
476 * be valid and able to take a reference at this point.
477 */
478 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
479 BUG_ON(!parent->d_count);
480 parent->d_count++;
481 spin_unlock(&dentry->d_lock);
482 spin_unlock(&parent->d_lock);
483 if (nd->root.mnt) {
484 path_get(&nd->root);
485 spin_unlock(&fs->lock);
486 }
487 mntget(nd->path.mnt);
488
489 rcu_read_unlock();
490 br_read_unlock(vfsmount_lock);
491 nd->flags &= ~LOOKUP_RCU;
492 return 0;
493err:
494 spin_unlock(&dentry->d_lock);
495 spin_unlock(&parent->d_lock);
496err_root:
497 if (nd->root.mnt)
498 spin_unlock(&fs->lock);
499 return -ECHILD;
500}
501
502/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
503static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
504{
505 if (nd->flags & LOOKUP_RCU)
506 return nameidata_dentry_drop_rcu(nd, dentry);
507 return 0;
508}
509
510/**
511 * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
512 * @nd: nameidata pathwalk data to drop
513 * @Returns: 0 on success, -ECHILD on failure
514 *
515 * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
516 * nd->path should be the final element of the lookup, so nd->root is discarded.
517 * Must be called from rcu-walk context.
518 */
519static int nameidata_drop_rcu_last(struct nameidata *nd)
520{
521 struct dentry *dentry = nd->path.dentry;
522
523 BUG_ON(!(nd->flags & LOOKUP_RCU));
524 nd->flags &= ~LOOKUP_RCU;
525 nd->root.mnt = NULL;
526 spin_lock(&dentry->d_lock);
527 if (!__d_rcu_to_refcount(dentry, nd->seq))
528 goto err_unlock;
529 BUG_ON(nd->inode != dentry->d_inode);
530 spin_unlock(&dentry->d_lock);
531
532 mntget(nd->path.mnt);
533
534 rcu_read_unlock();
535 br_read_unlock(vfsmount_lock);
536
537 return 0;
538
539err_unlock:
540 spin_unlock(&dentry->d_lock);
541 rcu_read_unlock();
542 br_read_unlock(vfsmount_lock);
543 return -ECHILD;
544}
545
546/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
547static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
548{
549 if (likely(nd->flags & LOOKUP_RCU))
550 return nameidata_drop_rcu_last(nd);
551 return 0;
552}
553
554/**
378 * release_open_intent - free up open intent resources 555 * release_open_intent - free up open intent resources
379 * @nd: pointer to nameidata 556 * @nd: pointer to nameidata
380 */ 557 */
@@ -459,26 +636,40 @@ force_reval_path(struct path *path, struct nameidata *nd)
459 * short-cut DAC fails, then call ->permission() to do more 636 * short-cut DAC fails, then call ->permission() to do more
460 * complete permission check. 637 * complete permission check.
461 */ 638 */
462static int exec_permission(struct inode *inode) 639static inline int __exec_permission(struct inode *inode, int rcu)
463{ 640{
464 int ret; 641 int ret;
465 642
466 if (inode->i_op->permission) { 643 if (inode->i_op->permission) {
644 if (rcu)
645 return -ECHILD;
467 ret = inode->i_op->permission(inode, MAY_EXEC); 646 ret = inode->i_op->permission(inode, MAY_EXEC);
468 if (!ret) 647 if (!ret)
469 goto ok; 648 goto ok;
470 return ret; 649 return ret;
471 } 650 }
472 ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); 651 ret = __acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl, rcu);
473 if (!ret) 652 if (!ret)
474 goto ok; 653 goto ok;
654 if (rcu && ret == -ECHILD)
655 return ret;
475 656
476 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) 657 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
477 goto ok; 658 goto ok;
478 659
479 return ret; 660 return ret;
480ok: 661ok:
481 return security_inode_permission(inode, MAY_EXEC); 662 return security_inode_exec_permission(inode, rcu);
663}
664
665static int exec_permission(struct inode *inode)
666{
667 return __exec_permission(inode, 0);
668}
669
670static int exec_permission_rcu(struct inode *inode)
671{
672 return __exec_permission(inode, 1);
482} 673}
483 674
484static __always_inline void set_root(struct nameidata *nd) 675static __always_inline void set_root(struct nameidata *nd)
@@ -489,8 +680,20 @@ static __always_inline void set_root(struct nameidata *nd)
489 680
490static int link_path_walk(const char *, struct nameidata *); 681static int link_path_walk(const char *, struct nameidata *);
491 682
683static __always_inline void set_root_rcu(struct nameidata *nd)
684{
685 if (!nd->root.mnt) {
686 struct fs_struct *fs = current->fs;
687 spin_lock(&fs->lock);
688 nd->root = fs->root;
689 spin_unlock(&fs->lock);
690 }
691}
692
492static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 693static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
493{ 694{
695 int ret;
696
494 if (IS_ERR(link)) 697 if (IS_ERR(link))
495 goto fail; 698 goto fail;
496 699
@@ -500,8 +703,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
500 nd->path = nd->root; 703 nd->path = nd->root;
501 path_get(&nd->root); 704 path_get(&nd->root);
502 } 705 }
706 nd->inode = nd->path.dentry->d_inode;
503 707
504 return link_path_walk(link, nd); 708 ret = link_path_walk(link, nd);
709 return ret;
505fail: 710fail:
506 path_put(&nd->path); 711 path_put(&nd->path);
507 return PTR_ERR(link); 712 return PTR_ERR(link);
@@ -516,11 +721,12 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
516 721
517static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 722static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
518{ 723{
519 dput(nd->path.dentry); 724 if (!(nd->flags & LOOKUP_RCU)) {
520 if (nd->path.mnt != path->mnt) { 725 dput(nd->path.dentry);
521 mntput(nd->path.mnt); 726 if (nd->path.mnt != path->mnt)
522 nd->path.mnt = path->mnt; 727 mntput(nd->path.mnt);
523 } 728 }
729 nd->path.mnt = path->mnt;
524 nd->path.dentry = path->dentry; 730 nd->path.dentry = path->dentry;
525} 731}
526 732
@@ -535,9 +741,11 @@ __do_follow_link(struct path *path, struct nameidata *nd, void **p)
535 741
536 if (path->mnt != nd->path.mnt) { 742 if (path->mnt != nd->path.mnt) {
537 path_to_nameidata(path, nd); 743 path_to_nameidata(path, nd);
744 nd->inode = nd->path.dentry->d_inode;
538 dget(dentry); 745 dget(dentry);
539 } 746 }
540 mntget(path->mnt); 747 mntget(path->mnt);
748
541 nd->last_type = LAST_BIND; 749 nd->last_type = LAST_BIND;
542 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 750 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
543 error = PTR_ERR(*p); 751 error = PTR_ERR(*p);
@@ -591,6 +799,20 @@ loop:
591 return err; 799 return err;
592} 800}
593 801
802static int follow_up_rcu(struct path *path)
803{
804 struct vfsmount *parent;
805 struct dentry *mountpoint;
806
807 parent = path->mnt->mnt_parent;
808 if (parent == path->mnt)
809 return 0;
810 mountpoint = path->mnt->mnt_mountpoint;
811 path->dentry = mountpoint;
812 path->mnt = parent;
813 return 1;
814}
815
594int follow_up(struct path *path) 816int follow_up(struct path *path)
595{ 817{
596 struct vfsmount *parent; 818 struct vfsmount *parent;
@@ -615,6 +837,21 @@ int follow_up(struct path *path)
615/* 837/*
616 * serialization is taken care of in namespace.c 838 * serialization is taken care of in namespace.c
617 */ 839 */
840static void __follow_mount_rcu(struct nameidata *nd, struct path *path,
841 struct inode **inode)
842{
843 while (d_mountpoint(path->dentry)) {
844 struct vfsmount *mounted;
845 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
846 if (!mounted)
847 return;
848 path->mnt = mounted;
849 path->dentry = mounted->mnt_root;
850 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
851 *inode = path->dentry->d_inode;
852 }
853}
854
618static int __follow_mount(struct path *path) 855static int __follow_mount(struct path *path)
619{ 856{
620 int res = 0; 857 int res = 0;
@@ -660,7 +897,42 @@ int follow_down(struct path *path)
660 return 0; 897 return 0;
661} 898}
662 899
663static __always_inline void follow_dotdot(struct nameidata *nd) 900static int follow_dotdot_rcu(struct nameidata *nd)
901{
902 struct inode *inode = nd->inode;
903
904 set_root_rcu(nd);
905
906 while(1) {
907 if (nd->path.dentry == nd->root.dentry &&
908 nd->path.mnt == nd->root.mnt) {
909 break;
910 }
911 if (nd->path.dentry != nd->path.mnt->mnt_root) {
912 struct dentry *old = nd->path.dentry;
913 struct dentry *parent = old->d_parent;
914 unsigned seq;
915
916 seq = read_seqcount_begin(&parent->d_seq);
917 if (read_seqcount_retry(&old->d_seq, nd->seq))
918 return -ECHILD;
919 inode = parent->d_inode;
920 nd->path.dentry = parent;
921 nd->seq = seq;
922 break;
923 }
924 if (!follow_up_rcu(&nd->path))
925 break;
926 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
927 inode = nd->path.dentry->d_inode;
928 }
929 __follow_mount_rcu(nd, &nd->path, &inode);
930 nd->inode = inode;
931
932 return 0;
933}
934
935static void follow_dotdot(struct nameidata *nd)
664{ 936{
665 set_root(nd); 937 set_root(nd);
666 938
@@ -681,6 +953,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
681 break; 953 break;
682 } 954 }
683 follow_mount(&nd->path); 955 follow_mount(&nd->path);
956 nd->inode = nd->path.dentry->d_inode;
684} 957}
685 958
686/* 959/*
@@ -718,18 +991,17 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent,
718 * It _is_ time-critical. 991 * It _is_ time-critical.
719 */ 992 */
720static int do_lookup(struct nameidata *nd, struct qstr *name, 993static int do_lookup(struct nameidata *nd, struct qstr *name,
721 struct path *path) 994 struct path *path, struct inode **inode)
722{ 995{
723 struct vfsmount *mnt = nd->path.mnt; 996 struct vfsmount *mnt = nd->path.mnt;
724 struct dentry *dentry, *parent; 997 struct dentry *dentry, *parent = nd->path.dentry;
725 struct inode *dir; 998 struct inode *dir;
726 /* 999 /*
727 * See if the low-level filesystem might want 1000 * See if the low-level filesystem might want
728 * to use its own hash.. 1001 * to use its own hash..
729 */ 1002 */
730 if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { 1003 if (parent->d_op && parent->d_op->d_hash) {
731 int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, 1004 int err = parent->d_op->d_hash(parent, nd->inode, name);
732 nd->path.dentry->d_inode, name);
733 if (err < 0) 1005 if (err < 0)
734 return err; 1006 return err;
735 } 1007 }
@@ -739,21 +1011,48 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
739 * of a false negative due to a concurrent rename, we're going to 1011 * of a false negative due to a concurrent rename, we're going to
740 * do the non-racy lookup, below. 1012 * do the non-racy lookup, below.
741 */ 1013 */
742 dentry = __d_lookup(nd->path.dentry, name); 1014 if (nd->flags & LOOKUP_RCU) {
743 if (!dentry) 1015 unsigned seq;
744 goto need_lookup; 1016
1017 *inode = nd->inode;
1018 dentry = __d_lookup_rcu(parent, name, &seq, inode);
1019 if (!dentry) {
1020 if (nameidata_drop_rcu(nd))
1021 return -ECHILD;
1022 goto need_lookup;
1023 }
1024 /* Memory barrier in read_seqcount_begin of child is enough */
1025 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1026 return -ECHILD;
1027
1028 nd->seq = seq;
1029 if (dentry->d_op && dentry->d_op->d_revalidate) {
1030 /* We commonly drop rcu-walk here */
1031 if (nameidata_dentry_drop_rcu(nd, dentry))
1032 return -ECHILD;
1033 goto need_revalidate;
1034 }
1035 path->mnt = mnt;
1036 path->dentry = dentry;
1037 __follow_mount_rcu(nd, path, inode);
1038 } else {
1039 dentry = __d_lookup(parent, name);
1040 if (!dentry)
1041 goto need_lookup;
745found: 1042found:
746 if (dentry->d_op && dentry->d_op->d_revalidate) 1043 if (dentry->d_op && dentry->d_op->d_revalidate)
747 goto need_revalidate; 1044 goto need_revalidate;
748done: 1045done:
749 path->mnt = mnt; 1046 path->mnt = mnt;
750 path->dentry = dentry; 1047 path->dentry = dentry;
751 __follow_mount(path); 1048 __follow_mount(path);
1049 *inode = path->dentry->d_inode;
1050 }
752 return 0; 1051 return 0;
753 1052
754need_lookup: 1053need_lookup:
755 parent = nd->path.dentry;
756 dir = parent->d_inode; 1054 dir = parent->d_inode;
1055 BUG_ON(nd->inode != dir);
757 1056
758 mutex_lock(&dir->i_mutex); 1057 mutex_lock(&dir->i_mutex);
759 /* 1058 /*
@@ -815,7 +1114,6 @@ static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
815static int link_path_walk(const char *name, struct nameidata *nd) 1114static int link_path_walk(const char *name, struct nameidata *nd)
816{ 1115{
817 struct path next; 1116 struct path next;
818 struct inode *inode;
819 int err; 1117 int err;
820 unsigned int lookup_flags = nd->flags; 1118 unsigned int lookup_flags = nd->flags;
821 1119
@@ -824,18 +1122,28 @@ static int link_path_walk(const char *name, struct nameidata *nd)
824 if (!*name) 1122 if (!*name)
825 goto return_reval; 1123 goto return_reval;
826 1124
827 inode = nd->path.dentry->d_inode;
828 if (nd->depth) 1125 if (nd->depth)
829 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); 1126 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
830 1127
831 /* At this point we know we have a real path component. */ 1128 /* At this point we know we have a real path component. */
832 for(;;) { 1129 for(;;) {
1130 struct inode *inode;
833 unsigned long hash; 1131 unsigned long hash;
834 struct qstr this; 1132 struct qstr this;
835 unsigned int c; 1133 unsigned int c;
836 1134
837 nd->flags |= LOOKUP_CONTINUE; 1135 nd->flags |= LOOKUP_CONTINUE;
838 err = exec_permission(inode); 1136 if (nd->flags & LOOKUP_RCU) {
1137 err = exec_permission_rcu(nd->inode);
1138 if (err == -ECHILD) {
1139 if (nameidata_drop_rcu(nd))
1140 return -ECHILD;
1141 goto exec_again;
1142 }
1143 } else {
1144exec_again:
1145 err = exec_permission(nd->inode);
1146 }
839 if (err) 1147 if (err)
840 break; 1148 break;
841 1149
@@ -866,37 +1174,44 @@ static int link_path_walk(const char *name, struct nameidata *nd)
866 if (this.name[0] == '.') switch (this.len) { 1174 if (this.name[0] == '.') switch (this.len) {
867 default: 1175 default:
868 break; 1176 break;
869 case 2: 1177 case 2:
870 if (this.name[1] != '.') 1178 if (this.name[1] != '.')
871 break; 1179 break;
872 follow_dotdot(nd); 1180 if (nd->flags & LOOKUP_RCU) {
873 inode = nd->path.dentry->d_inode; 1181 if (follow_dotdot_rcu(nd))
1182 return -ECHILD;
1183 } else
1184 follow_dotdot(nd);
874 /* fallthrough */ 1185 /* fallthrough */
875 case 1: 1186 case 1:
876 continue; 1187 continue;
877 } 1188 }
878 /* This does the actual lookups.. */ 1189 /* This does the actual lookups.. */
879 err = do_lookup(nd, &this, &next); 1190 err = do_lookup(nd, &this, &next, &inode);
880 if (err) 1191 if (err)
881 break; 1192 break;
882
883 err = -ENOENT; 1193 err = -ENOENT;
884 inode = next.dentry->d_inode;
885 if (!inode) 1194 if (!inode)
886 goto out_dput; 1195 goto out_dput;
887 1196
888 if (inode->i_op->follow_link) { 1197 if (inode->i_op->follow_link) {
1198 /* We commonly drop rcu-walk here */
1199 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1200 return -ECHILD;
1201 BUG_ON(inode != next.dentry->d_inode);
889 err = do_follow_link(&next, nd); 1202 err = do_follow_link(&next, nd);
890 if (err) 1203 if (err)
891 goto return_err; 1204 goto return_err;
1205 nd->inode = nd->path.dentry->d_inode;
892 err = -ENOENT; 1206 err = -ENOENT;
893 inode = nd->path.dentry->d_inode; 1207 if (!nd->inode)
894 if (!inode)
895 break; 1208 break;
896 } else 1209 } else {
897 path_to_nameidata(&next, nd); 1210 path_to_nameidata(&next, nd);
1211 nd->inode = inode;
1212 }
898 err = -ENOTDIR; 1213 err = -ENOTDIR;
899 if (!inode->i_op->lookup) 1214 if (!nd->inode->i_op->lookup)
900 break; 1215 break;
901 continue; 1216 continue;
902 /* here ends the main loop */ 1217 /* here ends the main loop */
@@ -911,32 +1226,39 @@ last_component:
911 if (this.name[0] == '.') switch (this.len) { 1226 if (this.name[0] == '.') switch (this.len) {
912 default: 1227 default:
913 break; 1228 break;
914 case 2: 1229 case 2:
915 if (this.name[1] != '.') 1230 if (this.name[1] != '.')
916 break; 1231 break;
917 follow_dotdot(nd); 1232 if (nd->flags & LOOKUP_RCU) {
918 inode = nd->path.dentry->d_inode; 1233 if (follow_dotdot_rcu(nd))
1234 return -ECHILD;
1235 } else
1236 follow_dotdot(nd);
919 /* fallthrough */ 1237 /* fallthrough */
920 case 1: 1238 case 1:
921 goto return_reval; 1239 goto return_reval;
922 } 1240 }
923 err = do_lookup(nd, &this, &next); 1241 err = do_lookup(nd, &this, &next, &inode);
924 if (err) 1242 if (err)
925 break; 1243 break;
926 inode = next.dentry->d_inode;
927 if (follow_on_final(inode, lookup_flags)) { 1244 if (follow_on_final(inode, lookup_flags)) {
1245 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1246 return -ECHILD;
1247 BUG_ON(inode != next.dentry->d_inode);
928 err = do_follow_link(&next, nd); 1248 err = do_follow_link(&next, nd);
929 if (err) 1249 if (err)
930 goto return_err; 1250 goto return_err;
931 inode = nd->path.dentry->d_inode; 1251 nd->inode = nd->path.dentry->d_inode;
932 } else 1252 } else {
933 path_to_nameidata(&next, nd); 1253 path_to_nameidata(&next, nd);
1254 nd->inode = inode;
1255 }
934 err = -ENOENT; 1256 err = -ENOENT;
935 if (!inode) 1257 if (!nd->inode)
936 break; 1258 break;
937 if (lookup_flags & LOOKUP_DIRECTORY) { 1259 if (lookup_flags & LOOKUP_DIRECTORY) {
938 err = -ENOTDIR; 1260 err = -ENOTDIR;
939 if (!inode->i_op->lookup) 1261 if (!nd->inode->i_op->lookup)
940 break; 1262 break;
941 } 1263 }
942 goto return_base; 1264 goto return_base;
@@ -958,6 +1280,8 @@ return_reval:
958 */ 1280 */
959 if (nd->path.dentry && nd->path.dentry->d_sb && 1281 if (nd->path.dentry && nd->path.dentry->d_sb &&
960 (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { 1282 (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
1283 if (nameidata_drop_rcu_maybe(nd))
1284 return -ECHILD;
961 err = -ESTALE; 1285 err = -ESTALE;
962 /* Note: we do not d_invalidate() */ 1286 /* Note: we do not d_invalidate() */
963 if (!nd->path.dentry->d_op->d_revalidate( 1287 if (!nd->path.dentry->d_op->d_revalidate(
@@ -965,16 +1289,34 @@ return_reval:
965 break; 1289 break;
966 } 1290 }
967return_base: 1291return_base:
1292 if (nameidata_drop_rcu_last_maybe(nd))
1293 return -ECHILD;
968 return 0; 1294 return 0;
969out_dput: 1295out_dput:
970 path_put_conditional(&next, nd); 1296 if (!(nd->flags & LOOKUP_RCU))
1297 path_put_conditional(&next, nd);
971 break; 1298 break;
972 } 1299 }
973 path_put(&nd->path); 1300 if (!(nd->flags & LOOKUP_RCU))
1301 path_put(&nd->path);
974return_err: 1302return_err:
975 return err; 1303 return err;
976} 1304}
977 1305
1306static inline int path_walk_rcu(const char *name, struct nameidata *nd)
1307{
1308 current->total_link_count = 0;
1309
1310 return link_path_walk(name, nd);
1311}
1312
1313static inline int path_walk_simple(const char *name, struct nameidata *nd)
1314{
1315 current->total_link_count = 0;
1316
1317 return link_path_walk(name, nd);
1318}
1319
978static int path_walk(const char *name, struct nameidata *nd) 1320static int path_walk(const char *name, struct nameidata *nd)
979{ 1321{
980 struct path save = nd->path; 1322 struct path save = nd->path;
@@ -1000,6 +1342,88 @@ static int path_walk(const char *name, struct nameidata *nd)
1000 return result; 1342 return result;
1001} 1343}
1002 1344
1345static void path_finish_rcu(struct nameidata *nd)
1346{
1347 if (nd->flags & LOOKUP_RCU) {
1348 /* RCU dangling. Cancel it. */
1349 nd->flags &= ~LOOKUP_RCU;
1350 nd->root.mnt = NULL;
1351 rcu_read_unlock();
1352 br_read_unlock(vfsmount_lock);
1353 }
1354 if (nd->file)
1355 fput(nd->file);
1356}
1357
1358static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1359{
1360 int retval = 0;
1361 int fput_needed;
1362 struct file *file;
1363
1364 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1365 nd->flags = flags | LOOKUP_RCU;
1366 nd->depth = 0;
1367 nd->root.mnt = NULL;
1368 nd->file = NULL;
1369
1370 if (*name=='/') {
1371 struct fs_struct *fs = current->fs;
1372
1373 br_read_lock(vfsmount_lock);
1374 rcu_read_lock();
1375
1376 spin_lock(&fs->lock);
1377 nd->root = fs->root;
1378 nd->path = nd->root;
1379 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1380 spin_unlock(&fs->lock);
1381
1382 } else if (dfd == AT_FDCWD) {
1383 struct fs_struct *fs = current->fs;
1384
1385 br_read_lock(vfsmount_lock);
1386 rcu_read_lock();
1387
1388 spin_lock(&fs->lock);
1389 nd->path = fs->pwd;
1390 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1391 spin_unlock(&fs->lock);
1392 } else {
1393 struct dentry *dentry;
1394
1395 file = fget_light(dfd, &fput_needed);
1396 retval = -EBADF;
1397 if (!file)
1398 goto out_fail;
1399
1400 dentry = file->f_path.dentry;
1401
1402 retval = -ENOTDIR;
1403 if (!S_ISDIR(dentry->d_inode->i_mode))
1404 goto fput_fail;
1405
1406 retval = file_permission(file, MAY_EXEC);
1407 if (retval)
1408 goto fput_fail;
1409
1410 nd->path = file->f_path;
1411 if (fput_needed)
1412 nd->file = file;
1413
1414 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1415 br_read_lock(vfsmount_lock);
1416 rcu_read_lock();
1417 }
1418 nd->inode = nd->path.dentry->d_inode;
1419 return 0;
1420
1421fput_fail:
1422 fput_light(file, fput_needed);
1423out_fail:
1424 return retval;
1425}
1426
1003static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) 1427static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1004{ 1428{
1005 int retval = 0; 1429 int retval = 0;
@@ -1040,6 +1464,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
1040 1464
1041 fput_light(file, fput_needed); 1465 fput_light(file, fput_needed);
1042 } 1466 }
1467 nd->inode = nd->path.dentry->d_inode;
1043 return 0; 1468 return 0;
1044 1469
1045fput_fail: 1470fput_fail:
@@ -1052,16 +1477,53 @@ out_fail:
1052static int do_path_lookup(int dfd, const char *name, 1477static int do_path_lookup(int dfd, const char *name,
1053 unsigned int flags, struct nameidata *nd) 1478 unsigned int flags, struct nameidata *nd)
1054{ 1479{
1055 int retval = path_init(dfd, name, flags, nd); 1480 int retval;
1056 if (!retval) 1481
1057 retval = path_walk(name, nd); 1482 /*
1058 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1483 * Path walking is largely split up into 2 different synchronisation
1059 nd->path.dentry->d_inode)) 1484 * schemes, rcu-walk and ref-walk (explained in
1060 audit_inode(name, nd->path.dentry); 1485 * Documentation/filesystems/path-lookup.txt). These share much of the
1486 * path walk code, but some things particularly setup, cleanup, and
1487 * following mounts are sufficiently divergent that functions are
1488 * duplicated. Typically there is a function foo(), and its RCU
1489 * analogue, foo_rcu().
1490 *
1491 * -ECHILD is the error number of choice (just to avoid clashes) that
1492 * is returned if some aspect of an rcu-walk fails. Such an error must
1493 * be handled by restarting a traditional ref-walk (which will always
1494 * be able to complete).
1495 */
1496 retval = path_init_rcu(dfd, name, flags, nd);
1497 if (unlikely(retval))
1498 return retval;
1499 retval = path_walk_rcu(name, nd);
1500 path_finish_rcu(nd);
1061 if (nd->root.mnt) { 1501 if (nd->root.mnt) {
1062 path_put(&nd->root); 1502 path_put(&nd->root);
1063 nd->root.mnt = NULL; 1503 nd->root.mnt = NULL;
1064 } 1504 }
1505
1506 if (unlikely(retval == -ECHILD || retval == -ESTALE)) {
1507 /* slower, locked walk */
1508 if (retval == -ESTALE)
1509 flags |= LOOKUP_REVAL;
1510 retval = path_init(dfd, name, flags, nd);
1511 if (unlikely(retval))
1512 return retval;
1513 retval = path_walk(name, nd);
1514 if (nd->root.mnt) {
1515 path_put(&nd->root);
1516 nd->root.mnt = NULL;
1517 }
1518 }
1519
1520 if (likely(!retval)) {
1521 if (unlikely(!audit_dummy_context())) {
1522 if (nd->path.dentry && nd->inode)
1523 audit_inode(name, nd->path.dentry);
1524 }
1525 }
1526
1065 return retval; 1527 return retval;
1066} 1528}
1067 1529
@@ -1104,10 +1566,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1104 path_get(&nd->path); 1566 path_get(&nd->path);
1105 nd->root = nd->path; 1567 nd->root = nd->path;
1106 path_get(&nd->root); 1568 path_get(&nd->root);
1569 nd->inode = nd->path.dentry->d_inode;
1107 1570
1108 retval = path_walk(name, nd); 1571 retval = path_walk(name, nd);
1109 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1572 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1110 nd->path.dentry->d_inode)) 1573 nd->inode))
1111 audit_inode(name, nd->path.dentry); 1574 audit_inode(name, nd->path.dentry);
1112 1575
1113 path_put(&nd->root); 1576 path_put(&nd->root);
@@ -1488,6 +1951,7 @@ out_unlock:
1488 mutex_unlock(&dir->d_inode->i_mutex); 1951 mutex_unlock(&dir->d_inode->i_mutex);
1489 dput(nd->path.dentry); 1952 dput(nd->path.dentry);
1490 nd->path.dentry = path->dentry; 1953 nd->path.dentry = path->dentry;
1954
1491 if (error) 1955 if (error)
1492 return error; 1956 return error;
1493 /* Don't check for write permission, don't truncate */ 1957 /* Don't check for write permission, don't truncate */
@@ -1582,6 +2046,9 @@ exit:
1582 return ERR_PTR(error); 2046 return ERR_PTR(error);
1583} 2047}
1584 2048
2049/*
2050 * Handle O_CREAT case for do_filp_open
2051 */
1585static struct file *do_last(struct nameidata *nd, struct path *path, 2052static struct file *do_last(struct nameidata *nd, struct path *path,
1586 int open_flag, int acc_mode, 2053 int open_flag, int acc_mode,
1587 int mode, const char *pathname) 2054 int mode, const char *pathname)
@@ -1603,42 +2070,16 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1603 } 2070 }
1604 /* fallthrough */ 2071 /* fallthrough */
1605 case LAST_ROOT: 2072 case LAST_ROOT:
1606 if (open_flag & O_CREAT) 2073 goto exit;
1607 goto exit;
1608 /* fallthrough */
1609 case LAST_BIND: 2074 case LAST_BIND:
1610 audit_inode(pathname, dir); 2075 audit_inode(pathname, dir);
1611 goto ok; 2076 goto ok;
1612 } 2077 }
1613 2078
1614 /* trailing slashes? */ 2079 /* trailing slashes? */
1615 if (nd->last.name[nd->last.len]) { 2080 if (nd->last.name[nd->last.len])
1616 if (open_flag & O_CREAT) 2081 goto exit;
1617 goto exit;
1618 nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
1619 }
1620
1621 /* just plain open? */
1622 if (!(open_flag & O_CREAT)) {
1623 error = do_lookup(nd, &nd->last, path);
1624 if (error)
1625 goto exit;
1626 error = -ENOENT;
1627 if (!path->dentry->d_inode)
1628 goto exit_dput;
1629 if (path->dentry->d_inode->i_op->follow_link)
1630 return NULL;
1631 error = -ENOTDIR;
1632 if (nd->flags & LOOKUP_DIRECTORY) {
1633 if (!path->dentry->d_inode->i_op->lookup)
1634 goto exit_dput;
1635 }
1636 path_to_nameidata(path, nd);
1637 audit_inode(pathname, nd->path.dentry);
1638 goto ok;
1639 }
1640 2082
1641 /* OK, it's O_CREAT */
1642 mutex_lock(&dir->d_inode->i_mutex); 2083 mutex_lock(&dir->d_inode->i_mutex);
1643 2084
1644 path->dentry = lookup_hash(nd); 2085 path->dentry = lookup_hash(nd);
@@ -1709,8 +2150,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1709 return NULL; 2150 return NULL;
1710 2151
1711 path_to_nameidata(path, nd); 2152 path_to_nameidata(path, nd);
2153 nd->inode = path->dentry->d_inode;
1712 error = -EISDIR; 2154 error = -EISDIR;
1713 if (S_ISDIR(path->dentry->d_inode->i_mode)) 2155 if (S_ISDIR(nd->inode->i_mode))
1714 goto exit; 2156 goto exit;
1715ok: 2157ok:
1716 filp = finish_open(nd, open_flag, acc_mode); 2158 filp = finish_open(nd, open_flag, acc_mode);
@@ -1741,7 +2183,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
1741 struct path path; 2183 struct path path;
1742 int count = 0; 2184 int count = 0;
1743 int flag = open_to_namei_flags(open_flag); 2185 int flag = open_to_namei_flags(open_flag);
1744 int force_reval = 0; 2186 int flags;
1745 2187
1746 if (!(open_flag & O_CREAT)) 2188 if (!(open_flag & O_CREAT))
1747 mode = 0; 2189 mode = 0;
@@ -1770,54 +2212,84 @@ struct file *do_filp_open(int dfd, const char *pathname,
1770 if (open_flag & O_APPEND) 2212 if (open_flag & O_APPEND)
1771 acc_mode |= MAY_APPEND; 2213 acc_mode |= MAY_APPEND;
1772 2214
1773 /* find the parent */ 2215 flags = LOOKUP_OPEN;
1774reval: 2216 if (open_flag & O_CREAT) {
1775 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); 2217 flags |= LOOKUP_CREATE;
2218 if (open_flag & O_EXCL)
2219 flags |= LOOKUP_EXCL;
2220 }
2221 if (open_flag & O_DIRECTORY)
2222 flags |= LOOKUP_DIRECTORY;
2223 if (!(open_flag & O_NOFOLLOW))
2224 flags |= LOOKUP_FOLLOW;
2225
2226 filp = get_empty_filp();
2227 if (!filp)
2228 return ERR_PTR(-ENFILE);
2229
2230 filp->f_flags = open_flag;
2231 nd.intent.open.file = filp;
2232 nd.intent.open.flags = flag;
2233 nd.intent.open.create_mode = mode;
2234
2235 if (open_flag & O_CREAT)
2236 goto creat;
2237
2238 /* !O_CREAT, simple open */
2239 error = do_path_lookup(dfd, pathname, flags, &nd);
2240 if (unlikely(error))
2241 goto out_filp;
2242 error = -ELOOP;
2243 if (!(nd.flags & LOOKUP_FOLLOW)) {
2244 if (nd.inode->i_op->follow_link)
2245 goto out_path;
2246 }
2247 error = -ENOTDIR;
2248 if (nd.flags & LOOKUP_DIRECTORY) {
2249 if (!nd.inode->i_op->lookup)
2250 goto out_path;
2251 }
2252 audit_inode(pathname, nd.path.dentry);
2253 filp = finish_open(&nd, open_flag, acc_mode);
2254 return filp;
2255
2256creat:
2257 /* OK, have to create the file. Find the parent. */
2258 error = path_init_rcu(dfd, pathname,
2259 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
1776 if (error) 2260 if (error)
1777 return ERR_PTR(error); 2261 goto out_filp;
1778 if (force_reval) 2262 error = path_walk_rcu(pathname, &nd);
1779 nd.flags |= LOOKUP_REVAL; 2263 path_finish_rcu(&nd);
2264 if (unlikely(error == -ECHILD || error == -ESTALE)) {
2265 /* slower, locked walk */
2266 if (error == -ESTALE) {
2267reval:
2268 flags |= LOOKUP_REVAL;
2269 }
2270 error = path_init(dfd, pathname,
2271 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2272 if (error)
2273 goto out_filp;
1780 2274
1781 current->total_link_count = 0; 2275 error = path_walk_simple(pathname, &nd);
1782 error = link_path_walk(pathname, &nd);
1783 if (error) {
1784 filp = ERR_PTR(error);
1785 goto out;
1786 } 2276 }
1787 if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT)) 2277 if (unlikely(error))
2278 goto out_filp;
2279 if (unlikely(!audit_dummy_context()))
1788 audit_inode(pathname, nd.path.dentry); 2280 audit_inode(pathname, nd.path.dentry);
1789 2281
1790 /* 2282 /*
1791 * We have the parent and last component. 2283 * We have the parent and last component.
1792 */ 2284 */
1793 2285 nd.flags = flags;
1794 error = -ENFILE;
1795 filp = get_empty_filp();
1796 if (filp == NULL)
1797 goto exit_parent;
1798 nd.intent.open.file = filp;
1799 filp->f_flags = open_flag;
1800 nd.intent.open.flags = flag;
1801 nd.intent.open.create_mode = mode;
1802 nd.flags &= ~LOOKUP_PARENT;
1803 nd.flags |= LOOKUP_OPEN;
1804 if (open_flag & O_CREAT) {
1805 nd.flags |= LOOKUP_CREATE;
1806 if (open_flag & O_EXCL)
1807 nd.flags |= LOOKUP_EXCL;
1808 }
1809 if (open_flag & O_DIRECTORY)
1810 nd.flags |= LOOKUP_DIRECTORY;
1811 if (!(open_flag & O_NOFOLLOW))
1812 nd.flags |= LOOKUP_FOLLOW;
1813 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2286 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1814 while (unlikely(!filp)) { /* trailing symlink */ 2287 while (unlikely(!filp)) { /* trailing symlink */
1815 struct path holder; 2288 struct path holder;
1816 struct inode *inode = path.dentry->d_inode;
1817 void *cookie; 2289 void *cookie;
1818 error = -ELOOP; 2290 error = -ELOOP;
1819 /* S_ISDIR part is a temporary automount kludge */ 2291 /* S_ISDIR part is a temporary automount kludge */
1820 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode)) 2292 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(nd.inode->i_mode))
1821 goto exit_dput; 2293 goto exit_dput;
1822 if (count++ == 32) 2294 if (count++ == 32)
1823 goto exit_dput; 2295 goto exit_dput;
@@ -1838,36 +2310,33 @@ reval:
1838 goto exit_dput; 2310 goto exit_dput;
1839 error = __do_follow_link(&path, &nd, &cookie); 2311 error = __do_follow_link(&path, &nd, &cookie);
1840 if (unlikely(error)) { 2312 if (unlikely(error)) {
2313 if (!IS_ERR(cookie) && nd.inode->i_op->put_link)
2314 nd.inode->i_op->put_link(path.dentry, &nd, cookie);
1841 /* nd.path had been dropped */ 2315 /* nd.path had been dropped */
1842 if (!IS_ERR(cookie) && inode->i_op->put_link) 2316 nd.path = path;
1843 inode->i_op->put_link(path.dentry, &nd, cookie); 2317 goto out_path;
1844 path_put(&path);
1845 release_open_intent(&nd);
1846 filp = ERR_PTR(error);
1847 goto out;
1848 } 2318 }
1849 holder = path; 2319 holder = path;
1850 nd.flags &= ~LOOKUP_PARENT; 2320 nd.flags &= ~LOOKUP_PARENT;
1851 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2321 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1852 if (inode->i_op->put_link) 2322 if (nd.inode->i_op->put_link)
1853 inode->i_op->put_link(holder.dentry, &nd, cookie); 2323 nd.inode->i_op->put_link(holder.dentry, &nd, cookie);
1854 path_put(&holder); 2324 path_put(&holder);
1855 } 2325 }
1856out: 2326out:
1857 if (nd.root.mnt) 2327 if (nd.root.mnt)
1858 path_put(&nd.root); 2328 path_put(&nd.root);
1859 if (filp == ERR_PTR(-ESTALE) && !force_reval) { 2329 if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
1860 force_reval = 1;
1861 goto reval; 2330 goto reval;
1862 }
1863 return filp; 2331 return filp;
1864 2332
1865exit_dput: 2333exit_dput:
1866 path_put_conditional(&path, &nd); 2334 path_put_conditional(&path, &nd);
2335out_path:
2336 path_put(&nd.path);
2337out_filp:
1867 if (!IS_ERR(nd.intent.open.file)) 2338 if (!IS_ERR(nd.intent.open.file))
1868 release_open_intent(&nd); 2339 release_open_intent(&nd);
1869exit_parent:
1870 path_put(&nd.path);
1871 filp = ERR_PTR(error); 2340 filp = ERR_PTR(error);
1872 goto out; 2341 goto out;
1873} 2342}
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index ae4b0fd9033f..998e3a715bcc 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -402,6 +402,10 @@ static int proc_sys_compare(const struct dentry *parent,
402 const struct dentry *dentry, const struct inode *inode, 402 const struct dentry *dentry, const struct inode *inode,
403 unsigned int len, const char *str, const struct qstr *name) 403 unsigned int len, const char *str, const struct qstr *name)
404{ 404{
405 /* Although proc doesn't have negative dentries, rcu-walk means
406 * that inode here can be NULL */
407 if (!inode)
408 return 0;
405 if (name->len != len) 409 if (name->len != len)
406 return 1; 410 return 1;
407 if (memcmp(name->name, str, len)) 411 if (memcmp(name->name, str, len))