diff options
author | Nick Piggin <npiggin@kernel.dk> | 2011-01-07 01:49:52 -0500 |
---|---|---|
committer | Nick Piggin <npiggin@kernel.dk> | 2011-01-07 01:50:27 -0500 |
commit | 31e6b01f4183ff419a6d1f86177cbf4662347cec (patch) | |
tree | e215ec9af88352c55e024f784f3d9f8eb13fab85 /fs | |
parent | 3c22cd5709e8143444a6d08682a87f4c57902df3 (diff) |
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/dcache.c | 203 | ||||
-rw-r--r-- | fs/filesystems.c | 3 | ||||
-rw-r--r-- | fs/namei.c | 743 | ||||
-rw-r--r-- | fs/proc/proc_sysctl.c | 4 |
4 files changed, 794 insertions, 159 deletions
diff --git a/fs/dcache.c b/fs/dcache.c index dc0551c9755d..187fea040108 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
@@ -152,9 +152,23 @@ static void d_free(struct dentry *dentry) | |||
152 | call_rcu(&dentry->d_u.d_rcu, __d_free); | 152 | call_rcu(&dentry->d_u.d_rcu, __d_free); |
153 | } | 153 | } |
154 | 154 | ||
155 | /** | ||
156 | * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups | ||
157 | * After this call, in-progress rcu-walk path lookup will fail. This | ||
158 | * should be called after unhashing, and after changing d_inode (if | ||
159 | * the dentry has not already been unhashed). | ||
160 | */ | ||
161 | static inline void dentry_rcuwalk_barrier(struct dentry *dentry) | ||
162 | { | ||
163 | assert_spin_locked(&dentry->d_lock); | ||
164 | /* Go through a barrier */ | ||
165 | write_seqcount_barrier(&dentry->d_seq); | ||
166 | } | ||
167 | |||
155 | /* | 168 | /* |
156 | * Release the dentry's inode, using the filesystem | 169 | * Release the dentry's inode, using the filesystem |
157 | * d_iput() operation if defined. | 170 | * d_iput() operation if defined. Dentry has no refcount |
171 | * and is unhashed. | ||
158 | */ | 172 | */ |
159 | static void dentry_iput(struct dentry * dentry) | 173 | static void dentry_iput(struct dentry * dentry) |
160 | __releases(dentry->d_lock) | 174 | __releases(dentry->d_lock) |
@@ -179,6 +193,28 @@ static void dentry_iput(struct dentry * dentry) | |||
179 | } | 193 | } |
180 | 194 | ||
181 | /* | 195 | /* |
196 | * Release the dentry's inode, using the filesystem | ||
197 | * d_iput() operation if defined. dentry remains in-use. | ||
198 | */ | ||
199 | static void dentry_unlink_inode(struct dentry * dentry) | ||
200 | __releases(dentry->d_lock) | ||
201 | __releases(dcache_inode_lock) | ||
202 | { | ||
203 | struct inode *inode = dentry->d_inode; | ||
204 | dentry->d_inode = NULL; | ||
205 | list_del_init(&dentry->d_alias); | ||
206 | dentry_rcuwalk_barrier(dentry); | ||
207 | spin_unlock(&dentry->d_lock); | ||
208 | spin_unlock(&dcache_inode_lock); | ||
209 | if (!inode->i_nlink) | ||
210 | fsnotify_inoderemove(inode); | ||
211 | if (dentry->d_op && dentry->d_op->d_iput) | ||
212 | dentry->d_op->d_iput(dentry, inode); | ||
213 | else | ||
214 | iput(inode); | ||
215 | } | ||
216 | |||
217 | /* | ||
182 | * dentry_lru_(add|del|move_tail) must be called with d_lock held. | 218 | * dentry_lru_(add|del|move_tail) must be called with d_lock held. |
183 | */ | 219 | */ |
184 | static void dentry_lru_add(struct dentry *dentry) | 220 | static void dentry_lru_add(struct dentry *dentry) |
@@ -272,6 +308,7 @@ void __d_drop(struct dentry *dentry) | |||
272 | spin_lock(&dcache_hash_lock); | 308 | spin_lock(&dcache_hash_lock); |
273 | hlist_del_rcu(&dentry->d_hash); | 309 | hlist_del_rcu(&dentry->d_hash); |
274 | spin_unlock(&dcache_hash_lock); | 310 | spin_unlock(&dcache_hash_lock); |
311 | dentry_rcuwalk_barrier(dentry); | ||
275 | } | 312 | } |
276 | } | 313 | } |
277 | EXPORT_SYMBOL(__d_drop); | 314 | EXPORT_SYMBOL(__d_drop); |
@@ -309,6 +346,7 @@ relock: | |||
309 | spin_unlock(&dcache_inode_lock); | 346 | spin_unlock(&dcache_inode_lock); |
310 | goto relock; | 347 | goto relock; |
311 | } | 348 | } |
349 | |||
312 | if (ref) | 350 | if (ref) |
313 | dentry->d_count--; | 351 | dentry->d_count--; |
314 | /* if dentry was on the d_lru list delete it from there */ | 352 | /* if dentry was on the d_lru list delete it from there */ |
@@ -1221,6 +1259,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) | |||
1221 | dentry->d_count = 1; | 1259 | dentry->d_count = 1; |
1222 | dentry->d_flags = DCACHE_UNHASHED; | 1260 | dentry->d_flags = DCACHE_UNHASHED; |
1223 | spin_lock_init(&dentry->d_lock); | 1261 | spin_lock_init(&dentry->d_lock); |
1262 | seqcount_init(&dentry->d_seq); | ||
1224 | dentry->d_inode = NULL; | 1263 | dentry->d_inode = NULL; |
1225 | dentry->d_parent = NULL; | 1264 | dentry->d_parent = NULL; |
1226 | dentry->d_sb = NULL; | 1265 | dentry->d_sb = NULL; |
@@ -1269,6 +1308,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) | |||
1269 | if (inode) | 1308 | if (inode) |
1270 | list_add(&dentry->d_alias, &inode->i_dentry); | 1309 | list_add(&dentry->d_alias, &inode->i_dentry); |
1271 | dentry->d_inode = inode; | 1310 | dentry->d_inode = inode; |
1311 | dentry_rcuwalk_barrier(dentry); | ||
1272 | spin_unlock(&dentry->d_lock); | 1312 | spin_unlock(&dentry->d_lock); |
1273 | fsnotify_d_instantiate(dentry, inode); | 1313 | fsnotify_d_instantiate(dentry, inode); |
1274 | } | 1314 | } |
@@ -1611,6 +1651,111 @@ err_out: | |||
1611 | EXPORT_SYMBOL(d_add_ci); | 1651 | EXPORT_SYMBOL(d_add_ci); |
1612 | 1652 | ||
1613 | /** | 1653 | /** |
1654 | * __d_lookup_rcu - search for a dentry (racy, store-free) | ||
1655 | * @parent: parent dentry | ||
1656 | * @name: qstr of name we wish to find | ||
1657 | * @seq: returns d_seq value at the point where the dentry was found | ||
1658 | * @inode: returns dentry->d_inode when the inode was found valid. | ||
1659 | * Returns: dentry, or NULL | ||
1660 | * | ||
1661 | * __d_lookup_rcu is the dcache lookup function for rcu-walk name | ||
1662 | * resolution (store-free path walking) design described in | ||
1663 | * Documentation/filesystems/path-lookup.txt. | ||
1664 | * | ||
1665 | * This is not to be used outside core vfs. | ||
1666 | * | ||
1667 | * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock | ||
1668 | * held, and rcu_read_lock held. The returned dentry must not be stored into | ||
1669 | * without taking d_lock and checking d_seq sequence count against @seq | ||
1670 | * returned here. | ||
1671 | * | ||
1672 | * A refcount may be taken on the found dentry with the __d_rcu_to_refcount | ||
1673 | * function. | ||
1674 | * | ||
1675 | * Alternatively, __d_lookup_rcu may be called again to look up the child of | ||
1676 | * the returned dentry, so long as its parent's seqlock is checked after the | ||
1677 | * child is looked up. Thus, an interlocking stepping of sequence lock checks | ||
1678 | * is formed, giving integrity down the path walk. | ||
1679 | */ | ||
1680 | struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, | ||
1681 | unsigned *seq, struct inode **inode) | ||
1682 | { | ||
1683 | unsigned int len = name->len; | ||
1684 | unsigned int hash = name->hash; | ||
1685 | const unsigned char *str = name->name; | ||
1686 | struct hlist_head *head = d_hash(parent, hash); | ||
1687 | struct hlist_node *node; | ||
1688 | struct dentry *dentry; | ||
1689 | |||
1690 | /* | ||
1691 | * Note: There is significant duplication with __d_lookup which is | ||
1692 | * required to prevent single threaded performance regressions | ||
1693 | * especially on architectures where smp_rmb (in seqcounts) are costly. | ||
1694 | * Keep the two functions in sync. | ||
1695 | */ | ||
1696 | |||
1697 | /* | ||
1698 | * The hash list is protected using RCU. | ||
1699 | * | ||
1700 | * Carefully use d_seq when comparing a candidate dentry, to avoid | ||
1701 | * races with d_move(). | ||
1702 | * | ||
1703 | * It is possible that concurrent renames can mess up our list | ||
1704 | * walk here and result in missing our dentry, resulting in the | ||
1705 | * false-negative result. d_lookup() protects against concurrent | ||
1706 | * renames using rename_lock seqlock. | ||
1707 | * | ||
1708 | * See Documentation/vfs/dcache-locking.txt for more details. | ||
1709 | */ | ||
1710 | hlist_for_each_entry_rcu(dentry, node, head, d_hash) { | ||
1711 | struct inode *i; | ||
1712 | const char *tname; | ||
1713 | int tlen; | ||
1714 | |||
1715 | if (dentry->d_name.hash != hash) | ||
1716 | continue; | ||
1717 | |||
1718 | seqretry: | ||
1719 | *seq = read_seqcount_begin(&dentry->d_seq); | ||
1720 | if (dentry->d_parent != parent) | ||
1721 | continue; | ||
1722 | if (d_unhashed(dentry)) | ||
1723 | continue; | ||
1724 | tlen = dentry->d_name.len; | ||
1725 | tname = dentry->d_name.name; | ||
1726 | i = dentry->d_inode; | ||
1727 | /* | ||
1728 | * This seqcount check is required to ensure name and | ||
1729 | * len are loaded atomically, so as not to walk off the | ||
1730 | * edge of memory when walking. If we could load this | ||
1731 | * atomically some other way, we could drop this check. | ||
1732 | */ | ||
1733 | if (read_seqcount_retry(&dentry->d_seq, *seq)) | ||
1734 | goto seqretry; | ||
1735 | if (parent->d_op && parent->d_op->d_compare) { | ||
1736 | if (parent->d_op->d_compare(parent, *inode, | ||
1737 | dentry, i, | ||
1738 | tlen, tname, name)) | ||
1739 | continue; | ||
1740 | } else { | ||
1741 | if (tlen != len) | ||
1742 | continue; | ||
1743 | if (memcmp(tname, str, tlen)) | ||
1744 | continue; | ||
1745 | } | ||
1746 | /* | ||
1747 | * No extra seqcount check is required after the name | ||
1748 | * compare. The caller must perform a seqcount check in | ||
1749 | * order to do anything useful with the returned dentry | ||
1750 | * anyway. | ||
1751 | */ | ||
1752 | *inode = i; | ||
1753 | return dentry; | ||
1754 | } | ||
1755 | return NULL; | ||
1756 | } | ||
1757 | |||
1758 | /** | ||
1614 | * d_lookup - search for a dentry | 1759 | * d_lookup - search for a dentry |
1615 | * @parent: parent dentry | 1760 | * @parent: parent dentry |
1616 | * @name: qstr of name we wish to find | 1761 | * @name: qstr of name we wish to find |
@@ -1621,9 +1766,9 @@ EXPORT_SYMBOL(d_add_ci); | |||
1621 | * dentry is returned. The caller must use dput to free the entry when it has | 1766 | * dentry is returned. The caller must use dput to free the entry when it has |
1622 | * finished using it. %NULL is returned if the dentry does not exist. | 1767 | * finished using it. %NULL is returned if the dentry does not exist. |
1623 | */ | 1768 | */ |
1624 | struct dentry * d_lookup(struct dentry * parent, struct qstr * name) | 1769 | struct dentry *d_lookup(struct dentry *parent, struct qstr *name) |
1625 | { | 1770 | { |
1626 | struct dentry * dentry = NULL; | 1771 | struct dentry *dentry; |
1627 | unsigned seq; | 1772 | unsigned seq; |
1628 | 1773 | ||
1629 | do { | 1774 | do { |
@@ -1636,7 +1781,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name) | |||
1636 | } | 1781 | } |
1637 | EXPORT_SYMBOL(d_lookup); | 1782 | EXPORT_SYMBOL(d_lookup); |
1638 | 1783 | ||
1639 | /* | 1784 | /** |
1640 | * __d_lookup - search for a dentry (racy) | 1785 | * __d_lookup - search for a dentry (racy) |
1641 | * @parent: parent dentry | 1786 | * @parent: parent dentry |
1642 | * @name: qstr of name we wish to find | 1787 | * @name: qstr of name we wish to find |
@@ -1651,17 +1796,24 @@ EXPORT_SYMBOL(d_lookup); | |||
1651 | * | 1796 | * |
1652 | * __d_lookup callers must be commented. | 1797 | * __d_lookup callers must be commented. |
1653 | */ | 1798 | */ |
1654 | struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) | 1799 | struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) |
1655 | { | 1800 | { |
1656 | unsigned int len = name->len; | 1801 | unsigned int len = name->len; |
1657 | unsigned int hash = name->hash; | 1802 | unsigned int hash = name->hash; |
1658 | const unsigned char *str = name->name; | 1803 | const unsigned char *str = name->name; |
1659 | struct hlist_head *head = d_hash(parent,hash); | 1804 | struct hlist_head *head = d_hash(parent,hash); |
1660 | struct dentry *found = NULL; | ||
1661 | struct hlist_node *node; | 1805 | struct hlist_node *node; |
1806 | struct dentry *found = NULL; | ||
1662 | struct dentry *dentry; | 1807 | struct dentry *dentry; |
1663 | 1808 | ||
1664 | /* | 1809 | /* |
1810 | * Note: There is significant duplication with __d_lookup_rcu which is | ||
1811 | * required to prevent single threaded performance regressions | ||
1812 | * especially on architectures where smp_rmb (in seqcounts) are costly. | ||
1813 | * Keep the two functions in sync. | ||
1814 | */ | ||
1815 | |||
1816 | /* | ||
1665 | * The hash list is protected using RCU. | 1817 | * The hash list is protected using RCU. |
1666 | * | 1818 | * |
1667 | * Take d_lock when comparing a candidate dentry, to avoid races | 1819 | * Take d_lock when comparing a candidate dentry, to avoid races |
@@ -1677,24 +1829,15 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) | |||
1677 | rcu_read_lock(); | 1829 | rcu_read_lock(); |
1678 | 1830 | ||
1679 | hlist_for_each_entry_rcu(dentry, node, head, d_hash) { | 1831 | hlist_for_each_entry_rcu(dentry, node, head, d_hash) { |
1680 | struct qstr *qstr; | 1832 | const char *tname; |
1833 | int tlen; | ||
1681 | 1834 | ||
1682 | if (dentry->d_name.hash != hash) | 1835 | if (dentry->d_name.hash != hash) |
1683 | continue; | 1836 | continue; |
1684 | if (dentry->d_parent != parent) | ||
1685 | continue; | ||
1686 | 1837 | ||
1687 | spin_lock(&dentry->d_lock); | 1838 | spin_lock(&dentry->d_lock); |
1688 | |||
1689 | /* | ||
1690 | * Recheck the dentry after taking the lock - d_move may have | ||
1691 | * changed things. Don't bother checking the hash because | ||
1692 | * we're about to compare the whole name anyway. | ||
1693 | */ | ||
1694 | if (dentry->d_parent != parent) | 1839 | if (dentry->d_parent != parent) |
1695 | goto next; | 1840 | goto next; |
1696 | |||
1697 | /* non-existing due to RCU? */ | ||
1698 | if (d_unhashed(dentry)) | 1841 | if (d_unhashed(dentry)) |
1699 | goto next; | 1842 | goto next; |
1700 | 1843 | ||
@@ -1702,16 +1845,17 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) | |||
1702 | * It is safe to compare names since d_move() cannot | 1845 | * It is safe to compare names since d_move() cannot |
1703 | * change the qstr (protected by d_lock). | 1846 | * change the qstr (protected by d_lock). |
1704 | */ | 1847 | */ |
1705 | qstr = &dentry->d_name; | 1848 | tlen = dentry->d_name.len; |
1849 | tname = dentry->d_name.name; | ||
1706 | if (parent->d_op && parent->d_op->d_compare) { | 1850 | if (parent->d_op && parent->d_op->d_compare) { |
1707 | if (parent->d_op->d_compare(parent, parent->d_inode, | 1851 | if (parent->d_op->d_compare(parent, parent->d_inode, |
1708 | dentry, dentry->d_inode, | 1852 | dentry, dentry->d_inode, |
1709 | qstr->len, qstr->name, name)) | 1853 | tlen, tname, name)) |
1710 | goto next; | 1854 | goto next; |
1711 | } else { | 1855 | } else { |
1712 | if (qstr->len != len) | 1856 | if (tlen != len) |
1713 | goto next; | 1857 | goto next; |
1714 | if (memcmp(qstr->name, str, len)) | 1858 | if (memcmp(tname, str, tlen)) |
1715 | goto next; | 1859 | goto next; |
1716 | } | 1860 | } |
1717 | 1861 | ||
@@ -1821,7 +1965,7 @@ again: | |||
1821 | goto again; | 1965 | goto again; |
1822 | } | 1966 | } |
1823 | dentry->d_flags &= ~DCACHE_CANT_MOUNT; | 1967 | dentry->d_flags &= ~DCACHE_CANT_MOUNT; |
1824 | dentry_iput(dentry); | 1968 | dentry_unlink_inode(dentry); |
1825 | fsnotify_nameremove(dentry, isdir); | 1969 | fsnotify_nameremove(dentry, isdir); |
1826 | return; | 1970 | return; |
1827 | } | 1971 | } |
@@ -1884,7 +2028,9 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) | |||
1884 | BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ | 2028 | BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ |
1885 | 2029 | ||
1886 | spin_lock(&dentry->d_lock); | 2030 | spin_lock(&dentry->d_lock); |
2031 | write_seqcount_begin(&dentry->d_seq); | ||
1887 | memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); | 2032 | memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); |
2033 | write_seqcount_end(&dentry->d_seq); | ||
1888 | spin_unlock(&dentry->d_lock); | 2034 | spin_unlock(&dentry->d_lock); |
1889 | } | 2035 | } |
1890 | EXPORT_SYMBOL(dentry_update_name_case); | 2036 | EXPORT_SYMBOL(dentry_update_name_case); |
@@ -1997,6 +2143,9 @@ void d_move(struct dentry * dentry, struct dentry * target) | |||
1997 | 2143 | ||
1998 | dentry_lock_for_move(dentry, target); | 2144 | dentry_lock_for_move(dentry, target); |
1999 | 2145 | ||
2146 | write_seqcount_begin(&dentry->d_seq); | ||
2147 | write_seqcount_begin(&target->d_seq); | ||
2148 | |||
2000 | /* Move the dentry to the target hash queue, if on different bucket */ | 2149 | /* Move the dentry to the target hash queue, if on different bucket */ |
2001 | spin_lock(&dcache_hash_lock); | 2150 | spin_lock(&dcache_hash_lock); |
2002 | if (!d_unhashed(dentry)) | 2151 | if (!d_unhashed(dentry)) |
@@ -2005,6 +2154,7 @@ void d_move(struct dentry * dentry, struct dentry * target) | |||
2005 | spin_unlock(&dcache_hash_lock); | 2154 | spin_unlock(&dcache_hash_lock); |
2006 | 2155 | ||
2007 | /* Unhash the target: dput() will then get rid of it */ | 2156 | /* Unhash the target: dput() will then get rid of it */ |
2157 | /* __d_drop does write_seqcount_barrier, but they're OK to nest. */ | ||
2008 | __d_drop(target); | 2158 | __d_drop(target); |
2009 | 2159 | ||
2010 | list_del(&dentry->d_u.d_child); | 2160 | list_del(&dentry->d_u.d_child); |
@@ -2028,6 +2178,9 @@ void d_move(struct dentry * dentry, struct dentry * target) | |||
2028 | 2178 | ||
2029 | list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); | 2179 | list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); |
2030 | 2180 | ||
2181 | write_seqcount_end(&target->d_seq); | ||
2182 | write_seqcount_end(&dentry->d_seq); | ||
2183 | |||
2031 | dentry_unlock_parents_for_move(dentry, target); | 2184 | dentry_unlock_parents_for_move(dentry, target); |
2032 | spin_unlock(&target->d_lock); | 2185 | spin_unlock(&target->d_lock); |
2033 | fsnotify_d_move(dentry); | 2186 | fsnotify_d_move(dentry); |
@@ -2110,6 +2263,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) | |||
2110 | 2263 | ||
2111 | dentry_lock_for_move(anon, dentry); | 2264 | dentry_lock_for_move(anon, dentry); |
2112 | 2265 | ||
2266 | write_seqcount_begin(&dentry->d_seq); | ||
2267 | write_seqcount_begin(&anon->d_seq); | ||
2268 | |||
2113 | dparent = dentry->d_parent; | 2269 | dparent = dentry->d_parent; |
2114 | aparent = anon->d_parent; | 2270 | aparent = anon->d_parent; |
2115 | 2271 | ||
@@ -2130,6 +2286,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) | |||
2130 | else | 2286 | else |
2131 | INIT_LIST_HEAD(&anon->d_u.d_child); | 2287 | INIT_LIST_HEAD(&anon->d_u.d_child); |
2132 | 2288 | ||
2289 | write_seqcount_end(&dentry->d_seq); | ||
2290 | write_seqcount_end(&anon->d_seq); | ||
2291 | |||
2133 | dentry_unlock_parents_for_move(anon, dentry); | 2292 | dentry_unlock_parents_for_move(anon, dentry); |
2134 | spin_unlock(&dentry->d_lock); | 2293 | spin_unlock(&dentry->d_lock); |
2135 | 2294 | ||
diff --git a/fs/filesystems.c b/fs/filesystems.c index 68ba492d8eef..751d6b255a12 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c | |||
@@ -115,6 +115,9 @@ int unregister_filesystem(struct file_system_type * fs) | |||
115 | tmp = &(*tmp)->next; | 115 | tmp = &(*tmp)->next; |
116 | } | 116 | } |
117 | write_unlock(&file_systems_lock); | 117 | write_unlock(&file_systems_lock); |
118 | |||
119 | synchronize_rcu(); | ||
120 | |||
118 | return -EINVAL; | 121 | return -EINVAL; |
119 | } | 122 | } |
120 | 123 | ||
diff --git a/fs/namei.c b/fs/namei.c index 5642bc2be418..8d3f15b3a541 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname); | |||
169 | /* | 169 | /* |
170 | * This does basic POSIX ACL permission checking | 170 | * This does basic POSIX ACL permission checking |
171 | */ | 171 | */ |
172 | static int acl_permission_check(struct inode *inode, int mask, | 172 | static inline int __acl_permission_check(struct inode *inode, int mask, |
173 | int (*check_acl)(struct inode *inode, int mask)) | 173 | int (*check_acl)(struct inode *inode, int mask), int rcu) |
174 | { | 174 | { |
175 | umode_t mode = inode->i_mode; | 175 | umode_t mode = inode->i_mode; |
176 | 176 | ||
@@ -180,9 +180,13 @@ static int acl_permission_check(struct inode *inode, int mask, | |||
180 | mode >>= 6; | 180 | mode >>= 6; |
181 | else { | 181 | else { |
182 | if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { | 182 | if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { |
183 | int error = check_acl(inode, mask); | 183 | if (rcu) { |
184 | if (error != -EAGAIN) | 184 | return -ECHILD; |
185 | return error; | 185 | } else { |
186 | int error = check_acl(inode, mask); | ||
187 | if (error != -EAGAIN) | ||
188 | return error; | ||
189 | } | ||
186 | } | 190 | } |
187 | 191 | ||
188 | if (in_group_p(inode->i_gid)) | 192 | if (in_group_p(inode->i_gid)) |
@@ -197,6 +201,12 @@ static int acl_permission_check(struct inode *inode, int mask, | |||
197 | return -EACCES; | 201 | return -EACCES; |
198 | } | 202 | } |
199 | 203 | ||
204 | static inline int acl_permission_check(struct inode *inode, int mask, | ||
205 | int (*check_acl)(struct inode *inode, int mask)) | ||
206 | { | ||
207 | return __acl_permission_check(inode, mask, check_acl, 0); | ||
208 | } | ||
209 | |||
200 | /** | 210 | /** |
201 | * generic_permission - check for access rights on a Posix-like filesystem | 211 | * generic_permission - check for access rights on a Posix-like filesystem |
202 | * @inode: inode to check access rights for | 212 | * @inode: inode to check access rights for |
@@ -375,6 +385,173 @@ void path_put(struct path *path) | |||
375 | EXPORT_SYMBOL(path_put); | 385 | EXPORT_SYMBOL(path_put); |
376 | 386 | ||
377 | /** | 387 | /** |
388 | * nameidata_drop_rcu - drop this nameidata out of rcu-walk | ||
389 | * @nd: nameidata pathwalk data to drop | ||
390 | * @Returns: 0 on success, -ECHILD on failure | ||
391 | * | ||
392 | * Path walking has 2 modes, rcu-walk and ref-walk (see | ||
393 | * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt | ||
394 | * to drop out of rcu-walk mode and take normal reference counts on dentries | ||
395 | * and vfsmounts to transition to ref-walk mode. __drop_rcu* functions take | ||
396 | * refcounts at the last known good point before rcu-walk got stuck, so | ||
397 | * ref-walk may continue from there. If this is not successful (eg. a seqcount | ||
398 | * has changed), then failure is returned and path walk restarts from the | ||
399 | * beginning in ref-walk mode. | ||
400 | * | ||
401 | * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into | ||
402 | * ref-walk. Must be called from rcu-walk context. | ||
403 | */ | ||
404 | static int nameidata_drop_rcu(struct nameidata *nd) | ||
405 | { | ||
406 | struct fs_struct *fs = current->fs; | ||
407 | struct dentry *dentry = nd->path.dentry; | ||
408 | |||
409 | BUG_ON(!(nd->flags & LOOKUP_RCU)); | ||
410 | if (nd->root.mnt) { | ||
411 | spin_lock(&fs->lock); | ||
412 | if (nd->root.mnt != fs->root.mnt || | ||
413 | nd->root.dentry != fs->root.dentry) | ||
414 | goto err_root; | ||
415 | } | ||
416 | spin_lock(&dentry->d_lock); | ||
417 | if (!__d_rcu_to_refcount(dentry, nd->seq)) | ||
418 | goto err; | ||
419 | BUG_ON(nd->inode != dentry->d_inode); | ||
420 | spin_unlock(&dentry->d_lock); | ||
421 | if (nd->root.mnt) { | ||
422 | path_get(&nd->root); | ||
423 | spin_unlock(&fs->lock); | ||
424 | } | ||
425 | mntget(nd->path.mnt); | ||
426 | |||
427 | rcu_read_unlock(); | ||
428 | br_read_unlock(vfsmount_lock); | ||
429 | nd->flags &= ~LOOKUP_RCU; | ||
430 | return 0; | ||
431 | err: | ||
432 | spin_unlock(&dentry->d_lock); | ||
433 | err_root: | ||
434 | if (nd->root.mnt) | ||
435 | spin_unlock(&fs->lock); | ||
436 | return -ECHILD; | ||
437 | } | ||
438 | |||
439 | /* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ | ||
440 | static inline int nameidata_drop_rcu_maybe(struct nameidata *nd) | ||
441 | { | ||
442 | if (nd->flags & LOOKUP_RCU) | ||
443 | return nameidata_drop_rcu(nd); | ||
444 | return 0; | ||
445 | } | ||
446 | |||
447 | /** | ||
448 | * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk | ||
449 | * @nd: nameidata pathwalk data to drop | ||
450 | * @dentry: dentry to drop | ||
451 | * @Returns: 0 on success, -ECHILD on failure | ||
452 | * | ||
453 | * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root, | ||
454 | * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on | ||
455 | * @nd. Must be called from rcu-walk context. | ||
456 | */ | ||
457 | static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry) | ||
458 | { | ||
459 | struct fs_struct *fs = current->fs; | ||
460 | struct dentry *parent = nd->path.dentry; | ||
461 | |||
462 | BUG_ON(!(nd->flags & LOOKUP_RCU)); | ||
463 | if (nd->root.mnt) { | ||
464 | spin_lock(&fs->lock); | ||
465 | if (nd->root.mnt != fs->root.mnt || | ||
466 | nd->root.dentry != fs->root.dentry) | ||
467 | goto err_root; | ||
468 | } | ||
469 | spin_lock(&parent->d_lock); | ||
470 | spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); | ||
471 | if (!__d_rcu_to_refcount(dentry, nd->seq)) | ||
472 | goto err; | ||
473 | /* | ||
474 | * If the sequence check on the child dentry passed, then the child has | ||
475 | * not been removed from its parent. This means the parent dentry must | ||
476 | * be valid and able to take a reference at this point. | ||
477 | */ | ||
478 | BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); | ||
479 | BUG_ON(!parent->d_count); | ||
480 | parent->d_count++; | ||
481 | spin_unlock(&dentry->d_lock); | ||
482 | spin_unlock(&parent->d_lock); | ||
483 | if (nd->root.mnt) { | ||
484 | path_get(&nd->root); | ||
485 | spin_unlock(&fs->lock); | ||
486 | } | ||
487 | mntget(nd->path.mnt); | ||
488 | |||
489 | rcu_read_unlock(); | ||
490 | br_read_unlock(vfsmount_lock); | ||
491 | nd->flags &= ~LOOKUP_RCU; | ||
492 | return 0; | ||
493 | err: | ||
494 | spin_unlock(&dentry->d_lock); | ||
495 | spin_unlock(&parent->d_lock); | ||
496 | err_root: | ||
497 | if (nd->root.mnt) | ||
498 | spin_unlock(&fs->lock); | ||
499 | return -ECHILD; | ||
500 | } | ||
501 | |||
502 | /* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ | ||
503 | static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) | ||
504 | { | ||
505 | if (nd->flags & LOOKUP_RCU) | ||
506 | return nameidata_dentry_drop_rcu(nd, dentry); | ||
507 | return 0; | ||
508 | } | ||
509 | |||
510 | /** | ||
511 | * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk | ||
512 | * @nd: nameidata pathwalk data to drop | ||
513 | * @Returns: 0 on success, -ECHILD on failure | ||
514 | * | ||
515 | * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk. | ||
516 | * nd->path should be the final element of the lookup, so nd->root is discarded. | ||
517 | * Must be called from rcu-walk context. | ||
518 | */ | ||
519 | static int nameidata_drop_rcu_last(struct nameidata *nd) | ||
520 | { | ||
521 | struct dentry *dentry = nd->path.dentry; | ||
522 | |||
523 | BUG_ON(!(nd->flags & LOOKUP_RCU)); | ||
524 | nd->flags &= ~LOOKUP_RCU; | ||
525 | nd->root.mnt = NULL; | ||
526 | spin_lock(&dentry->d_lock); | ||
527 | if (!__d_rcu_to_refcount(dentry, nd->seq)) | ||
528 | goto err_unlock; | ||
529 | BUG_ON(nd->inode != dentry->d_inode); | ||
530 | spin_unlock(&dentry->d_lock); | ||
531 | |||
532 | mntget(nd->path.mnt); | ||
533 | |||
534 | rcu_read_unlock(); | ||
535 | br_read_unlock(vfsmount_lock); | ||
536 | |||
537 | return 0; | ||
538 | |||
539 | err_unlock: | ||
540 | spin_unlock(&dentry->d_lock); | ||
541 | rcu_read_unlock(); | ||
542 | br_read_unlock(vfsmount_lock); | ||
543 | return -ECHILD; | ||
544 | } | ||
545 | |||
546 | /* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ | ||
547 | static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd) | ||
548 | { | ||
549 | if (likely(nd->flags & LOOKUP_RCU)) | ||
550 | return nameidata_drop_rcu_last(nd); | ||
551 | return 0; | ||
552 | } | ||
553 | |||
554 | /** | ||
378 | * release_open_intent - free up open intent resources | 555 | * release_open_intent - free up open intent resources |
379 | * @nd: pointer to nameidata | 556 | * @nd: pointer to nameidata |
380 | */ | 557 | */ |
@@ -459,26 +636,40 @@ force_reval_path(struct path *path, struct nameidata *nd) | |||
459 | * short-cut DAC fails, then call ->permission() to do more | 636 | * short-cut DAC fails, then call ->permission() to do more |
460 | * complete permission check. | 637 | * complete permission check. |
461 | */ | 638 | */ |
462 | static int exec_permission(struct inode *inode) | 639 | static inline int __exec_permission(struct inode *inode, int rcu) |
463 | { | 640 | { |
464 | int ret; | 641 | int ret; |
465 | 642 | ||
466 | if (inode->i_op->permission) { | 643 | if (inode->i_op->permission) { |
644 | if (rcu) | ||
645 | return -ECHILD; | ||
467 | ret = inode->i_op->permission(inode, MAY_EXEC); | 646 | ret = inode->i_op->permission(inode, MAY_EXEC); |
468 | if (!ret) | 647 | if (!ret) |
469 | goto ok; | 648 | goto ok; |
470 | return ret; | 649 | return ret; |
471 | } | 650 | } |
472 | ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); | 651 | ret = __acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl, rcu); |
473 | if (!ret) | 652 | if (!ret) |
474 | goto ok; | 653 | goto ok; |
654 | if (rcu && ret == -ECHILD) | ||
655 | return ret; | ||
475 | 656 | ||
476 | if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) | 657 | if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) |
477 | goto ok; | 658 | goto ok; |
478 | 659 | ||
479 | return ret; | 660 | return ret; |
480 | ok: | 661 | ok: |
481 | return security_inode_permission(inode, MAY_EXEC); | 662 | return security_inode_exec_permission(inode, rcu); |
663 | } | ||
664 | |||
665 | static int exec_permission(struct inode *inode) | ||
666 | { | ||
667 | return __exec_permission(inode, 0); | ||
668 | } | ||
669 | |||
670 | static int exec_permission_rcu(struct inode *inode) | ||
671 | { | ||
672 | return __exec_permission(inode, 1); | ||
482 | } | 673 | } |
483 | 674 | ||
484 | static __always_inline void set_root(struct nameidata *nd) | 675 | static __always_inline void set_root(struct nameidata *nd) |
@@ -489,8 +680,20 @@ static __always_inline void set_root(struct nameidata *nd) | |||
489 | 680 | ||
490 | static int link_path_walk(const char *, struct nameidata *); | 681 | static int link_path_walk(const char *, struct nameidata *); |
491 | 682 | ||
683 | static __always_inline void set_root_rcu(struct nameidata *nd) | ||
684 | { | ||
685 | if (!nd->root.mnt) { | ||
686 | struct fs_struct *fs = current->fs; | ||
687 | spin_lock(&fs->lock); | ||
688 | nd->root = fs->root; | ||
689 | spin_unlock(&fs->lock); | ||
690 | } | ||
691 | } | ||
692 | |||
492 | static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) | 693 | static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) |
493 | { | 694 | { |
695 | int ret; | ||
696 | |||
494 | if (IS_ERR(link)) | 697 | if (IS_ERR(link)) |
495 | goto fail; | 698 | goto fail; |
496 | 699 | ||
@@ -500,8 +703,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l | |||
500 | nd->path = nd->root; | 703 | nd->path = nd->root; |
501 | path_get(&nd->root); | 704 | path_get(&nd->root); |
502 | } | 705 | } |
706 | nd->inode = nd->path.dentry->d_inode; | ||
503 | 707 | ||
504 | return link_path_walk(link, nd); | 708 | ret = link_path_walk(link, nd); |
709 | return ret; | ||
505 | fail: | 710 | fail: |
506 | path_put(&nd->path); | 711 | path_put(&nd->path); |
507 | return PTR_ERR(link); | 712 | return PTR_ERR(link); |
@@ -516,11 +721,12 @@ static void path_put_conditional(struct path *path, struct nameidata *nd) | |||
516 | 721 | ||
517 | static inline void path_to_nameidata(struct path *path, struct nameidata *nd) | 722 | static inline void path_to_nameidata(struct path *path, struct nameidata *nd) |
518 | { | 723 | { |
519 | dput(nd->path.dentry); | 724 | if (!(nd->flags & LOOKUP_RCU)) { |
520 | if (nd->path.mnt != path->mnt) { | 725 | dput(nd->path.dentry); |
521 | mntput(nd->path.mnt); | 726 | if (nd->path.mnt != path->mnt) |
522 | nd->path.mnt = path->mnt; | 727 | mntput(nd->path.mnt); |
523 | } | 728 | } |
729 | nd->path.mnt = path->mnt; | ||
524 | nd->path.dentry = path->dentry; | 730 | nd->path.dentry = path->dentry; |
525 | } | 731 | } |
526 | 732 | ||
@@ -535,9 +741,11 @@ __do_follow_link(struct path *path, struct nameidata *nd, void **p) | |||
535 | 741 | ||
536 | if (path->mnt != nd->path.mnt) { | 742 | if (path->mnt != nd->path.mnt) { |
537 | path_to_nameidata(path, nd); | 743 | path_to_nameidata(path, nd); |
744 | nd->inode = nd->path.dentry->d_inode; | ||
538 | dget(dentry); | 745 | dget(dentry); |
539 | } | 746 | } |
540 | mntget(path->mnt); | 747 | mntget(path->mnt); |
748 | |||
541 | nd->last_type = LAST_BIND; | 749 | nd->last_type = LAST_BIND; |
542 | *p = dentry->d_inode->i_op->follow_link(dentry, nd); | 750 | *p = dentry->d_inode->i_op->follow_link(dentry, nd); |
543 | error = PTR_ERR(*p); | 751 | error = PTR_ERR(*p); |
@@ -591,6 +799,20 @@ loop: | |||
591 | return err; | 799 | return err; |
592 | } | 800 | } |
593 | 801 | ||
802 | static int follow_up_rcu(struct path *path) | ||
803 | { | ||
804 | struct vfsmount *parent; | ||
805 | struct dentry *mountpoint; | ||
806 | |||
807 | parent = path->mnt->mnt_parent; | ||
808 | if (parent == path->mnt) | ||
809 | return 0; | ||
810 | mountpoint = path->mnt->mnt_mountpoint; | ||
811 | path->dentry = mountpoint; | ||
812 | path->mnt = parent; | ||
813 | return 1; | ||
814 | } | ||
815 | |||
594 | int follow_up(struct path *path) | 816 | int follow_up(struct path *path) |
595 | { | 817 | { |
596 | struct vfsmount *parent; | 818 | struct vfsmount *parent; |
@@ -615,6 +837,21 @@ int follow_up(struct path *path) | |||
615 | /* | 837 | /* |
616 | * serialization is taken care of in namespace.c | 838 | * serialization is taken care of in namespace.c |
617 | */ | 839 | */ |
840 | static void __follow_mount_rcu(struct nameidata *nd, struct path *path, | ||
841 | struct inode **inode) | ||
842 | { | ||
843 | while (d_mountpoint(path->dentry)) { | ||
844 | struct vfsmount *mounted; | ||
845 | mounted = __lookup_mnt(path->mnt, path->dentry, 1); | ||
846 | if (!mounted) | ||
847 | return; | ||
848 | path->mnt = mounted; | ||
849 | path->dentry = mounted->mnt_root; | ||
850 | nd->seq = read_seqcount_begin(&path->dentry->d_seq); | ||
851 | *inode = path->dentry->d_inode; | ||
852 | } | ||
853 | } | ||
854 | |||
618 | static int __follow_mount(struct path *path) | 855 | static int __follow_mount(struct path *path) |
619 | { | 856 | { |
620 | int res = 0; | 857 | int res = 0; |
@@ -660,7 +897,42 @@ int follow_down(struct path *path) | |||
660 | return 0; | 897 | return 0; |
661 | } | 898 | } |
662 | 899 | ||
663 | static __always_inline void follow_dotdot(struct nameidata *nd) | 900 | static int follow_dotdot_rcu(struct nameidata *nd) |
901 | { | ||
902 | struct inode *inode = nd->inode; | ||
903 | |||
904 | set_root_rcu(nd); | ||
905 | |||
906 | while(1) { | ||
907 | if (nd->path.dentry == nd->root.dentry && | ||
908 | nd->path.mnt == nd->root.mnt) { | ||
909 | break; | ||
910 | } | ||
911 | if (nd->path.dentry != nd->path.mnt->mnt_root) { | ||
912 | struct dentry *old = nd->path.dentry; | ||
913 | struct dentry *parent = old->d_parent; | ||
914 | unsigned seq; | ||
915 | |||
916 | seq = read_seqcount_begin(&parent->d_seq); | ||
917 | if (read_seqcount_retry(&old->d_seq, nd->seq)) | ||
918 | return -ECHILD; | ||
919 | inode = parent->d_inode; | ||
920 | nd->path.dentry = parent; | ||
921 | nd->seq = seq; | ||
922 | break; | ||
923 | } | ||
924 | if (!follow_up_rcu(&nd->path)) | ||
925 | break; | ||
926 | nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); | ||
927 | inode = nd->path.dentry->d_inode; | ||
928 | } | ||
929 | __follow_mount_rcu(nd, &nd->path, &inode); | ||
930 | nd->inode = inode; | ||
931 | |||
932 | return 0; | ||
933 | } | ||
934 | |||
935 | static void follow_dotdot(struct nameidata *nd) | ||
664 | { | 936 | { |
665 | set_root(nd); | 937 | set_root(nd); |
666 | 938 | ||
@@ -681,6 +953,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd) | |||
681 | break; | 953 | break; |
682 | } | 954 | } |
683 | follow_mount(&nd->path); | 955 | follow_mount(&nd->path); |
956 | nd->inode = nd->path.dentry->d_inode; | ||
684 | } | 957 | } |
685 | 958 | ||
686 | /* | 959 | /* |
@@ -718,18 +991,17 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent, | |||
718 | * It _is_ time-critical. | 991 | * It _is_ time-critical. |
719 | */ | 992 | */ |
720 | static int do_lookup(struct nameidata *nd, struct qstr *name, | 993 | static int do_lookup(struct nameidata *nd, struct qstr *name, |
721 | struct path *path) | 994 | struct path *path, struct inode **inode) |
722 | { | 995 | { |
723 | struct vfsmount *mnt = nd->path.mnt; | 996 | struct vfsmount *mnt = nd->path.mnt; |
724 | struct dentry *dentry, *parent; | 997 | struct dentry *dentry, *parent = nd->path.dentry; |
725 | struct inode *dir; | 998 | struct inode *dir; |
726 | /* | 999 | /* |
727 | * See if the low-level filesystem might want | 1000 | * See if the low-level filesystem might want |
728 | * to use its own hash.. | 1001 | * to use its own hash.. |
729 | */ | 1002 | */ |
730 | if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { | 1003 | if (parent->d_op && parent->d_op->d_hash) { |
731 | int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, | 1004 | int err = parent->d_op->d_hash(parent, nd->inode, name); |
732 | nd->path.dentry->d_inode, name); | ||
733 | if (err < 0) | 1005 | if (err < 0) |
734 | return err; | 1006 | return err; |
735 | } | 1007 | } |
@@ -739,21 +1011,48 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, | |||
739 | * of a false negative due to a concurrent rename, we're going to | 1011 | * of a false negative due to a concurrent rename, we're going to |
740 | * do the non-racy lookup, below. | 1012 | * do the non-racy lookup, below. |
741 | */ | 1013 | */ |
742 | dentry = __d_lookup(nd->path.dentry, name); | 1014 | if (nd->flags & LOOKUP_RCU) { |
743 | if (!dentry) | 1015 | unsigned seq; |
744 | goto need_lookup; | 1016 | |
1017 | *inode = nd->inode; | ||
1018 | dentry = __d_lookup_rcu(parent, name, &seq, inode); | ||
1019 | if (!dentry) { | ||
1020 | if (nameidata_drop_rcu(nd)) | ||
1021 | return -ECHILD; | ||
1022 | goto need_lookup; | ||
1023 | } | ||
1024 | /* Memory barrier in read_seqcount_begin of child is enough */ | ||
1025 | if (__read_seqcount_retry(&parent->d_seq, nd->seq)) | ||
1026 | return -ECHILD; | ||
1027 | |||
1028 | nd->seq = seq; | ||
1029 | if (dentry->d_op && dentry->d_op->d_revalidate) { | ||
1030 | /* We commonly drop rcu-walk here */ | ||
1031 | if (nameidata_dentry_drop_rcu(nd, dentry)) | ||
1032 | return -ECHILD; | ||
1033 | goto need_revalidate; | ||
1034 | } | ||
1035 | path->mnt = mnt; | ||
1036 | path->dentry = dentry; | ||
1037 | __follow_mount_rcu(nd, path, inode); | ||
1038 | } else { | ||
1039 | dentry = __d_lookup(parent, name); | ||
1040 | if (!dentry) | ||
1041 | goto need_lookup; | ||
745 | found: | 1042 | found: |
746 | if (dentry->d_op && dentry->d_op->d_revalidate) | 1043 | if (dentry->d_op && dentry->d_op->d_revalidate) |
747 | goto need_revalidate; | 1044 | goto need_revalidate; |
748 | done: | 1045 | done: |
749 | path->mnt = mnt; | 1046 | path->mnt = mnt; |
750 | path->dentry = dentry; | 1047 | path->dentry = dentry; |
751 | __follow_mount(path); | 1048 | __follow_mount(path); |
1049 | *inode = path->dentry->d_inode; | ||
1050 | } | ||
752 | return 0; | 1051 | return 0; |
753 | 1052 | ||
754 | need_lookup: | 1053 | need_lookup: |
755 | parent = nd->path.dentry; | ||
756 | dir = parent->d_inode; | 1054 | dir = parent->d_inode; |
1055 | BUG_ON(nd->inode != dir); | ||
757 | 1056 | ||
758 | mutex_lock(&dir->i_mutex); | 1057 | mutex_lock(&dir->i_mutex); |
759 | /* | 1058 | /* |
@@ -815,7 +1114,6 @@ static inline int follow_on_final(struct inode *inode, unsigned lookup_flags) | |||
815 | static int link_path_walk(const char *name, struct nameidata *nd) | 1114 | static int link_path_walk(const char *name, struct nameidata *nd) |
816 | { | 1115 | { |
817 | struct path next; | 1116 | struct path next; |
818 | struct inode *inode; | ||
819 | int err; | 1117 | int err; |
820 | unsigned int lookup_flags = nd->flags; | 1118 | unsigned int lookup_flags = nd->flags; |
821 | 1119 | ||
@@ -824,18 +1122,28 @@ static int link_path_walk(const char *name, struct nameidata *nd) | |||
824 | if (!*name) | 1122 | if (!*name) |
825 | goto return_reval; | 1123 | goto return_reval; |
826 | 1124 | ||
827 | inode = nd->path.dentry->d_inode; | ||
828 | if (nd->depth) | 1125 | if (nd->depth) |
829 | lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); | 1126 | lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); |
830 | 1127 | ||
831 | /* At this point we know we have a real path component. */ | 1128 | /* At this point we know we have a real path component. */ |
832 | for(;;) { | 1129 | for(;;) { |
1130 | struct inode *inode; | ||
833 | unsigned long hash; | 1131 | unsigned long hash; |
834 | struct qstr this; | 1132 | struct qstr this; |
835 | unsigned int c; | 1133 | unsigned int c; |
836 | 1134 | ||
837 | nd->flags |= LOOKUP_CONTINUE; | 1135 | nd->flags |= LOOKUP_CONTINUE; |
838 | err = exec_permission(inode); | 1136 | if (nd->flags & LOOKUP_RCU) { |
1137 | err = exec_permission_rcu(nd->inode); | ||
1138 | if (err == -ECHILD) { | ||
1139 | if (nameidata_drop_rcu(nd)) | ||
1140 | return -ECHILD; | ||
1141 | goto exec_again; | ||
1142 | } | ||
1143 | } else { | ||
1144 | exec_again: | ||
1145 | err = exec_permission(nd->inode); | ||
1146 | } | ||
839 | if (err) | 1147 | if (err) |
840 | break; | 1148 | break; |
841 | 1149 | ||
@@ -866,37 +1174,44 @@ static int link_path_walk(const char *name, struct nameidata *nd) | |||
866 | if (this.name[0] == '.') switch (this.len) { | 1174 | if (this.name[0] == '.') switch (this.len) { |
867 | default: | 1175 | default: |
868 | break; | 1176 | break; |
869 | case 2: | 1177 | case 2: |
870 | if (this.name[1] != '.') | 1178 | if (this.name[1] != '.') |
871 | break; | 1179 | break; |
872 | follow_dotdot(nd); | 1180 | if (nd->flags & LOOKUP_RCU) { |
873 | inode = nd->path.dentry->d_inode; | 1181 | if (follow_dotdot_rcu(nd)) |
1182 | return -ECHILD; | ||
1183 | } else | ||
1184 | follow_dotdot(nd); | ||
874 | /* fallthrough */ | 1185 | /* fallthrough */ |
875 | case 1: | 1186 | case 1: |
876 | continue; | 1187 | continue; |
877 | } | 1188 | } |
878 | /* This does the actual lookups.. */ | 1189 | /* This does the actual lookups.. */ |
879 | err = do_lookup(nd, &this, &next); | 1190 | err = do_lookup(nd, &this, &next, &inode); |
880 | if (err) | 1191 | if (err) |
881 | break; | 1192 | break; |
882 | |||
883 | err = -ENOENT; | 1193 | err = -ENOENT; |
884 | inode = next.dentry->d_inode; | ||
885 | if (!inode) | 1194 | if (!inode) |
886 | goto out_dput; | 1195 | goto out_dput; |
887 | 1196 | ||
888 | if (inode->i_op->follow_link) { | 1197 | if (inode->i_op->follow_link) { |
1198 | /* We commonly drop rcu-walk here */ | ||
1199 | if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) | ||
1200 | return -ECHILD; | ||
1201 | BUG_ON(inode != next.dentry->d_inode); | ||
889 | err = do_follow_link(&next, nd); | 1202 | err = do_follow_link(&next, nd); |
890 | if (err) | 1203 | if (err) |
891 | goto return_err; | 1204 | goto return_err; |
1205 | nd->inode = nd->path.dentry->d_inode; | ||
892 | err = -ENOENT; | 1206 | err = -ENOENT; |
893 | inode = nd->path.dentry->d_inode; | 1207 | if (!nd->inode) |
894 | if (!inode) | ||
895 | break; | 1208 | break; |
896 | } else | 1209 | } else { |
897 | path_to_nameidata(&next, nd); | 1210 | path_to_nameidata(&next, nd); |
1211 | nd->inode = inode; | ||
1212 | } | ||
898 | err = -ENOTDIR; | 1213 | err = -ENOTDIR; |
899 | if (!inode->i_op->lookup) | 1214 | if (!nd->inode->i_op->lookup) |
900 | break; | 1215 | break; |
901 | continue; | 1216 | continue; |
902 | /* here ends the main loop */ | 1217 | /* here ends the main loop */ |
@@ -911,32 +1226,39 @@ last_component: | |||
911 | if (this.name[0] == '.') switch (this.len) { | 1226 | if (this.name[0] == '.') switch (this.len) { |
912 | default: | 1227 | default: |
913 | break; | 1228 | break; |
914 | case 2: | 1229 | case 2: |
915 | if (this.name[1] != '.') | 1230 | if (this.name[1] != '.') |
916 | break; | 1231 | break; |
917 | follow_dotdot(nd); | 1232 | if (nd->flags & LOOKUP_RCU) { |
918 | inode = nd->path.dentry->d_inode; | 1233 | if (follow_dotdot_rcu(nd)) |
1234 | return -ECHILD; | ||
1235 | } else | ||
1236 | follow_dotdot(nd); | ||
919 | /* fallthrough */ | 1237 | /* fallthrough */ |
920 | case 1: | 1238 | case 1: |
921 | goto return_reval; | 1239 | goto return_reval; |
922 | } | 1240 | } |
923 | err = do_lookup(nd, &this, &next); | 1241 | err = do_lookup(nd, &this, &next, &inode); |
924 | if (err) | 1242 | if (err) |
925 | break; | 1243 | break; |
926 | inode = next.dentry->d_inode; | ||
927 | if (follow_on_final(inode, lookup_flags)) { | 1244 | if (follow_on_final(inode, lookup_flags)) { |
1245 | if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) | ||
1246 | return -ECHILD; | ||
1247 | BUG_ON(inode != next.dentry->d_inode); | ||
928 | err = do_follow_link(&next, nd); | 1248 | err = do_follow_link(&next, nd); |
929 | if (err) | 1249 | if (err) |
930 | goto return_err; | 1250 | goto return_err; |
931 | inode = nd->path.dentry->d_inode; | 1251 | nd->inode = nd->path.dentry->d_inode; |
932 | } else | 1252 | } else { |
933 | path_to_nameidata(&next, nd); | 1253 | path_to_nameidata(&next, nd); |
1254 | nd->inode = inode; | ||
1255 | } | ||
934 | err = -ENOENT; | 1256 | err = -ENOENT; |
935 | if (!inode) | 1257 | if (!nd->inode) |
936 | break; | 1258 | break; |
937 | if (lookup_flags & LOOKUP_DIRECTORY) { | 1259 | if (lookup_flags & LOOKUP_DIRECTORY) { |
938 | err = -ENOTDIR; | 1260 | err = -ENOTDIR; |
939 | if (!inode->i_op->lookup) | 1261 | if (!nd->inode->i_op->lookup) |
940 | break; | 1262 | break; |
941 | } | 1263 | } |
942 | goto return_base; | 1264 | goto return_base; |
@@ -958,6 +1280,8 @@ return_reval: | |||
958 | */ | 1280 | */ |
959 | if (nd->path.dentry && nd->path.dentry->d_sb && | 1281 | if (nd->path.dentry && nd->path.dentry->d_sb && |
960 | (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { | 1282 | (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { |
1283 | if (nameidata_drop_rcu_maybe(nd)) | ||
1284 | return -ECHILD; | ||
961 | err = -ESTALE; | 1285 | err = -ESTALE; |
962 | /* Note: we do not d_invalidate() */ | 1286 | /* Note: we do not d_invalidate() */ |
963 | if (!nd->path.dentry->d_op->d_revalidate( | 1287 | if (!nd->path.dentry->d_op->d_revalidate( |
@@ -965,16 +1289,34 @@ return_reval: | |||
965 | break; | 1289 | break; |
966 | } | 1290 | } |
967 | return_base: | 1291 | return_base: |
1292 | if (nameidata_drop_rcu_last_maybe(nd)) | ||
1293 | return -ECHILD; | ||
968 | return 0; | 1294 | return 0; |
969 | out_dput: | 1295 | out_dput: |
970 | path_put_conditional(&next, nd); | 1296 | if (!(nd->flags & LOOKUP_RCU)) |
1297 | path_put_conditional(&next, nd); | ||
971 | break; | 1298 | break; |
972 | } | 1299 | } |
973 | path_put(&nd->path); | 1300 | if (!(nd->flags & LOOKUP_RCU)) |
1301 | path_put(&nd->path); | ||
974 | return_err: | 1302 | return_err: |
975 | return err; | 1303 | return err; |
976 | } | 1304 | } |
977 | 1305 | ||
1306 | static inline int path_walk_rcu(const char *name, struct nameidata *nd) | ||
1307 | { | ||
1308 | current->total_link_count = 0; | ||
1309 | |||
1310 | return link_path_walk(name, nd); | ||
1311 | } | ||
1312 | |||
1313 | static inline int path_walk_simple(const char *name, struct nameidata *nd) | ||
1314 | { | ||
1315 | current->total_link_count = 0; | ||
1316 | |||
1317 | return link_path_walk(name, nd); | ||
1318 | } | ||
1319 | |||
978 | static int path_walk(const char *name, struct nameidata *nd) | 1320 | static int path_walk(const char *name, struct nameidata *nd) |
979 | { | 1321 | { |
980 | struct path save = nd->path; | 1322 | struct path save = nd->path; |
@@ -1000,6 +1342,88 @@ static int path_walk(const char *name, struct nameidata *nd) | |||
1000 | return result; | 1342 | return result; |
1001 | } | 1343 | } |
1002 | 1344 | ||
1345 | static void path_finish_rcu(struct nameidata *nd) | ||
1346 | { | ||
1347 | if (nd->flags & LOOKUP_RCU) { | ||
1348 | /* RCU dangling. Cancel it. */ | ||
1349 | nd->flags &= ~LOOKUP_RCU; | ||
1350 | nd->root.mnt = NULL; | ||
1351 | rcu_read_unlock(); | ||
1352 | br_read_unlock(vfsmount_lock); | ||
1353 | } | ||
1354 | if (nd->file) | ||
1355 | fput(nd->file); | ||
1356 | } | ||
1357 | |||
1358 | static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd) | ||
1359 | { | ||
1360 | int retval = 0; | ||
1361 | int fput_needed; | ||
1362 | struct file *file; | ||
1363 | |||
1364 | nd->last_type = LAST_ROOT; /* if there are only slashes... */ | ||
1365 | nd->flags = flags | LOOKUP_RCU; | ||
1366 | nd->depth = 0; | ||
1367 | nd->root.mnt = NULL; | ||
1368 | nd->file = NULL; | ||
1369 | |||
1370 | if (*name=='/') { | ||
1371 | struct fs_struct *fs = current->fs; | ||
1372 | |||
1373 | br_read_lock(vfsmount_lock); | ||
1374 | rcu_read_lock(); | ||
1375 | |||
1376 | spin_lock(&fs->lock); | ||
1377 | nd->root = fs->root; | ||
1378 | nd->path = nd->root; | ||
1379 | nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); | ||
1380 | spin_unlock(&fs->lock); | ||
1381 | |||
1382 | } else if (dfd == AT_FDCWD) { | ||
1383 | struct fs_struct *fs = current->fs; | ||
1384 | |||
1385 | br_read_lock(vfsmount_lock); | ||
1386 | rcu_read_lock(); | ||
1387 | |||
1388 | spin_lock(&fs->lock); | ||
1389 | nd->path = fs->pwd; | ||
1390 | nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); | ||
1391 | spin_unlock(&fs->lock); | ||
1392 | } else { | ||
1393 | struct dentry *dentry; | ||
1394 | |||
1395 | file = fget_light(dfd, &fput_needed); | ||
1396 | retval = -EBADF; | ||
1397 | if (!file) | ||
1398 | goto out_fail; | ||
1399 | |||
1400 | dentry = file->f_path.dentry; | ||
1401 | |||
1402 | retval = -ENOTDIR; | ||
1403 | if (!S_ISDIR(dentry->d_inode->i_mode)) | ||
1404 | goto fput_fail; | ||
1405 | |||
1406 | retval = file_permission(file, MAY_EXEC); | ||
1407 | if (retval) | ||
1408 | goto fput_fail; | ||
1409 | |||
1410 | nd->path = file->f_path; | ||
1411 | if (fput_needed) | ||
1412 | nd->file = file; | ||
1413 | |||
1414 | nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); | ||
1415 | br_read_lock(vfsmount_lock); | ||
1416 | rcu_read_lock(); | ||
1417 | } | ||
1418 | nd->inode = nd->path.dentry->d_inode; | ||
1419 | return 0; | ||
1420 | |||
1421 | fput_fail: | ||
1422 | fput_light(file, fput_needed); | ||
1423 | out_fail: | ||
1424 | return retval; | ||
1425 | } | ||
1426 | |||
1003 | static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) | 1427 | static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) |
1004 | { | 1428 | { |
1005 | int retval = 0; | 1429 | int retval = 0; |
@@ -1040,6 +1464,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei | |||
1040 | 1464 | ||
1041 | fput_light(file, fput_needed); | 1465 | fput_light(file, fput_needed); |
1042 | } | 1466 | } |
1467 | nd->inode = nd->path.dentry->d_inode; | ||
1043 | return 0; | 1468 | return 0; |
1044 | 1469 | ||
1045 | fput_fail: | 1470 | fput_fail: |
@@ -1052,16 +1477,53 @@ out_fail: | |||
1052 | static int do_path_lookup(int dfd, const char *name, | 1477 | static int do_path_lookup(int dfd, const char *name, |
1053 | unsigned int flags, struct nameidata *nd) | 1478 | unsigned int flags, struct nameidata *nd) |
1054 | { | 1479 | { |
1055 | int retval = path_init(dfd, name, flags, nd); | 1480 | int retval; |
1056 | if (!retval) | 1481 | |
1057 | retval = path_walk(name, nd); | 1482 | /* |
1058 | if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && | 1483 | * Path walking is largely split up into 2 different synchronisation |
1059 | nd->path.dentry->d_inode)) | 1484 | * schemes, rcu-walk and ref-walk (explained in |
1060 | audit_inode(name, nd->path.dentry); | 1485 | * Documentation/filesystems/path-lookup.txt). These share much of the |
1486 | * path walk code, but some things particularly setup, cleanup, and | ||
1487 | * following mounts are sufficiently divergent that functions are | ||
1488 | * duplicated. Typically there is a function foo(), and its RCU | ||
1489 | * analogue, foo_rcu(). | ||
1490 | * | ||
1491 | * -ECHILD is the error number of choice (just to avoid clashes) that | ||
1492 | * is returned if some aspect of an rcu-walk fails. Such an error must | ||
1493 | * be handled by restarting a traditional ref-walk (which will always | ||
1494 | * be able to complete). | ||
1495 | */ | ||
1496 | retval = path_init_rcu(dfd, name, flags, nd); | ||
1497 | if (unlikely(retval)) | ||
1498 | return retval; | ||
1499 | retval = path_walk_rcu(name, nd); | ||
1500 | path_finish_rcu(nd); | ||
1061 | if (nd->root.mnt) { | 1501 | if (nd->root.mnt) { |
1062 | path_put(&nd->root); | 1502 | path_put(&nd->root); |
1063 | nd->root.mnt = NULL; | 1503 | nd->root.mnt = NULL; |
1064 | } | 1504 | } |
1505 | |||
1506 | if (unlikely(retval == -ECHILD || retval == -ESTALE)) { | ||
1507 | /* slower, locked walk */ | ||
1508 | if (retval == -ESTALE) | ||
1509 | flags |= LOOKUP_REVAL; | ||
1510 | retval = path_init(dfd, name, flags, nd); | ||
1511 | if (unlikely(retval)) | ||
1512 | return retval; | ||
1513 | retval = path_walk(name, nd); | ||
1514 | if (nd->root.mnt) { | ||
1515 | path_put(&nd->root); | ||
1516 | nd->root.mnt = NULL; | ||
1517 | } | ||
1518 | } | ||
1519 | |||
1520 | if (likely(!retval)) { | ||
1521 | if (unlikely(!audit_dummy_context())) { | ||
1522 | if (nd->path.dentry && nd->inode) | ||
1523 | audit_inode(name, nd->path.dentry); | ||
1524 | } | ||
1525 | } | ||
1526 | |||
1065 | return retval; | 1527 | return retval; |
1066 | } | 1528 | } |
1067 | 1529 | ||
@@ -1104,10 +1566,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, | |||
1104 | path_get(&nd->path); | 1566 | path_get(&nd->path); |
1105 | nd->root = nd->path; | 1567 | nd->root = nd->path; |
1106 | path_get(&nd->root); | 1568 | path_get(&nd->root); |
1569 | nd->inode = nd->path.dentry->d_inode; | ||
1107 | 1570 | ||
1108 | retval = path_walk(name, nd); | 1571 | retval = path_walk(name, nd); |
1109 | if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && | 1572 | if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && |
1110 | nd->path.dentry->d_inode)) | 1573 | nd->inode)) |
1111 | audit_inode(name, nd->path.dentry); | 1574 | audit_inode(name, nd->path.dentry); |
1112 | 1575 | ||
1113 | path_put(&nd->root); | 1576 | path_put(&nd->root); |
@@ -1488,6 +1951,7 @@ out_unlock: | |||
1488 | mutex_unlock(&dir->d_inode->i_mutex); | 1951 | mutex_unlock(&dir->d_inode->i_mutex); |
1489 | dput(nd->path.dentry); | 1952 | dput(nd->path.dentry); |
1490 | nd->path.dentry = path->dentry; | 1953 | nd->path.dentry = path->dentry; |
1954 | |||
1491 | if (error) | 1955 | if (error) |
1492 | return error; | 1956 | return error; |
1493 | /* Don't check for write permission, don't truncate */ | 1957 | /* Don't check for write permission, don't truncate */ |
@@ -1582,6 +2046,9 @@ exit: | |||
1582 | return ERR_PTR(error); | 2046 | return ERR_PTR(error); |
1583 | } | 2047 | } |
1584 | 2048 | ||
2049 | /* | ||
2050 | * Handle O_CREAT case for do_filp_open | ||
2051 | */ | ||
1585 | static struct file *do_last(struct nameidata *nd, struct path *path, | 2052 | static struct file *do_last(struct nameidata *nd, struct path *path, |
1586 | int open_flag, int acc_mode, | 2053 | int open_flag, int acc_mode, |
1587 | int mode, const char *pathname) | 2054 | int mode, const char *pathname) |
@@ -1603,42 +2070,16 @@ static struct file *do_last(struct nameidata *nd, struct path *path, | |||
1603 | } | 2070 | } |
1604 | /* fallthrough */ | 2071 | /* fallthrough */ |
1605 | case LAST_ROOT: | 2072 | case LAST_ROOT: |
1606 | if (open_flag & O_CREAT) | 2073 | goto exit; |
1607 | goto exit; | ||
1608 | /* fallthrough */ | ||
1609 | case LAST_BIND: | 2074 | case LAST_BIND: |
1610 | audit_inode(pathname, dir); | 2075 | audit_inode(pathname, dir); |
1611 | goto ok; | 2076 | goto ok; |
1612 | } | 2077 | } |
1613 | 2078 | ||
1614 | /* trailing slashes? */ | 2079 | /* trailing slashes? */ |
1615 | if (nd->last.name[nd->last.len]) { | 2080 | if (nd->last.name[nd->last.len]) |
1616 | if (open_flag & O_CREAT) | 2081 | goto exit; |
1617 | goto exit; | ||
1618 | nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW; | ||
1619 | } | ||
1620 | |||
1621 | /* just plain open? */ | ||
1622 | if (!(open_flag & O_CREAT)) { | ||
1623 | error = do_lookup(nd, &nd->last, path); | ||
1624 | if (error) | ||
1625 | goto exit; | ||
1626 | error = -ENOENT; | ||
1627 | if (!path->dentry->d_inode) | ||
1628 | goto exit_dput; | ||
1629 | if (path->dentry->d_inode->i_op->follow_link) | ||
1630 | return NULL; | ||
1631 | error = -ENOTDIR; | ||
1632 | if (nd->flags & LOOKUP_DIRECTORY) { | ||
1633 | if (!path->dentry->d_inode->i_op->lookup) | ||
1634 | goto exit_dput; | ||
1635 | } | ||
1636 | path_to_nameidata(path, nd); | ||
1637 | audit_inode(pathname, nd->path.dentry); | ||
1638 | goto ok; | ||
1639 | } | ||
1640 | 2082 | ||
1641 | /* OK, it's O_CREAT */ | ||
1642 | mutex_lock(&dir->d_inode->i_mutex); | 2083 | mutex_lock(&dir->d_inode->i_mutex); |
1643 | 2084 | ||
1644 | path->dentry = lookup_hash(nd); | 2085 | path->dentry = lookup_hash(nd); |
@@ -1709,8 +2150,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, | |||
1709 | return NULL; | 2150 | return NULL; |
1710 | 2151 | ||
1711 | path_to_nameidata(path, nd); | 2152 | path_to_nameidata(path, nd); |
2153 | nd->inode = path->dentry->d_inode; | ||
1712 | error = -EISDIR; | 2154 | error = -EISDIR; |
1713 | if (S_ISDIR(path->dentry->d_inode->i_mode)) | 2155 | if (S_ISDIR(nd->inode->i_mode)) |
1714 | goto exit; | 2156 | goto exit; |
1715 | ok: | 2157 | ok: |
1716 | filp = finish_open(nd, open_flag, acc_mode); | 2158 | filp = finish_open(nd, open_flag, acc_mode); |
@@ -1741,7 +2183,7 @@ struct file *do_filp_open(int dfd, const char *pathname, | |||
1741 | struct path path; | 2183 | struct path path; |
1742 | int count = 0; | 2184 | int count = 0; |
1743 | int flag = open_to_namei_flags(open_flag); | 2185 | int flag = open_to_namei_flags(open_flag); |
1744 | int force_reval = 0; | 2186 | int flags; |
1745 | 2187 | ||
1746 | if (!(open_flag & O_CREAT)) | 2188 | if (!(open_flag & O_CREAT)) |
1747 | mode = 0; | 2189 | mode = 0; |
@@ -1770,54 +2212,84 @@ struct file *do_filp_open(int dfd, const char *pathname, | |||
1770 | if (open_flag & O_APPEND) | 2212 | if (open_flag & O_APPEND) |
1771 | acc_mode |= MAY_APPEND; | 2213 | acc_mode |= MAY_APPEND; |
1772 | 2214 | ||
1773 | /* find the parent */ | 2215 | flags = LOOKUP_OPEN; |
1774 | reval: | 2216 | if (open_flag & O_CREAT) { |
1775 | error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); | 2217 | flags |= LOOKUP_CREATE; |
2218 | if (open_flag & O_EXCL) | ||
2219 | flags |= LOOKUP_EXCL; | ||
2220 | } | ||
2221 | if (open_flag & O_DIRECTORY) | ||
2222 | flags |= LOOKUP_DIRECTORY; | ||
2223 | if (!(open_flag & O_NOFOLLOW)) | ||
2224 | flags |= LOOKUP_FOLLOW; | ||
2225 | |||
2226 | filp = get_empty_filp(); | ||
2227 | if (!filp) | ||
2228 | return ERR_PTR(-ENFILE); | ||
2229 | |||
2230 | filp->f_flags = open_flag; | ||
2231 | nd.intent.open.file = filp; | ||
2232 | nd.intent.open.flags = flag; | ||
2233 | nd.intent.open.create_mode = mode; | ||
2234 | |||
2235 | if (open_flag & O_CREAT) | ||
2236 | goto creat; | ||
2237 | |||
2238 | /* !O_CREAT, simple open */ | ||
2239 | error = do_path_lookup(dfd, pathname, flags, &nd); | ||
2240 | if (unlikely(error)) | ||
2241 | goto out_filp; | ||
2242 | error = -ELOOP; | ||
2243 | if (!(nd.flags & LOOKUP_FOLLOW)) { | ||
2244 | if (nd.inode->i_op->follow_link) | ||
2245 | goto out_path; | ||
2246 | } | ||
2247 | error = -ENOTDIR; | ||
2248 | if (nd.flags & LOOKUP_DIRECTORY) { | ||
2249 | if (!nd.inode->i_op->lookup) | ||
2250 | goto out_path; | ||
2251 | } | ||
2252 | audit_inode(pathname, nd.path.dentry); | ||
2253 | filp = finish_open(&nd, open_flag, acc_mode); | ||
2254 | return filp; | ||
2255 | |||
2256 | creat: | ||
2257 | /* OK, have to create the file. Find the parent. */ | ||
2258 | error = path_init_rcu(dfd, pathname, | ||
2259 | LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); | ||
1776 | if (error) | 2260 | if (error) |
1777 | return ERR_PTR(error); | 2261 | goto out_filp; |
1778 | if (force_reval) | 2262 | error = path_walk_rcu(pathname, &nd); |
1779 | nd.flags |= LOOKUP_REVAL; | 2263 | path_finish_rcu(&nd); |
2264 | if (unlikely(error == -ECHILD || error == -ESTALE)) { | ||
2265 | /* slower, locked walk */ | ||
2266 | if (error == -ESTALE) { | ||
2267 | reval: | ||
2268 | flags |= LOOKUP_REVAL; | ||
2269 | } | ||
2270 | error = path_init(dfd, pathname, | ||
2271 | LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); | ||
2272 | if (error) | ||
2273 | goto out_filp; | ||
1780 | 2274 | ||
1781 | current->total_link_count = 0; | 2275 | error = path_walk_simple(pathname, &nd); |
1782 | error = link_path_walk(pathname, &nd); | ||
1783 | if (error) { | ||
1784 | filp = ERR_PTR(error); | ||
1785 | goto out; | ||
1786 | } | 2276 | } |
1787 | if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT)) | 2277 | if (unlikely(error)) |
2278 | goto out_filp; | ||
2279 | if (unlikely(!audit_dummy_context())) | ||
1788 | audit_inode(pathname, nd.path.dentry); | 2280 | audit_inode(pathname, nd.path.dentry); |
1789 | 2281 | ||
1790 | /* | 2282 | /* |
1791 | * We have the parent and last component. | 2283 | * We have the parent and last component. |
1792 | */ | 2284 | */ |
1793 | 2285 | nd.flags = flags; | |
1794 | error = -ENFILE; | ||
1795 | filp = get_empty_filp(); | ||
1796 | if (filp == NULL) | ||
1797 | goto exit_parent; | ||
1798 | nd.intent.open.file = filp; | ||
1799 | filp->f_flags = open_flag; | ||
1800 | nd.intent.open.flags = flag; | ||
1801 | nd.intent.open.create_mode = mode; | ||
1802 | nd.flags &= ~LOOKUP_PARENT; | ||
1803 | nd.flags |= LOOKUP_OPEN; | ||
1804 | if (open_flag & O_CREAT) { | ||
1805 | nd.flags |= LOOKUP_CREATE; | ||
1806 | if (open_flag & O_EXCL) | ||
1807 | nd.flags |= LOOKUP_EXCL; | ||
1808 | } | ||
1809 | if (open_flag & O_DIRECTORY) | ||
1810 | nd.flags |= LOOKUP_DIRECTORY; | ||
1811 | if (!(open_flag & O_NOFOLLOW)) | ||
1812 | nd.flags |= LOOKUP_FOLLOW; | ||
1813 | filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); | 2286 | filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); |
1814 | while (unlikely(!filp)) { /* trailing symlink */ | 2287 | while (unlikely(!filp)) { /* trailing symlink */ |
1815 | struct path holder; | 2288 | struct path holder; |
1816 | struct inode *inode = path.dentry->d_inode; | ||
1817 | void *cookie; | 2289 | void *cookie; |
1818 | error = -ELOOP; | 2290 | error = -ELOOP; |
1819 | /* S_ISDIR part is a temporary automount kludge */ | 2291 | /* S_ISDIR part is a temporary automount kludge */ |
1820 | if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode)) | 2292 | if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(nd.inode->i_mode)) |
1821 | goto exit_dput; | 2293 | goto exit_dput; |
1822 | if (count++ == 32) | 2294 | if (count++ == 32) |
1823 | goto exit_dput; | 2295 | goto exit_dput; |
@@ -1838,36 +2310,33 @@ reval: | |||
1838 | goto exit_dput; | 2310 | goto exit_dput; |
1839 | error = __do_follow_link(&path, &nd, &cookie); | 2311 | error = __do_follow_link(&path, &nd, &cookie); |
1840 | if (unlikely(error)) { | 2312 | if (unlikely(error)) { |
2313 | if (!IS_ERR(cookie) && nd.inode->i_op->put_link) | ||
2314 | nd.inode->i_op->put_link(path.dentry, &nd, cookie); | ||
1841 | /* nd.path had been dropped */ | 2315 | /* nd.path had been dropped */ |
1842 | if (!IS_ERR(cookie) && inode->i_op->put_link) | 2316 | nd.path = path; |
1843 | inode->i_op->put_link(path.dentry, &nd, cookie); | 2317 | goto out_path; |
1844 | path_put(&path); | ||
1845 | release_open_intent(&nd); | ||
1846 | filp = ERR_PTR(error); | ||
1847 | goto out; | ||
1848 | } | 2318 | } |
1849 | holder = path; | 2319 | holder = path; |
1850 | nd.flags &= ~LOOKUP_PARENT; | 2320 | nd.flags &= ~LOOKUP_PARENT; |
1851 | filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); | 2321 | filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); |
1852 | if (inode->i_op->put_link) | 2322 | if (nd.inode->i_op->put_link) |
1853 | inode->i_op->put_link(holder.dentry, &nd, cookie); | 2323 | nd.inode->i_op->put_link(holder.dentry, &nd, cookie); |
1854 | path_put(&holder); | 2324 | path_put(&holder); |
1855 | } | 2325 | } |
1856 | out: | 2326 | out: |
1857 | if (nd.root.mnt) | 2327 | if (nd.root.mnt) |
1858 | path_put(&nd.root); | 2328 | path_put(&nd.root); |
1859 | if (filp == ERR_PTR(-ESTALE) && !force_reval) { | 2329 | if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) |
1860 | force_reval = 1; | ||
1861 | goto reval; | 2330 | goto reval; |
1862 | } | ||
1863 | return filp; | 2331 | return filp; |
1864 | 2332 | ||
1865 | exit_dput: | 2333 | exit_dput: |
1866 | path_put_conditional(&path, &nd); | 2334 | path_put_conditional(&path, &nd); |
2335 | out_path: | ||
2336 | path_put(&nd.path); | ||
2337 | out_filp: | ||
1867 | if (!IS_ERR(nd.intent.open.file)) | 2338 | if (!IS_ERR(nd.intent.open.file)) |
1868 | release_open_intent(&nd); | 2339 | release_open_intent(&nd); |
1869 | exit_parent: | ||
1870 | path_put(&nd.path); | ||
1871 | filp = ERR_PTR(error); | 2340 | filp = ERR_PTR(error); |
1872 | goto out; | 2341 | goto out; |
1873 | } | 2342 | } |
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ae4b0fd9033f..998e3a715bcc 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c | |||
@@ -402,6 +402,10 @@ static int proc_sys_compare(const struct dentry *parent, | |||
402 | const struct dentry *dentry, const struct inode *inode, | 402 | const struct dentry *dentry, const struct inode *inode, |
403 | unsigned int len, const char *str, const struct qstr *name) | 403 | unsigned int len, const char *str, const struct qstr *name) |
404 | { | 404 | { |
405 | /* Although proc doesn't have negative dentries, rcu-walk means | ||
406 | * that inode here can be NULL */ | ||
407 | if (!inode) | ||
408 | return 0; | ||
405 | if (name->len != len) | 409 | if (name->len != len) |
406 | return 1; | 410 | return 1; |
407 | if (memcmp(name->name, str, len)) | 411 | if (memcmp(name->name, str, len)) |