diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-05-04 17:59:14 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-05-04 21:21:14 -0400 |
commit | 12f8ad4b0533d9212cb1d5e58ed73d2170114785 (patch) | |
tree | 6bab87d6d25b2ea246904aeabc3692e03c89b923 /fs | |
parent | 4f988f152ee087831ea5c1c77cda4454cacc052c (diff) |
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.
The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller. This
results in two annoyances:
- __d_lookup_rcu() now not only needs to return the dentry and the
sequence number that goes along with the lookup, it also needs to
return the inode pointer that was validated by that sequence number
check.
- and because we did the sequence number check early (to validate the
name pointer and length) we also couldn't just pass the dentry itself
to dentry_cmp(), we had to pass the counted string that contained the
name.
So that sequence number decision caused two separate ugly calling
conventions.
Both of these problems would be solved if we just did the sequence
number check in the caller instead. There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.
That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer. The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.
And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted. Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.
And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding. The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).
Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments. Doing an extra
sequence check for that slow case is not a problem, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/dcache.c | 152 | ||||
-rw-r--r-- | fs/namei.c | 19 |
2 files changed, 120 insertions, 51 deletions
diff --git a/fs/dcache.c b/fs/dcache.c index b80531c91779..539943eb442c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
@@ -153,16 +153,33 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, | |||
153 | * In contrast, 'ct' and 'tcount' can be from a pathname, and do | 153 | * In contrast, 'ct' and 'tcount' can be from a pathname, and do |
154 | * need the careful unaligned handling. | 154 | * need the careful unaligned handling. |
155 | */ | 155 | */ |
156 | static inline int dentry_cmp(const unsigned char *cs, size_t scount, | 156 | static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) |
157 | const unsigned char *ct, size_t tcount) | ||
158 | { | 157 | { |
159 | unsigned long a,b,mask; | 158 | unsigned long a,b,mask; |
159 | const unsigned char *cs; | ||
160 | 160 | ||
161 | if (unlikely(scount != tcount)) | 161 | if (unlikely(dentry->d_name.len != tcount)) |
162 | return 1; | 162 | return 1; |
163 | /* | ||
164 | * Be careful about RCU walk racing with rename: | ||
165 | * use ACCESS_ONCE to fetch the name pointer. | ||
166 | * | ||
167 | * NOTE! Even if a rename will mean that the length | ||
168 | * was not loaded atomically, we don't care. The | ||
169 | * RCU walk will check the sequence count eventually, | ||
170 | * and catch it. And we won't overrun the buffer, | ||
171 | * because we're reading the name pointer atomically, | ||
172 | * and a dentry name is guaranteed to be properly | ||
173 | * terminated with a NUL byte. | ||
174 | * | ||
175 | * End result: even if 'len' is wrong, we'll exit | ||
176 | * early because the data cannot match (there can | ||
177 | * be no NUL in the ct/tcount data) | ||
178 | */ | ||
179 | cs = ACCESS_ONCE(dentry->d_name.name); | ||
163 | 180 | ||
164 | for (;;) { | 181 | for (;;) { |
165 | a = load_unaligned_zeropad(cs); | 182 | a = *(unsigned long *)cs; |
166 | b = load_unaligned_zeropad(ct); | 183 | b = load_unaligned_zeropad(ct); |
167 | if (tcount < sizeof(unsigned long)) | 184 | if (tcount < sizeof(unsigned long)) |
168 | break; | 185 | break; |
@@ -180,10 +197,11 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount, | |||
180 | 197 | ||
181 | #else | 198 | #else |
182 | 199 | ||
183 | static inline int dentry_cmp(const unsigned char *cs, size_t scount, | 200 | static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) |
184 | const unsigned char *ct, size_t tcount) | ||
185 | { | 201 | { |
186 | if (scount != tcount) | 202 | const unsigned char *cs = dentry->d_name.name; |
203 | |||
204 | if (dentry->d_name.len != tcount) | ||
187 | return 1; | 205 | return 1; |
188 | 206 | ||
189 | do { | 207 | do { |
@@ -1439,18 +1457,16 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry, | |||
1439 | } | 1457 | } |
1440 | 1458 | ||
1441 | list_for_each_entry(alias, &inode->i_dentry, d_alias) { | 1459 | list_for_each_entry(alias, &inode->i_dentry, d_alias) { |
1442 | struct qstr *qstr = &alias->d_name; | ||
1443 | |||
1444 | /* | 1460 | /* |
1445 | * Don't need alias->d_lock here, because aliases with | 1461 | * Don't need alias->d_lock here, because aliases with |
1446 | * d_parent == entry->d_parent are not subject to name or | 1462 | * d_parent == entry->d_parent are not subject to name or |
1447 | * parent changes, because the parent inode i_mutex is held. | 1463 | * parent changes, because the parent inode i_mutex is held. |
1448 | */ | 1464 | */ |
1449 | if (qstr->hash != hash) | 1465 | if (alias->d_name.hash != hash) |
1450 | continue; | 1466 | continue; |
1451 | if (alias->d_parent != entry->d_parent) | 1467 | if (alias->d_parent != entry->d_parent) |
1452 | continue; | 1468 | continue; |
1453 | if (dentry_cmp(qstr->name, qstr->len, name, len)) | 1469 | if (dentry_cmp(alias, name, len)) |
1454 | continue; | 1470 | continue; |
1455 | __dget(alias); | 1471 | __dget(alias); |
1456 | return alias; | 1472 | return alias; |
@@ -1727,6 +1743,48 @@ err_out: | |||
1727 | } | 1743 | } |
1728 | EXPORT_SYMBOL(d_add_ci); | 1744 | EXPORT_SYMBOL(d_add_ci); |
1729 | 1745 | ||
1746 | /* | ||
1747 | * Do the slow-case of the dentry name compare. | ||
1748 | * | ||
1749 | * Unlike the dentry_cmp() function, we need to atomically | ||
1750 | * load the name, length and inode information, so that the | ||
1751 | * filesystem can rely on them, and can use the 'name' and | ||
1752 | * 'len' information without worrying about walking off the | ||
1753 | * end of memory etc. | ||
1754 | * | ||
1755 | * Thus the read_seqcount_retry() and the "duplicate" info | ||
1756 | * in arguments (the low-level filesystem should not look | ||
1757 | * at the dentry inode or name contents directly, since | ||
1758 | * rename can change them while we're in RCU mode). | ||
1759 | */ | ||
1760 | enum slow_d_compare { | ||
1761 | D_COMP_OK, | ||
1762 | D_COMP_NOMATCH, | ||
1763 | D_COMP_SEQRETRY, | ||
1764 | }; | ||
1765 | |||
1766 | static noinline enum slow_d_compare slow_dentry_cmp( | ||
1767 | const struct dentry *parent, | ||
1768 | struct inode *inode, | ||
1769 | struct dentry *dentry, | ||
1770 | unsigned int seq, | ||
1771 | const struct qstr *name) | ||
1772 | { | ||
1773 | int tlen = dentry->d_name.len; | ||
1774 | const char *tname = dentry->d_name.name; | ||
1775 | struct inode *i = dentry->d_inode; | ||
1776 | |||
1777 | if (read_seqcount_retry(&dentry->d_seq, seq)) { | ||
1778 | cpu_relax(); | ||
1779 | return D_COMP_SEQRETRY; | ||
1780 | } | ||
1781 | if (parent->d_op->d_compare(parent, inode, | ||
1782 | dentry, i, | ||
1783 | tlen, tname, name)) | ||
1784 | return D_COMP_NOMATCH; | ||
1785 | return D_COMP_OK; | ||
1786 | } | ||
1787 | |||
1730 | /** | 1788 | /** |
1731 | * __d_lookup_rcu - search for a dentry (racy, store-free) | 1789 | * __d_lookup_rcu - search for a dentry (racy, store-free) |
1732 | * @parent: parent dentry | 1790 | * @parent: parent dentry |
@@ -1753,10 +1811,13 @@ EXPORT_SYMBOL(d_add_ci); | |||
1753 | * the returned dentry, so long as its parent's seqlock is checked after the | 1811 | * the returned dentry, so long as its parent's seqlock is checked after the |
1754 | * child is looked up. Thus, an interlocking stepping of sequence lock checks | 1812 | * child is looked up. Thus, an interlocking stepping of sequence lock checks |
1755 | * is formed, giving integrity down the path walk. | 1813 | * is formed, giving integrity down the path walk. |
1814 | * | ||
1815 | * NOTE! The caller *has* to check the resulting dentry against the sequence | ||
1816 | * number we've returned before using any of the resulting dentry state! | ||
1756 | */ | 1817 | */ |
1757 | struct dentry *__d_lookup_rcu(const struct dentry *parent, | 1818 | struct dentry *__d_lookup_rcu(const struct dentry *parent, |
1758 | const struct qstr *name, | 1819 | const struct qstr *name, |
1759 | unsigned *seqp, struct inode **inode) | 1820 | unsigned *seqp, struct inode *inode) |
1760 | { | 1821 | { |
1761 | unsigned int len = name->len; | 1822 | unsigned int len = name->len; |
1762 | unsigned int hash = name->hash; | 1823 | unsigned int hash = name->hash; |
@@ -1787,49 +1848,46 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, | |||
1787 | */ | 1848 | */ |
1788 | hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { | 1849 | hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { |
1789 | unsigned seq; | 1850 | unsigned seq; |
1790 | struct inode *i; | ||
1791 | const char *tname; | ||
1792 | int tlen; | ||
1793 | 1851 | ||
1794 | if (dentry->d_name.hash != hash) | 1852 | if (dentry->d_name.hash != hash) |
1795 | continue; | 1853 | continue; |
1796 | 1854 | ||
1797 | seqretry: | 1855 | seqretry: |
1798 | seq = read_seqcount_begin(&dentry->d_seq); | 1856 | /* |
1857 | * The dentry sequence count protects us from concurrent | ||
1858 | * renames, and thus protects inode, parent and name fields. | ||
1859 | * | ||
1860 | * The caller must perform a seqcount check in order | ||
1861 | * to do anything useful with the returned dentry, | ||
1862 | * including using the 'd_inode' pointer. | ||
1863 | * | ||
1864 | * NOTE! We do a "raw" seqcount_begin here. That means that | ||
1865 | * we don't wait for the sequence count to stabilize if it | ||
1866 | * is in the middle of a sequence change. If we do the slow | ||
1867 | * dentry compare, we will do seqretries until it is stable, | ||
1868 | * and if we end up with a successful lookup, we actually | ||
1869 | * want to exit RCU lookup anyway. | ||
1870 | */ | ||
1871 | seq = raw_seqcount_begin(&dentry->d_seq); | ||
1799 | if (dentry->d_parent != parent) | 1872 | if (dentry->d_parent != parent) |
1800 | continue; | 1873 | continue; |
1801 | if (d_unhashed(dentry)) | 1874 | if (d_unhashed(dentry)) |
1802 | continue; | 1875 | continue; |
1803 | tlen = dentry->d_name.len; | 1876 | *seqp = seq; |
1804 | tname = dentry->d_name.name; | 1877 | |
1805 | i = dentry->d_inode; | ||
1806 | prefetch(tname); | ||
1807 | /* | ||
1808 | * This seqcount check is required to ensure name and | ||
1809 | * len are loaded atomically, so as not to walk off the | ||
1810 | * edge of memory when walking. If we could load this | ||
1811 | * atomically some other way, we could drop this check. | ||
1812 | */ | ||
1813 | if (read_seqcount_retry(&dentry->d_seq, seq)) | ||
1814 | goto seqretry; | ||
1815 | if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { | 1878 | if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { |
1816 | if (parent->d_op->d_compare(parent, *inode, | 1879 | switch (slow_dentry_cmp(parent, inode, dentry, seq, name)) { |
1817 | dentry, i, | 1880 | case D_COMP_OK: |
1818 | tlen, tname, name)) | 1881 | return dentry; |
1819 | continue; | 1882 | case D_COMP_NOMATCH: |
1820 | } else { | ||
1821 | if (dentry_cmp(tname, tlen, str, len)) | ||
1822 | continue; | 1883 | continue; |
1884 | default: | ||
1885 | goto seqretry; | ||
1886 | } | ||
1823 | } | 1887 | } |
1824 | /* | 1888 | |
1825 | * No extra seqcount check is required after the name | 1889 | if (!dentry_cmp(dentry, str, len)) |
1826 | * compare. The caller must perform a seqcount check in | 1890 | return dentry; |
1827 | * order to do anything useful with the returned dentry | ||
1828 | * anyway. | ||
1829 | */ | ||
1830 | *seqp = seq; | ||
1831 | *inode = i; | ||
1832 | return dentry; | ||
1833 | } | 1891 | } |
1834 | return NULL; | 1892 | return NULL; |
1835 | } | 1893 | } |
@@ -1908,8 +1966,6 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) | |||
1908 | rcu_read_lock(); | 1966 | rcu_read_lock(); |
1909 | 1967 | ||
1910 | hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { | 1968 | hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { |
1911 | const char *tname; | ||
1912 | int tlen; | ||
1913 | 1969 | ||
1914 | if (dentry->d_name.hash != hash) | 1970 | if (dentry->d_name.hash != hash) |
1915 | continue; | 1971 | continue; |
@@ -1924,15 +1980,15 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) | |||
1924 | * It is safe to compare names since d_move() cannot | 1980 | * It is safe to compare names since d_move() cannot |
1925 | * change the qstr (protected by d_lock). | 1981 | * change the qstr (protected by d_lock). |
1926 | */ | 1982 | */ |
1927 | tlen = dentry->d_name.len; | ||
1928 | tname = dentry->d_name.name; | ||
1929 | if (parent->d_flags & DCACHE_OP_COMPARE) { | 1983 | if (parent->d_flags & DCACHE_OP_COMPARE) { |
1984 | int tlen = dentry->d_name.len; | ||
1985 | const char *tname = dentry->d_name.name; | ||
1930 | if (parent->d_op->d_compare(parent, parent->d_inode, | 1986 | if (parent->d_op->d_compare(parent, parent->d_inode, |
1931 | dentry, dentry->d_inode, | 1987 | dentry, dentry->d_inode, |
1932 | tlen, tname, name)) | 1988 | tlen, tname, name)) |
1933 | goto next; | 1989 | goto next; |
1934 | } else { | 1990 | } else { |
1935 | if (dentry_cmp(tname, tlen, str, len)) | 1991 | if (dentry_cmp(dentry, str, len)) |
1936 | goto next; | 1992 | goto next; |
1937 | } | 1993 | } |
1938 | 1994 | ||
diff --git a/fs/namei.c b/fs/namei.c index c42791914f82..46bd0045575d 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -1154,12 +1154,25 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, | |||
1154 | */ | 1154 | */ |
1155 | if (nd->flags & LOOKUP_RCU) { | 1155 | if (nd->flags & LOOKUP_RCU) { |
1156 | unsigned seq; | 1156 | unsigned seq; |
1157 | *inode = nd->inode; | 1157 | dentry = __d_lookup_rcu(parent, name, &seq, nd->inode); |
1158 | dentry = __d_lookup_rcu(parent, name, &seq, inode); | ||
1159 | if (!dentry) | 1158 | if (!dentry) |
1160 | goto unlazy; | 1159 | goto unlazy; |
1161 | 1160 | ||
1162 | /* Memory barrier in read_seqcount_begin of child is enough */ | 1161 | /* |
1162 | * This sequence count validates that the inode matches | ||
1163 | * the dentry name information from lookup. | ||
1164 | */ | ||
1165 | *inode = dentry->d_inode; | ||
1166 | if (read_seqcount_retry(&dentry->d_seq, seq)) | ||
1167 | return -ECHILD; | ||
1168 | |||
1169 | /* | ||
1170 | * This sequence count validates that the parent had no | ||
1171 | * changes while we did the lookup of the dentry above. | ||
1172 | * | ||
1173 | * The memory barrier in read_seqcount_begin of child is | ||
1174 | * enough, we can use __read_seqcount_retry here. | ||
1175 | */ | ||
1163 | if (__read_seqcount_retry(&parent->d_seq, nd->seq)) | 1176 | if (__read_seqcount_retry(&parent->d_seq, nd->seq)) |
1164 | return -ECHILD; | 1177 | return -ECHILD; |
1165 | nd->seq = seq; | 1178 | nd->seq = seq; |