aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-05-04 17:59:14 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2012-05-04 21:21:14 -0400
commit12f8ad4b0533d9212cb1d5e58ed73d2170114785 (patch)
tree6bab87d6d25b2ea246904aeabc3692e03c89b923 /fs
parent4f988f152ee087831ea5c1c77cda4454cacc052c (diff)
vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces
The calling conventions for __d_lookup_rcu() and dentry_cmp() are annoying in different ways, and there is actually one single underlying reason for both of the annoyances. The fundamental reason is that we do the returned dentry sequence number check inside __d_lookup_rcu() instead of doing it in the caller. This results in two annoyances: - __d_lookup_rcu() now not only needs to return the dentry and the sequence number that goes along with the lookup, it also needs to return the inode pointer that was validated by that sequence number check. - and because we did the sequence number check early (to validate the name pointer and length) we also couldn't just pass the dentry itself to dentry_cmp(), we had to pass the counted string that contained the name. So that sequence number decision caused two separate ugly calling conventions. Both of these problems would be solved if we just did the sequence number check in the caller instead. There's only one caller, and that caller already has to do the sequence number check for the parent anyway, so just do that. That allows us to stop returning the dentry->d_inode in that in-out argument (pointer-to-pointer-to-inode), so we can make the inode argument just a regular input inode pointer. The caller can just load the inode from dentry->d_inode, and then do the sequence number check after that to make sure that it's synchronized with the name we looked up. And it allows us to just pass in the dentry to dentry_cmp(), which is what all the callers really wanted. Sure, dentry_cmp() has to be a bit careful about the dentry (which is not stable during RCU lookup), but that's actually very simple. And now that dentry_cmp() can clearly see that the first string argument is a dentry, we can use the direct word access for that, instead of the careful unaligned zero-padding. The dentry name is always properly aligned, since it is a single path component that is either embedded into the dentry itself, or was allocated with kmalloc() (see __d_alloc). Finally, this also uninlines the nasty slow-case for dentry comparisons: that one *does* need to do a sequence number check, since it will call in to the low-level filesystems, and we want to give those a stable inode pointer and path component length/start arguments. Doing an extra sequence check for that slow case is not a problem, though. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/dcache.c152
-rw-r--r--fs/namei.c19
2 files changed, 120 insertions, 51 deletions
diff --git a/fs/dcache.c b/fs/dcache.c
index b80531c91779..539943eb442c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -153,16 +153,33 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
153 * In contrast, 'ct' and 'tcount' can be from a pathname, and do 153 * In contrast, 'ct' and 'tcount' can be from a pathname, and do
154 * need the careful unaligned handling. 154 * need the careful unaligned handling.
155 */ 155 */
156static inline int dentry_cmp(const unsigned char *cs, size_t scount, 156static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
157 const unsigned char *ct, size_t tcount)
158{ 157{
159 unsigned long a,b,mask; 158 unsigned long a,b,mask;
159 const unsigned char *cs;
160 160
161 if (unlikely(scount != tcount)) 161 if (unlikely(dentry->d_name.len != tcount))
162 return 1; 162 return 1;
163 /*
164 * Be careful about RCU walk racing with rename:
165 * use ACCESS_ONCE to fetch the name pointer.
166 *
167 * NOTE! Even if a rename will mean that the length
168 * was not loaded atomically, we don't care. The
169 * RCU walk will check the sequence count eventually,
170 * and catch it. And we won't overrun the buffer,
171 * because we're reading the name pointer atomically,
172 * and a dentry name is guaranteed to be properly
173 * terminated with a NUL byte.
174 *
175 * End result: even if 'len' is wrong, we'll exit
176 * early because the data cannot match (there can
177 * be no NUL in the ct/tcount data)
178 */
179 cs = ACCESS_ONCE(dentry->d_name.name);
163 180
164 for (;;) { 181 for (;;) {
165 a = load_unaligned_zeropad(cs); 182 a = *(unsigned long *)cs;
166 b = load_unaligned_zeropad(ct); 183 b = load_unaligned_zeropad(ct);
167 if (tcount < sizeof(unsigned long)) 184 if (tcount < sizeof(unsigned long))
168 break; 185 break;
@@ -180,10 +197,11 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount,
180 197
181#else 198#else
182 199
183static inline int dentry_cmp(const unsigned char *cs, size_t scount, 200static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
184 const unsigned char *ct, size_t tcount)
185{ 201{
186 if (scount != tcount) 202 const unsigned char *cs = dentry->d_name.name;
203
204 if (dentry->d_name.len != tcount)
187 return 1; 205 return 1;
188 206
189 do { 207 do {
@@ -1439,18 +1457,16 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
1439 } 1457 }
1440 1458
1441 list_for_each_entry(alias, &inode->i_dentry, d_alias) { 1459 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
1442 struct qstr *qstr = &alias->d_name;
1443
1444 /* 1460 /*
1445 * Don't need alias->d_lock here, because aliases with 1461 * Don't need alias->d_lock here, because aliases with
1446 * d_parent == entry->d_parent are not subject to name or 1462 * d_parent == entry->d_parent are not subject to name or
1447 * parent changes, because the parent inode i_mutex is held. 1463 * parent changes, because the parent inode i_mutex is held.
1448 */ 1464 */
1449 if (qstr->hash != hash) 1465 if (alias->d_name.hash != hash)
1450 continue; 1466 continue;
1451 if (alias->d_parent != entry->d_parent) 1467 if (alias->d_parent != entry->d_parent)
1452 continue; 1468 continue;
1453 if (dentry_cmp(qstr->name, qstr->len, name, len)) 1469 if (dentry_cmp(alias, name, len))
1454 continue; 1470 continue;
1455 __dget(alias); 1471 __dget(alias);
1456 return alias; 1472 return alias;
@@ -1727,6 +1743,48 @@ err_out:
1727} 1743}
1728EXPORT_SYMBOL(d_add_ci); 1744EXPORT_SYMBOL(d_add_ci);
1729 1745
1746/*
1747 * Do the slow-case of the dentry name compare.
1748 *
1749 * Unlike the dentry_cmp() function, we need to atomically
1750 * load the name, length and inode information, so that the
1751 * filesystem can rely on them, and can use the 'name' and
1752 * 'len' information without worrying about walking off the
1753 * end of memory etc.
1754 *
1755 * Thus the read_seqcount_retry() and the "duplicate" info
1756 * in arguments (the low-level filesystem should not look
1757 * at the dentry inode or name contents directly, since
1758 * rename can change them while we're in RCU mode).
1759 */
1760enum slow_d_compare {
1761 D_COMP_OK,
1762 D_COMP_NOMATCH,
1763 D_COMP_SEQRETRY,
1764};
1765
1766static noinline enum slow_d_compare slow_dentry_cmp(
1767 const struct dentry *parent,
1768 struct inode *inode,
1769 struct dentry *dentry,
1770 unsigned int seq,
1771 const struct qstr *name)
1772{
1773 int tlen = dentry->d_name.len;
1774 const char *tname = dentry->d_name.name;
1775 struct inode *i = dentry->d_inode;
1776
1777 if (read_seqcount_retry(&dentry->d_seq, seq)) {
1778 cpu_relax();
1779 return D_COMP_SEQRETRY;
1780 }
1781 if (parent->d_op->d_compare(parent, inode,
1782 dentry, i,
1783 tlen, tname, name))
1784 return D_COMP_NOMATCH;
1785 return D_COMP_OK;
1786}
1787
1730/** 1788/**
1731 * __d_lookup_rcu - search for a dentry (racy, store-free) 1789 * __d_lookup_rcu - search for a dentry (racy, store-free)
1732 * @parent: parent dentry 1790 * @parent: parent dentry
@@ -1753,10 +1811,13 @@ EXPORT_SYMBOL(d_add_ci);
1753 * the returned dentry, so long as its parent's seqlock is checked after the 1811 * the returned dentry, so long as its parent's seqlock is checked after the
1754 * child is looked up. Thus, an interlocking stepping of sequence lock checks 1812 * child is looked up. Thus, an interlocking stepping of sequence lock checks
1755 * is formed, giving integrity down the path walk. 1813 * is formed, giving integrity down the path walk.
1814 *
1815 * NOTE! The caller *has* to check the resulting dentry against the sequence
1816 * number we've returned before using any of the resulting dentry state!
1756 */ 1817 */
1757struct dentry *__d_lookup_rcu(const struct dentry *parent, 1818struct dentry *__d_lookup_rcu(const struct dentry *parent,
1758 const struct qstr *name, 1819 const struct qstr *name,
1759 unsigned *seqp, struct inode **inode) 1820 unsigned *seqp, struct inode *inode)
1760{ 1821{
1761 unsigned int len = name->len; 1822 unsigned int len = name->len;
1762 unsigned int hash = name->hash; 1823 unsigned int hash = name->hash;
@@ -1787,49 +1848,46 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
1787 */ 1848 */
1788 hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { 1849 hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
1789 unsigned seq; 1850 unsigned seq;
1790 struct inode *i;
1791 const char *tname;
1792 int tlen;
1793 1851
1794 if (dentry->d_name.hash != hash) 1852 if (dentry->d_name.hash != hash)
1795 continue; 1853 continue;
1796 1854
1797seqretry: 1855seqretry:
1798 seq = read_seqcount_begin(&dentry->d_seq); 1856 /*
1857 * The dentry sequence count protects us from concurrent
1858 * renames, and thus protects inode, parent and name fields.
1859 *
1860 * The caller must perform a seqcount check in order
1861 * to do anything useful with the returned dentry,
1862 * including using the 'd_inode' pointer.
1863 *
1864 * NOTE! We do a "raw" seqcount_begin here. That means that
1865 * we don't wait for the sequence count to stabilize if it
1866 * is in the middle of a sequence change. If we do the slow
1867 * dentry compare, we will do seqretries until it is stable,
1868 * and if we end up with a successful lookup, we actually
1869 * want to exit RCU lookup anyway.
1870 */
1871 seq = raw_seqcount_begin(&dentry->d_seq);
1799 if (dentry->d_parent != parent) 1872 if (dentry->d_parent != parent)
1800 continue; 1873 continue;
1801 if (d_unhashed(dentry)) 1874 if (d_unhashed(dentry))
1802 continue; 1875 continue;
1803 tlen = dentry->d_name.len; 1876 *seqp = seq;
1804 tname = dentry->d_name.name; 1877
1805 i = dentry->d_inode;
1806 prefetch(tname);
1807 /*
1808 * This seqcount check is required to ensure name and
1809 * len are loaded atomically, so as not to walk off the
1810 * edge of memory when walking. If we could load this
1811 * atomically some other way, we could drop this check.
1812 */
1813 if (read_seqcount_retry(&dentry->d_seq, seq))
1814 goto seqretry;
1815 if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { 1878 if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
1816 if (parent->d_op->d_compare(parent, *inode, 1879 switch (slow_dentry_cmp(parent, inode, dentry, seq, name)) {
1817 dentry, i, 1880 case D_COMP_OK:
1818 tlen, tname, name)) 1881 return dentry;
1819 continue; 1882 case D_COMP_NOMATCH:
1820 } else {
1821 if (dentry_cmp(tname, tlen, str, len))
1822 continue; 1883 continue;
1884 default:
1885 goto seqretry;
1886 }
1823 } 1887 }
1824 /* 1888
1825 * No extra seqcount check is required after the name 1889 if (!dentry_cmp(dentry, str, len))
1826 * compare. The caller must perform a seqcount check in 1890 return dentry;
1827 * order to do anything useful with the returned dentry
1828 * anyway.
1829 */
1830 *seqp = seq;
1831 *inode = i;
1832 return dentry;
1833 } 1891 }
1834 return NULL; 1892 return NULL;
1835} 1893}
@@ -1908,8 +1966,6 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
1908 rcu_read_lock(); 1966 rcu_read_lock();
1909 1967
1910 hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { 1968 hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
1911 const char *tname;
1912 int tlen;
1913 1969
1914 if (dentry->d_name.hash != hash) 1970 if (dentry->d_name.hash != hash)
1915 continue; 1971 continue;
@@ -1924,15 +1980,15 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
1924 * It is safe to compare names since d_move() cannot 1980 * It is safe to compare names since d_move() cannot
1925 * change the qstr (protected by d_lock). 1981 * change the qstr (protected by d_lock).
1926 */ 1982 */
1927 tlen = dentry->d_name.len;
1928 tname = dentry->d_name.name;
1929 if (parent->d_flags & DCACHE_OP_COMPARE) { 1983 if (parent->d_flags & DCACHE_OP_COMPARE) {
1984 int tlen = dentry->d_name.len;
1985 const char *tname = dentry->d_name.name;
1930 if (parent->d_op->d_compare(parent, parent->d_inode, 1986 if (parent->d_op->d_compare(parent, parent->d_inode,
1931 dentry, dentry->d_inode, 1987 dentry, dentry->d_inode,
1932 tlen, tname, name)) 1988 tlen, tname, name))
1933 goto next; 1989 goto next;
1934 } else { 1990 } else {
1935 if (dentry_cmp(tname, tlen, str, len)) 1991 if (dentry_cmp(dentry, str, len))
1936 goto next; 1992 goto next;
1937 } 1993 }
1938 1994
diff --git a/fs/namei.c b/fs/namei.c
index c42791914f82..46bd0045575d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1154,12 +1154,25 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1154 */ 1154 */
1155 if (nd->flags & LOOKUP_RCU) { 1155 if (nd->flags & LOOKUP_RCU) {
1156 unsigned seq; 1156 unsigned seq;
1157 *inode = nd->inode; 1157 dentry = __d_lookup_rcu(parent, name, &seq, nd->inode);
1158 dentry = __d_lookup_rcu(parent, name, &seq, inode);
1159 if (!dentry) 1158 if (!dentry)
1160 goto unlazy; 1159 goto unlazy;
1161 1160
1162 /* Memory barrier in read_seqcount_begin of child is enough */ 1161 /*
1162 * This sequence count validates that the inode matches
1163 * the dentry name information from lookup.
1164 */
1165 *inode = dentry->d_inode;
1166 if (read_seqcount_retry(&dentry->d_seq, seq))
1167 return -ECHILD;
1168
1169 /*
1170 * This sequence count validates that the parent had no
1171 * changes while we did the lookup of the dentry above.
1172 *
1173 * The memory barrier in read_seqcount_begin of child is
1174 * enough, we can use __read_seqcount_retry here.
1175 */
1163 if (__read_seqcount_retry(&parent->d_seq, nd->seq)) 1176 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1164 return -ECHILD; 1177 return -ECHILD;
1165 nd->seq = seq; 1178 nd->seq = seq;