author    NeilBrown <neilb@suse.de>    2006-06-25 08:47:58 -0400
committer Linus Torvalds <torvalds@g5.osdl.org>    2006-06-25 13:01:09 -0400
commit    01408c4939479ec46c15aa7ef6e2406be50eeeca (patch)
tree      106ee144cc7214cc5cb78bc35a49fc654ef16fe9
parent    5f507d9e05b4dbfee34f3d967623ad3fbf0f28b3 (diff)
[PATCH] Prepare for __copy_from_user_inatomic to not zero missed bytes
The problem is that when we write to a file, the copy from userspace to
pagecache is first done with preemption disabled, so if the source address is
not immediately available the copy fails *and* *zeros* *the* *destination*.

This is a problem because a concurrent read (which admittedly is an odd thing
to do) might see zeros rather than what was there before the write, or what
was there after it, or some mixture of the two (any of these being a
reasonable thing to see).

If the copy did fail, it will immediately be retried with preemption
re-enabled so any transient problem with accessing the source won't cause an
error.

The first copy does not need to zero any uncopied bytes, and doing so causes
the problem.  It uses copy_from_user_atomic rather than copy_from_user, so
the simple expedient is to change copy_from_user_atomic to *not* zero out
bytes on failure.

The first of these two patches prepares for the change by fixing two places
which assume copy_from_user_atomic does zero the tail.  The two usages are
very similar pieces of code which copy from a userspace iovec into one or
more page-cache pages.  These are changed to remove the assumption.

The second patch changes __copy_from_user_inatomic* to not zero the tail.

Once these are accepted, I will look at similar patches for other
architectures where this is important (ppc, mips and sparc being the ones I
can find).

This patch:

There is a problem with __copy_from_user_inatomic zeroing the tail of the
buffer in the case of an error.  As it is called in atomic context, the error
may be transient, so it results in zeros being written where maybe they
shouldn't be.

In the usage in filemap, this opens a window for a well-timed read to see
data (zeros) which is not consistent with any ordering of reads and writes.

In most cases where __copy_from_user_inatomic is called, a failure results in
__copy_from_user being called immediately.  As long as the latter zeros the
tail, the former doesn't need to.  However, the *copy_from_user_iovec
implementations (in both filemap and ntfs/file) assume that
copy_from_user_inatomic will zero the tail.  This patch removes that
assumption, so that after this patch it will be safe for
copy_from_user_inatomic to not zero the tail.

This patch also adds some commentary to filemap.h and asm-i386/uaccess.h.

After this patch, all architectures that might disable preempt when
kmap_atomic is called need to have their __copy_from_user_inatomic* "fixed".
This includes
 - powerpc
 - i386
 - mips
 - sparc

Signed-off-by: Neil Brown <neilb@suse.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
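To make the retry protocol concrete, here is a minimal userspace sketch of
the pattern the series relies on (fast_copy, slow_copy and fault_at are
invented names for illustration; these are not the kernel helpers).  Like the
kernel routines, both copiers return the number of bytes left uncopied.  The
point is that the fast path must leave the uncopied tail untouched, so a
concurrent reader only ever sees old bytes or new bytes, never zeros:

#include <stdio.h>
#include <string.h>

/* Stand-in for __copy_from_user_inatomic(): simulate a transient fault
 * after fault_at bytes and return the number of bytes NOT copied.  The
 * uncopied tail is deliberately left untouched. */
static size_t fast_copy(char *dst, const char *src, size_t n, size_t fault_at)
{
	size_t done = n < fault_at ? n : fault_at;

	memcpy(dst, src, done);
	return n - done;
}

/* Stand-in for __copy_from_user(): may sleep in the real kernel, and is
 * assumed to always succeed in this model. */
static size_t slow_copy(char *dst, const char *src, size_t n)
{
	memcpy(dst, src, n);
	return 0;
}

int main(void)
{
	char page[16] = "OLD-OLD-OLD-OLD";	/* existing pagecache data */
	const char src[16] = "new-new-new-new";

	if (fast_copy(page, src, 15, 8)) {
		/* Retry with the blocking variant.  Because fast_copy()
		 * left the tail alone, a reader racing with us saw either
		 * old bytes or new bytes in this window - never zeros. */
		slow_copy(page, src, 15);
	}
	printf("%s\n", page);	/* prints "new-new-new-new" */
	return 0;
}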
-rw-r--r--	fs/ntfs/file.c			| 26
-rw-r--r--	include/asm-i386/uaccess.h	|  6
-rw-r--r--	mm/filemap.c			|  8
-rw-r--r--	mm/filemap.h			| 26
4 files changed, 40 insertions(+), 26 deletions(-)
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 88292f9e4b9b..2e42c2dcae12 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1358,7 +1358,7 @@ err_out:
 	goto out;
 }
 
-static size_t __ntfs_copy_from_user_iovec(char *vaddr,
+static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
 		const struct iovec *iov, size_t iov_ofs, size_t bytes)
 {
 	size_t total = 0;
@@ -1376,10 +1376,6 @@ static size_t __ntfs_copy_from_user_iovec(char *vaddr,
 		bytes -= len;
 		vaddr += len;
 		if (unlikely(left)) {
-			/*
-			 * Zero the rest of the target like __copy_from_user().
-			 */
-			memset(vaddr, 0, bytes);
 			total -= left;
 			break;
 		}
@@ -1420,11 +1416,13 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp,
  * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
  * single-segment behaviour.
  *
- * We call the same helper (__ntfs_copy_from_user_iovec()) both when atomic and
- * when not atomic.  This is ok because __ntfs_copy_from_user_iovec() calls
- * __copy_from_user_inatomic() and it is ok to call this when non-atomic.  In
- * fact, the only difference between __copy_from_user_inatomic() and
- * __copy_from_user() is that the latter calls might_sleep().  And on many
+ * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both
+ * when atomic and when not atomic.  This is ok because
+ * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic()
+ * and it is ok to call this when non-atomic.
+ * Infact, the only difference between __copy_from_user_inatomic() and
+ * __copy_from_user() is that the latter calls might_sleep() and the former
+ * should not zero the tail of the buffer on error.  And on many
  * architectures __copy_from_user_inatomic() is just defined to
  * __copy_from_user() so it makes no difference at all on those architectures.
  */
@@ -1441,14 +1439,18 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
 		if (len > bytes)
 			len = bytes;
 		kaddr = kmap_atomic(*pages, KM_USER0);
-		copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
+		copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
 				*iov, *iov_ofs, len);
 		kunmap_atomic(kaddr, KM_USER0);
 		if (unlikely(copied != len)) {
 			/* Do it the slow way. */
 			kaddr = kmap(*pages);
-			copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
+			copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
 					*iov, *iov_ofs, len);
+			/*
+			 * Zero the rest of the target like __copy_from_user().
+			 */
+			memset(kaddr + ofs + copied, 0, len - copied);
 			kunmap(*pages);
 			if (unlikely(copied != len))
 				goto err_out;
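For readers unfamiliar with the helper being renamed above, the following
userspace sketch models its iovec walk under the new rules (struct iovec
comes from sys/uio.h here; copy_step and the budget parameter are invented
stand-ins for __copy_from_user_inatomic() and a transient fault, and the
iov_ofs bookkeeping is omitted).  Note the short-copy branch now just adjusts
the total and breaks; zeroing is left to the slow-path caller:

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>	/* struct iovec */

/* Stand-in for __copy_from_user_inatomic(): copies until the shared
 * *budget runs out, returning the bytes left uncopied. */
static size_t copy_step(char *dst, const void *src, size_t n, size_t *budget)
{
	size_t done = n < *budget ? n : *budget;

	memcpy(dst, src, done);
	*budget -= done;
	return n - done;
}

/* Walk the iovec; on a short copy, fix up the total and stop - with no
 * memset of the remainder, exactly as in the patched helper. */
static size_t copy_from_iovec_inatomic(char *vaddr, const struct iovec *iov,
		size_t bytes, size_t *budget)
{
	size_t total = 0;

	while (bytes) {
		size_t len = iov->iov_len < bytes ? iov->iov_len : bytes;
		size_t left = copy_step(vaddr, iov->iov_base, len, budget);

		total += len;
		bytes -= len;
		vaddr += len;
		if (left) {
			total -= left;
			break;
		}
		iov++;
	}
	return total;
}

int main(void)
{
	char seg1[] = "hello ", seg2[] = "world";
	struct iovec iov[2] = {
		{ .iov_base = seg1, .iov_len = 6 },
		{ .iov_base = seg2, .iov_len = 5 },
	};
	char page[16];
	size_t budget = 8;	/* simulate a fault after 8 bytes */
	size_t copied;

	memset(page, 'X', sizeof(page));
	copied = copy_from_iovec_inatomic(page, iov, 11, &budget);
	printf("copied=%zu page=%.11s\n", copied, page);
	/* copied=8 page=hello woXXX - the tail keeps its old bytes */
	return 0;
}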
diff --git a/include/asm-i386/uaccess.h b/include/asm-i386/uaccess.h
index 8462f8e0e658..d0d253277be5 100644
--- a/include/asm-i386/uaccess.h
+++ b/include/asm-i386/uaccess.h
@@ -458,6 +458,12 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
  *
  * If some data could not be copied, this function will pad the copied
  * data to the requested size using zero bytes.
+ *
+ * An alternate version - __copy_from_user_inatomic() - may be called from
+ * atomic context and will fail rather than sleep.  In this case the
+ * uncopied bytes will *NOT* be padded with zeros.  See fs/filemap.h
+ * for explanation of why this is needed.
+ * FIXME this isn't implimented yet EMXIF
  */
 static __always_inline unsigned long
 __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
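The comment block added above documents two distinct contracts.  Here is a
minimal userspace model of the distinction (copy_padded, copy_unpadded and
fail_at are invented names; the real functions take no such fault parameter):

#include <stdio.h>
#include <string.h>

/* __copy_from_user() analogue: zero-pads the uncopied tail out to the
 * requested size, as the comment above specifies. */
static size_t copy_padded(char *to, const char *from, size_t n, size_t fail_at)
{
	size_t done = n < fail_at ? n : fail_at;

	memcpy(to, from, done);
	memset(to + done, 0, n - done);
	return n - done;
}

/* Post-series __copy_from_user_inatomic() analogue: the tail is left
 * exactly as it was. */
static size_t copy_unpadded(char *to, const char *from, size_t n, size_t fail_at)
{
	size_t done = n < fail_at ? n : fail_at;

	memcpy(to, from, done);
	return n - done;
}

int main(void)
{
	char a[8] = "XXXXXXX", b[8] = "XXXXXXX";
	const char src[8] = "abcdefg";

	copy_padded(a, src, 7, 3);	/* a becomes "abc" + four NULs */
	copy_unpadded(b, src, 7, 3);	/* b becomes "abcXXXX" */
	printf("padded=\"%s\" unpadded=\"%s\"\n", a, b);
	return 0;
}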
diff --git a/mm/filemap.c b/mm/filemap.c
index 807a463fd5ed..1ed4be2a7654 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1892,7 +1892,7 @@ int remove_suid(struct dentry *dentry)
 EXPORT_SYMBOL(remove_suid);
 
 size_t
-__filemap_copy_from_user_iovec(char *vaddr,
+__filemap_copy_from_user_iovec_inatomic(char *vaddr,
 			const struct iovec *iov, size_t base, size_t bytes)
 {
 	size_t copied = 0, left = 0;
@@ -1908,12 +1908,8 @@ __filemap_copy_from_user_iovec(char *vaddr,
 		vaddr += copy;
 		iov++;
 
-		if (unlikely(left)) {
-			/* zero the rest of the target like __copy_from_user */
-			if (bytes)
-				memset(vaddr, 0, bytes);
+		if (unlikely(left))
 			break;
-		}
 	}
 	return copied - left;
 }
diff --git a/mm/filemap.h b/mm/filemap.h
index 5683cde22055..536979fb4ba7 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -16,15 +16,23 @@
 #include <linux/uaccess.h>
 
 size_t
-__filemap_copy_from_user_iovec(char *vaddr,
+__filemap_copy_from_user_iovec_inatomic(char *vaddr,
 			       const struct iovec *iov,
 			       size_t base,
 			       size_t bytes);
 
 /*
  * Copy as much as we can into the page and return the number of bytes which
  * were sucessfully copied.  If a fault is encountered then clear the page
  * out to (offset+bytes) and return the number of bytes which were copied.
+ *
+ * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache
+ * to *NOT* zero any tail of the buffer that it failed to copy.  If it does,
+ * and if the following non-atomic copy succeeds, then there is a small window
+ * where the target page contains neither the data before the write, nor the
+ * data after the write (it contains zero).  A read at this time will see
+ * data that is inconsistent with any ordering of the read and the write.
+ * (This has been detected in practice).
  */
 static inline size_t
 filemap_copy_from_user(struct page *page, unsigned long offset,
@@ -60,13 +68,15 @@ filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
 	size_t copied;
 
 	kaddr = kmap_atomic(page, KM_USER0);
-	copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
+	copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
 							base, bytes);
 	kunmap_atomic(kaddr, KM_USER0);
 	if (copied != bytes) {
 		kaddr = kmap(page);
-		copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
+		copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
 							base, bytes);
+		if (bytes - copied)
+			memset(kaddr + offset + copied, 0, bytes - copied);
 		kunmap(page);
 	}
 	return copied;
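The NOTE added to this header is the heart of the series, so here is a
single-threaded userspace illustration of the window it describes (all names
invented).  Each snapshot is a state a concurrent reader could observe; with
a tail-zeroing atomic copy, the middle state matches neither the old nor the
new contents:

#include <stdio.h>
#include <string.h>

/* Print the page, showing NUL bytes as '0' so the zero window is visible. */
static void snapshot(const char *when, const char *page, size_t n)
{
	printf("%-20s: ", when);
	for (size_t i = 0; i < n; i++)
		putchar(page[i] ? page[i] : '0');
	putchar('\n');
}

int main(void)
{
	char page[8];
	const char newdata[8] = "NEWDATA";

	memcpy(page, "OLDDATA", 8);
	snapshot("before write", page, 7);

	/* Pre-patch behaviour: the atomic copy faults after 3 bytes and
	 * zeros the rest of the target. */
	memcpy(page, newdata, 3);
	memset(page + 3, 0, 4);
	snapshot("after failed atomic", page, 7);	/* "NEW0000" */

	/* The non-atomic retry then completes the write. */
	memcpy(page, newdata, 7);
	snapshot("after retry", page, 7);	/* "NEWDATA" */
	return 0;
}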