author		Neil Horman <nhorman@tuxdriver.com>	2011-03-04 19:26:03 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-03-04 20:28:52 -0500
commit		e9e3d724e2145f5039b423c290ce2b2c3d8f94bc (patch)
tree		9b0ff4de361fe358a5a2400b35a48206688b5f71 /fs
parent		3256f80fbbc25bd2504bd564844c615227621e56 (diff)
nfs4: Ensure that ACL pages sent over NFS were not allocated from the slab (v3)
The "bad_page()" page allocator sanity check was reported recently (call
chain as follows):
bad_page+0x69/0x91
free_hot_cold_page+0x81/0x144
skb_release_data+0x5f/0x98
__kfree_skb+0x11/0x1a
tcp_ack+0x6a3/0x1868
tcp_rcv_established+0x7a6/0x8b9
tcp_v4_do_rcv+0x2a/0x2fa
tcp_v4_rcv+0x9a2/0x9f6
do_timer+0x2df/0x52c
ip_local_deliver+0x19d/0x263
ip_rcv+0x539/0x57c
netif_receive_skb+0x470/0x49f
:virtio_net:virtnet_poll+0x46b/0x5c5
net_rx_action+0xac/0x1b3
__do_softirq+0x89/0x133
call_softirq+0x1c/0x28
do_softirq+0x2c/0x7d
do_IRQ+0xec/0xf5
default_idle+0x0/0x50
ret_from_intr+0x0/0xa
default_idle+0x29/0x50
cpu_idle+0x95/0xb8
start_kernel+0x220/0x225
_sinittext+0x22f/0x236
It occurs because an skb with a fraglist was freed from the tcp
retransmit queue when it was acked, but a page on that fraglist had
PG_Slab set (indicating it was allocated from the slab allocator),
which means the free path above can't safely free it via put_page.
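To make the PG_slab point concrete, here is a minimal illustrative sketch (mine, not from the patch) of why a kmalloc'd buffer must never be handed to put_page. It assumes a slab-backed kmalloc size; very large kmalloc allocations can come straight from the page allocator instead:

	static void slab_page_demo(void)
	{
		void *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

		if (!buf)
			return;
		/*
		 * virt_to_page() happily maps the buffer to its backing
		 * struct page, but PageSlab() is true for it: the page is
		 * owned by the slab allocator.  put_page() on it would
		 * bypass the slab bookkeeping and eventually trip the
		 * bad_page() check shown above.
		 */
		WARN_ON(!PageSlab(virt_to_page(buf)));
		kfree(buf);	/* the only correct way to free it */
	}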
We tracked this back to an nfsv4 setacl operation, in which the nfs code
attempted to convert the passed-in buffer to an array of pages in
__nfs4_proc_set_acl, which then gets used by the skb->frags list in
xs_sendpages. __nfs4_proc_set_acl just converts each page in the buffer
to a page struct via virt_to_page, but the vfs allocates the buffer via
kmalloc, meaning the PG_slab bit is set. We can't create a buffer with
kmalloc and free it later in the tcp ack path with put_page, so we need
to either:
1) ensure that when we create the list of pages, no page struct has
PG_Slab set
or
2) not use a page list to send this data
Given that these buffers can be multiple pages and arbitrarily sized, I
think (1) is the right way to go. I've written the below patch to
allocate a page from the buddy allocator directly and copy the data over
to it. This ensures that we have a put_page free-able page for every
entry that winds up on an skb frag list, so it can be safely freed when
the frame is acked. We do a put_page on each entry after the
rpc_call_sync call so as to drop our own reference count to the page,
leaving only the ref count taken by tcp_sendpages. This way the data
will be properly freed when the ack comes in.
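To spell out the reference-count lifecycle being relied on here (my reading of the patch, not text from it):

	alloc_page()                  -> page refcount 1, held by us
	xs_sendpages()/tcp_sendpages  -> refcount 2, network stack takes a ref per frag
	put_page() after the call     -> refcount 1, our ref dropped
	tcp ack frees the skb         -> refcount 0, page returns to the buddy allocator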
Successfully tested by myself to solve the above oops.
Note: as this is the result of a setacl operation that exceeded a page
of data, I think this amounts to a local DoS triggerable by an
unprivileged user, so I'm CCing security on this as well.
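For reference, a hypothetical userspace trigger along the lines described above (the mount path is an assumption, and the zeroed payload stands in for a real NFSv4 ACL):

	#include <sys/xattr.h>
	#include <stdlib.h>

	int main(void)
	{
		size_t len = 2 * 4096;		/* more than one page of ACL data */
		char *acl = calloc(1, len);	/* placeholder for a valid NFSv4 ACL */

		if (!acl)
			return 1;
		/* On an NFSv4 mount this reaches __nfs4_proc_set_acl(). */
		setxattr("/mnt/nfs/somefile", "system.nfs4_acl", acl, len, 0);
		free(acl);
		return 0;
	}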
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Trond Myklebust <Trond.Myklebust@netapp.com>
CC: security@kernel.org
CC: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs')
-rw-r--r--	fs/nfs/nfs4proc.c	44
1 files changed, 42 insertions, 2 deletions
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 78936a8f40ab..1ff76acc7e98 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -51,6 +51,7 @@
 #include <linux/sunrpc/bc_xprt.h>
 #include <linux/xattr.h>
 #include <linux/utsname.h>
+#include <linux/mm.h>
 
 #include "nfs4_fs.h"
 #include "delegation.h"
@@ -3252,6 +3253,35 @@ static void buf_to_pages(const void *buf, size_t buflen,
 	}
 }
 
+static int buf_to_pages_noslab(const void *buf, size_t buflen,
+		struct page **pages, unsigned int *pgbase)
+{
+	struct page *newpage, **spages;
+	int rc = 0;
+	size_t len;
+	spages = pages;
+
+	do {
+		len = min(PAGE_CACHE_SIZE, buflen);
+		newpage = alloc_page(GFP_KERNEL);
+
+		if (newpage == NULL)
+			goto unwind;
+		memcpy(page_address(newpage), buf, len);
+		buf += len;
+		buflen -= len;
+		*pages++ = newpage;
+		rc++;
+	} while (buflen != 0);
+
+	return rc;
+
+unwind:
+	for(; rc > 0; rc--)
+		__free_page(spages[rc-1]);
+	return -ENOMEM;
+}
+
 struct nfs4_cached_acl {
 	int cached;
 	size_t len;
@@ -3420,13 +3450,23 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
 		.rpc_argp = &arg,
 		.rpc_resp = &res,
 	};
-	int ret;
+	int ret, i;
 
 	if (!nfs4_server_supports_acls(server))
 		return -EOPNOTSUPP;
+	i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
+	if (i < 0)
+		return i;
 	nfs_inode_return_delegation(inode);
-	buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
 	ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
+
+	/*
+	 * Free each page after tx, so the only ref left is
+	 * held by the network stack
+	 */
+	for (; i > 0; i--)
+		put_page(pages[i-1]);
+
 	/*
 	 * Acl update can result in inode attribute update.
 	 * so mark the attribute cache invalid.