aboutsummaryrefslogtreecommitdiffstats
path: root/net/core
diff options
context:
space:
mode:
author: Jarek Poplawski <jarkao2@gmail.com> 2009-01-19 20:03:56 -0500
committer: David S. Miller <davem@davemloft.net> 2009-01-19 20:03:56 -0500
commit: 8b9d3728977760f6bd1317c4420890f73695354e (patch)
tree: 5e397a8ab86e69eb429b3fd0e3c2585c798239e5 /net/core
parent: 9e9fd12dc0679643c191fc9795a3021807e77de4 (diff)
net: Fix data corruption when splicing from sockets.
The trick in socket splicing where we try to convert the skb->data into a page based reference using virt_to_page() does not work so well.

The idea is to pass the virt_to_page() reference via the pipe buffer, and refcount the buffer using a SKB reference.

But if we are splicing from a socket to a socket (via sendpage) this doesn't work.

The from side processing will grab the page (and SKB) references. The sendpage() calls will grab page references only, return, and then the from side processing completes and drops the SKB ref.

The page based reference to skb->data is not enough to keep the kmalloc() buffer backing it from being reused. Yet, that is all that the socket send side has at this point.

This leads to data corruption if the skb->data buffer is reused by SLAB before the send side socket actually gets the TX packet out to the device.

The fix employed here is to simply allocate a page and copy the skb->data bytes into that page.

This will hurt performance, but there is no clear way to fix this properly without a copy at the present time, and it is important to get rid of the data corruption.

With fixes from Herbert Xu.

Tested-by: Willy Tarreau <w@1wt.eu>
Foreseen-by: Changli Gao <xiaosuo@gmail.com>
Diagnosed-by: Willy Tarreau <w@1wt.eu>
Reported-by: Willy Tarreau <w@1wt.eu>
Fixed-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/core')
-rw-r--r-- net/core/skbuff.c 61
1 files changed, 29 insertions, 32 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 65eac7739033..56272ac6dfd8 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -73,17 +73,13 @@ static struct kmem_cache *skbuff_fclone_cache __read_mostly;
73static void sock_pipe_buf_release(struct pipe_inode_info *pipe, 73static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
74 struct pipe_buffer *buf) 74 struct pipe_buffer *buf)
75{ 75{
76 struct sk_buff *skb = (struct sk_buff *) buf->private; 76 put_page(buf->page);
77
78 kfree_skb(skb);
79} 77}
80 78
81static void sock_pipe_buf_get(struct pipe_inode_info *pipe, 79static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
82 struct pipe_buffer *buf) 80 struct pipe_buffer *buf)
83{ 81{
84 struct sk_buff *skb = (struct sk_buff *) buf->private; 82 get_page(buf->page);
85
86 skb_get(skb);
87} 83}
88 84
89static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, 85static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
@@ -1334,9 +1330,19 @@ fault:
1334 */ 1330 */
1335static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) 1331static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
1336{ 1332{
1337 struct sk_buff *skb = (struct sk_buff *) spd->partial[i].private; 1333 put_page(spd->pages[i]);
1334}
1338 1335
1339 kfree_skb(skb); 1336static inline struct page *linear_to_page(struct page *page, unsigned int len,
1337 unsigned int offset)
1338{
1339 struct page *p = alloc_pages(GFP_KERNEL, 0);
1340
1341 if (!p)
1342 return NULL;
1343 memcpy(page_address(p) + offset, page_address(page) + offset, len);
1344
1345 return p;
1340} 1346}
1341 1347
1342/* 1348/*
@@ -1344,16 +1350,23 @@ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
1344 */ 1350 */
1345static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, 1351static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
1346 unsigned int len, unsigned int offset, 1352 unsigned int len, unsigned int offset,
1347 struct sk_buff *skb) 1353 struct sk_buff *skb, int linear)
1348{ 1354{
1349 if (unlikely(spd->nr_pages == PIPE_BUFFERS)) 1355 if (unlikely(spd->nr_pages == PIPE_BUFFERS))
1350 return 1; 1356 return 1;
1351 1357
1358 if (linear) {
1359 page = linear_to_page(page, len, offset);
1360 if (!page)
1361 return 1;
1362 } else
1363 get_page(page);
1364
1352 spd->pages[spd->nr_pages] = page; 1365 spd->pages[spd->nr_pages] = page;
1353 spd->partial[spd->nr_pages].len = len; 1366 spd->partial[spd->nr_pages].len = len;
1354 spd->partial[spd->nr_pages].offset = offset; 1367 spd->partial[spd->nr_pages].offset = offset;
1355 spd->partial[spd->nr_pages].private = (unsigned long) skb_get(skb);
1356 spd->nr_pages++; 1368 spd->nr_pages++;
1369
1357 return 0; 1370 return 0;
1358} 1371}
1359 1372
@@ -1369,7 +1382,7 @@ static inline void __segment_seek(struct page **page, unsigned int *poff,
1369static inline int __splice_segment(struct page *page, unsigned int poff, 1382static inline int __splice_segment(struct page *page, unsigned int poff,
1370 unsigned int plen, unsigned int *off, 1383 unsigned int plen, unsigned int *off,
1371 unsigned int *len, struct sk_buff *skb, 1384 unsigned int *len, struct sk_buff *skb,
1372 struct splice_pipe_desc *spd) 1385 struct splice_pipe_desc *spd, int linear)
1373{ 1386{
1374 if (!*len) 1387 if (!*len)
1375 return 1; 1388 return 1;
@@ -1392,7 +1405,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
1392 /* the linear region may spread across several pages */ 1405 /* the linear region may spread across several pages */
1393 flen = min_t(unsigned int, flen, PAGE_SIZE - poff); 1406 flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
1394 1407
1395 if (spd_fill_page(spd, page, flen, poff, skb)) 1408 if (spd_fill_page(spd, page, flen, poff, skb, linear))
1396 return 1; 1409 return 1;
1397 1410
1398 __segment_seek(&page, &poff, &plen, flen); 1411 __segment_seek(&page, &poff, &plen, flen);
@@ -1419,7 +1432,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
1419 if (__splice_segment(virt_to_page(skb->data), 1432 if (__splice_segment(virt_to_page(skb->data),
1420 (unsigned long) skb->data & (PAGE_SIZE - 1), 1433 (unsigned long) skb->data & (PAGE_SIZE - 1),
1421 skb_headlen(skb), 1434 skb_headlen(skb),
1422 offset, len, skb, spd)) 1435 offset, len, skb, spd, 1))
1423 return 1; 1436 return 1;
1424 1437
1425 /* 1438 /*
@@ -1429,7 +1442,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
1429 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; 1442 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
1430 1443
1431 if (__splice_segment(f->page, f->page_offset, f->size, 1444 if (__splice_segment(f->page, f->page_offset, f->size,
1432 offset, len, skb, spd)) 1445 offset, len, skb, spd, 0))
1433 return 1; 1446 return 1;
1434 } 1447 }
1435 1448
@@ -1442,7 +1455,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
1442 * the frag list, if such a thing exists. We'd probably need to recurse to 1455 * the frag list, if such a thing exists. We'd probably need to recurse to
1443 * handle that cleanly. 1456 * handle that cleanly.
1444 */ 1457 */
1445int skb_splice_bits(struct sk_buff *__skb, unsigned int offset, 1458int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
1446 struct pipe_inode_info *pipe, unsigned int tlen, 1459 struct pipe_inode_info *pipe, unsigned int tlen,
1447 unsigned int flags) 1460 unsigned int flags)
1448{ 1461{
@@ -1455,16 +1468,6 @@ int skb_splice_bits(struct sk_buff *__skb, unsigned int offset,
1455 .ops = &sock_pipe_buf_ops, 1468 .ops = &sock_pipe_buf_ops,
1456 .spd_release = sock_spd_release, 1469 .spd_release = sock_spd_release,
1457 }; 1470 };
1458 struct sk_buff *skb;
1459
1460 /*
1461 * I'd love to avoid the clone here, but tcp_read_sock()
1462 * ignores reference counts and unconditonally kills the sk_buff
1463 * on return from the actor.
1464 */
1465 skb = skb_clone(__skb, GFP_KERNEL);
1466 if (unlikely(!skb))
1467 return -ENOMEM;
1468 1471
1469 /* 1472 /*
1470 * __skb_splice_bits() only fails if the output has no room left, 1473 * __skb_splice_bits() only fails if the output has no room left,
@@ -1488,15 +1491,9 @@ int skb_splice_bits(struct sk_buff *__skb, unsigned int offset,
1488 } 1491 }
1489 1492
1490done: 1493done:
1491 /*
1492 * drop our reference to the clone, the pipe consumption will
1493 * drop the rest.
1494 */
1495 kfree_skb(skb);
1496
1497 if (spd.nr_pages) { 1494 if (spd.nr_pages) {
1495 struct sock *sk = skb->sk;
1498 int ret; 1496 int ret;
1499 struct sock *sk = __skb->sk;
1500 1497
1501 /* 1498 /*
1502 * Drop the socket lock, otherwise we have reverse 1499 * Drop the socket lock, otherwise we have reverse