aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPatrick McHardy <kaber@trash.net>2007-06-25 07:35:20 -0400
committerDavid S. Miller <davem@sunset.davemloft.net>2007-07-11 01:15:37 -0400
commit334a8132d9950f769f390f0f35c233d099688e7a (patch)
tree0a9e988971d4c20e720e99bccfa6f5feeca5d94a
parente50c41b53d7aa48152dd9c633b04fc7abd536f1f (diff)
[SKBUFF]: Keep track of writable header len of headerless clones
Currently NAT (and others) that want to modify cloned skbs copy them, even if in the vast majority of cases it's not necessary because the skb is a clone made by TCP and the portion NAT wants to modify is actually writable because TCP releases the header reference before cloning. The problem is that there is no clean way for NAT to find out how long the writable header area is, so this patch introduces skb->hdr_len to hold this length. When a headerless skb is cloned skb->hdr_len is set to the current headroom; for regular clones it is copied from the original. A new function skb_clone_writable(skb, len) returns whether the skb is writable up to len bytes from skb->data. To avoid enlarging the skb the mac_len field is reduced to 16 bits and the new hdr_len field is put in the remaining 16 bits. I've done a few rough benchmarks of NAT (not with this exact patch, but a very similar one). As expected it saves huge amounts of system time in case of sendfile, bringing it down to basically the same amount as without NAT, with sendmsg it only helps on loopback, probably because of the large MTU. Transmit a 1GB file using sendfile/sendmsg over eth0/lo with and without NAT: - sendfile eth0, no NAT: sys 0m0.388s - sendfile eth0, NAT: sys 0m1.835s - sendfile eth0, NAT + patch: sys 0m0.370s (~ -80%) - sendfile lo, no NAT: sys 0m0.258s - sendfile lo, NAT: sys 0m2.609s - sendfile lo, NAT + patch: sys 0m0.260s (~ -90%) - sendmsg eth0, no NAT: sys 0m2.508s - sendmsg eth0, NAT: sys 0m2.539s - sendmsg eth0, NAT + patch: sys 0m2.445s (no change) - sendmsg lo, no NAT: sys 0m2.151s - sendmsg lo, NAT: sys 0m3.557s - sendmsg lo, NAT + patch: sys 0m2.159s (~ -40%) I expect other users can see a similar performance improvement, packet mangling iptables targets, ipip and ip_gre come to mind .. Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/linux/skbuff.h24
-rw-r--r--net/core/skbuff.c2
-rw-r--r--net/netfilter/core.c4
3 files changed, 25 insertions, 5 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6f0b2f7d0010..881fe80f01d0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -147,8 +147,8 @@ struct skb_shared_info {
147 147
148/* We divide dataref into two halves. The higher 16 bits hold references 148/* We divide dataref into two halves. The higher 16 bits hold references
149 * to the payload part of skb->data. The lower 16 bits hold references to 149 * to the payload part of skb->data. The lower 16 bits hold references to
150 * the entire skb->data. It is up to the users of the skb to agree on 150 * the entire skb->data. A clone of a headerless skb holds the length of
151 * where the payload starts. 151 * the header in skb->hdr_len.
152 * 152 *
153 * All users must obey the rule that the skb->data reference count must be 153 * All users must obey the rule that the skb->data reference count must be
154 * greater than or equal to the payload reference count. 154 * greater than or equal to the payload reference count.
@@ -206,6 +206,7 @@ typedef unsigned char *sk_buff_data_t;
206 * @len: Length of actual data 206 * @len: Length of actual data
207 * @data_len: Data length 207 * @data_len: Data length
208 * @mac_len: Length of link layer header 208 * @mac_len: Length of link layer header
209 * @hdr_len: writable header length of cloned skb
209 * @csum: Checksum (must include start/offset pair) 210 * @csum: Checksum (must include start/offset pair)
210 * @csum_start: Offset from skb->head where checksumming should start 211 * @csum_start: Offset from skb->head where checksumming should start
211 * @csum_offset: Offset from csum_start where checksum should be stored 212 * @csum_offset: Offset from csum_start where checksum should be stored
@@ -260,8 +261,9 @@ struct sk_buff {
260 char cb[48]; 261 char cb[48];
261 262
262 unsigned int len, 263 unsigned int len,
263 data_len, 264 data_len;
264 mac_len; 265 __u16 mac_len,
266 hdr_len;
265 union { 267 union {
266 __wsum csum; 268 __wsum csum;
267 struct { 269 struct {
@@ -1322,6 +1324,20 @@ static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev,
1322} 1324}
1323 1325
1324/** 1326/**
1327 * skb_clone_writable - is the header of a clone writable
1328 * @skb: buffer to check
1329 * @len: length up to which to write
1330 *
1331 * Returns true if modifying the header part of the cloned buffer
1332 * does not require the data to be copied.
1333 */
1334static inline int skb_clone_writable(struct sk_buff *skb, int len)
1335{
1336 return !skb_header_cloned(skb) &&
1337 skb_headroom(skb) + len <= skb->hdr_len;
1338}
1339
1340/**
1325 * skb_cow - copy header of skb when it is required 1341 * skb_cow - copy header of skb when it is required
1326 * @skb: buffer to cow 1342 * @skb: buffer to cow
1327 * @headroom: needed headroom 1343 * @headroom: needed headroom
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3943c3ad9145..c989c3a0f907 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -415,6 +415,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
415 C(csum); 415 C(csum);
416 C(local_df); 416 C(local_df);
417 n->cloned = 1; 417 n->cloned = 1;
418 n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
418 n->nohdr = 0; 419 n->nohdr = 0;
419 C(pkt_type); 420 C(pkt_type);
420 C(ip_summed); 421 C(ip_summed);
@@ -676,6 +677,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
676 skb->network_header += off; 677 skb->network_header += off;
677 skb->mac_header += off; 678 skb->mac_header += off;
678 skb->cloned = 0; 679 skb->cloned = 0;
680 skb->hdr_len = 0;
679 skb->nohdr = 0; 681 skb->nohdr = 0;
680 atomic_set(&skb_shinfo(skb)->dataref, 1); 682 atomic_set(&skb_shinfo(skb)->dataref, 1);
681 return 0; 683 return 0;
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index a84478ee2ded..3aaabec70d19 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -203,7 +203,9 @@ int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len)
203 return 0; 203 return 0;
204 204
205 /* Not exclusive use of packet? Must copy. */ 205 /* Not exclusive use of packet? Must copy. */
206 if (skb_shared(*pskb) || skb_cloned(*pskb)) 206 if (skb_cloned(*pskb) && !skb_clone_writable(*pskb, writable_len))
207 goto copy_skb;
208 if (skb_shared(*pskb))
207 goto copy_skb; 209 goto copy_skb;
208 210
209 return pskb_may_pull(*pskb, writable_len); 211 return pskb_may_pull(*pskb, writable_len);