aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author David S. Miller <davem@davemloft.net> 2014-05-07 15:49:16 -0400
committer David S. Miller <davem@davemloft.net> 2014-05-07 15:49:16 -0400
commit d32aebfd64c8c8649b39cd6789b141525cc9d7c0 (patch)
tree 965fe4107b8a28f594832657610a74152e3f27a1
parent 418a31561d594a2b636c1e2fa94ecd9e1245abb1 (diff)
parent c1e756bfcbcac838a86a23f3e4501b556a961e3c (diff)
Merge branch 'gso_forward'
Florian Westphal says:

====================
net: ip: push gso skb forwarding handling down the stack

Turns out doing the segmentation in forwarding was not a bright idea; there are corner-cases where this has unintended side-effects.

This patch pushes the segmentation downwards.

After this, the netif_skb_dev_features() function can be removed again; it was only added to fetch the features of the output device, and we can just use skb->dev after the pushdown.

Tested with the following setup:

host -> kvm_router -> kvm_host
     mtu 1500      mtu 1280

- 'host' has route to kvm_host with locked mtu of 1500
- gso/gro enabled on all interfaces

Did tests with all of the following combinations:
- netfilter conntrack off and on on kvm_router
- virtio-net and e1000 driver on kvm_router
- tcp and udp bulk xmit from host to kvm_host

For tcp, I added TCPMSS mangling on kvm_host to make it lie about tcp mss.

Also added a dummy '-t mangle -A POSTROUTING -p udp -f' rule to make sure no udp fragments are seen in the 'conntrack on' and 'virtio-net' case.

Also checked (with 'ping -M do -s 1400') that it still sends the wanted icmp error message when size exceeds 1280.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- include/linux/netdevice.h 7
-rw-r--r-- net/core/dev.c 22
-rw-r--r-- net/ipv4/ip_forward.c 50
-rw-r--r-- net/ipv4/ip_output.c 51
4 files changed, 59 insertions, 71 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7ed3a3aa6604..20e99efb1ca6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3180,12 +3180,7 @@ void netdev_change_features(struct net_device *dev);
3180void netif_stacked_transfer_operstate(const struct net_device *rootdev, 3180void netif_stacked_transfer_operstate(const struct net_device *rootdev,
3181 struct net_device *dev); 3181 struct net_device *dev);
3182 3182
3183netdev_features_t netif_skb_dev_features(struct sk_buff *skb, 3183netdev_features_t netif_skb_features(struct sk_buff *skb);
3184 const struct net_device *dev);
3185static inline netdev_features_t netif_skb_features(struct sk_buff *skb)
3186{
3187 return netif_skb_dev_features(skb, skb->dev);
3188}
3189 3184
3190static inline bool net_gso_ok(netdev_features_t features, int gso_type) 3185static inline bool net_gso_ok(netdev_features_t features, int gso_type)
3191{ 3186{
diff --git a/net/core/dev.c b/net/core/dev.c
index d2c8a06b3a98..c619b8641337 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2418,7 +2418,7 @@ EXPORT_SYMBOL(netdev_rx_csum_fault);
2418 * 2. No high memory really exists on this machine. 2418 * 2. No high memory really exists on this machine.
2419 */ 2419 */
2420 2420
2421static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb) 2421static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2422{ 2422{
2423#ifdef CONFIG_HIGHMEM 2423#ifdef CONFIG_HIGHMEM
2424 int i; 2424 int i;
@@ -2493,38 +2493,36 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2493} 2493}
2494 2494
2495static netdev_features_t harmonize_features(struct sk_buff *skb, 2495static netdev_features_t harmonize_features(struct sk_buff *skb,
2496 const struct net_device *dev, 2496 netdev_features_t features)
2497 netdev_features_t features)
2498{ 2497{
2499 int tmp; 2498 int tmp;
2500 2499
2501 if (skb->ip_summed != CHECKSUM_NONE && 2500 if (skb->ip_summed != CHECKSUM_NONE &&
2502 !can_checksum_protocol(features, skb_network_protocol(skb, &tmp))) { 2501 !can_checksum_protocol(features, skb_network_protocol(skb, &tmp))) {
2503 features &= ~NETIF_F_ALL_CSUM; 2502 features &= ~NETIF_F_ALL_CSUM;
2504 } else if (illegal_highdma(dev, skb)) { 2503 } else if (illegal_highdma(skb->dev, skb)) {
2505 features &= ~NETIF_F_SG; 2504 features &= ~NETIF_F_SG;
2506 } 2505 }
2507 2506
2508 return features; 2507 return features;
2509} 2508}
2510 2509
2511netdev_features_t netif_skb_dev_features(struct sk_buff *skb, 2510netdev_features_t netif_skb_features(struct sk_buff *skb)
2512 const struct net_device *dev)
2513{ 2511{
2514 __be16 protocol = skb->protocol; 2512 __be16 protocol = skb->protocol;
2515 netdev_features_t features = dev->features; 2513 netdev_features_t features = skb->dev->features;
2516 2514
2517 if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs) 2515 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2518 features &= ~NETIF_F_GSO_MASK; 2516 features &= ~NETIF_F_GSO_MASK;
2519 2517
2520 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { 2518 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2521 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; 2519 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2522 protocol = veh->h_vlan_encapsulated_proto; 2520 protocol = veh->h_vlan_encapsulated_proto;
2523 } else if (!vlan_tx_tag_present(skb)) { 2521 } else if (!vlan_tx_tag_present(skb)) {
2524 return harmonize_features(skb, dev, features); 2522 return harmonize_features(skb, features);
2525 } 2523 }
2526 2524
2527 features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | 2525 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2528 NETIF_F_HW_VLAN_STAG_TX); 2526 NETIF_F_HW_VLAN_STAG_TX);
2529 2527
2530 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) 2528 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
@@ -2532,9 +2530,9 @@ netdev_features_t netif_skb_dev_features(struct sk_buff *skb,
2532 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | 2530 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2533 NETIF_F_HW_VLAN_STAG_TX; 2531 NETIF_F_HW_VLAN_STAG_TX;
2534 2532
2535 return harmonize_features(skb, dev, features); 2533 return harmonize_features(skb, features);
2536} 2534}
2537EXPORT_SYMBOL(netif_skb_dev_features); 2535EXPORT_SYMBOL(netif_skb_features);
2538 2536
2539int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, 2537int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2540 struct netdev_queue *txq) 2538 struct netdev_queue *txq)
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index c29ae8371e44..6f111e48e11c 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -56,53 +56,6 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
56 return true; 56 return true;
57} 57}
58 58
59static bool ip_gso_exceeds_dst_mtu(const struct sk_buff *skb)
60{
61 unsigned int mtu;
62
63 if (skb->local_df || !skb_is_gso(skb))
64 return false;
65
66 mtu = ip_dst_mtu_maybe_forward(skb_dst(skb), true);
67
68 /* if seglen > mtu, do software segmentation for IP fragmentation on
69 * output. DF bit cannot be set since ip_forward would have sent
70 * icmp error.
71 */
72 return skb_gso_network_seglen(skb) > mtu;
73}
74
75/* called if GSO skb needs to be fragmented on forward */
76static int ip_forward_finish_gso(struct sk_buff *skb)
77{
78 struct dst_entry *dst = skb_dst(skb);
79 netdev_features_t features;
80 struct sk_buff *segs;
81 int ret = 0;
82
83 features = netif_skb_dev_features(skb, dst->dev);
84 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
85 if (IS_ERR(segs)) {
86 kfree_skb(skb);
87 return -ENOMEM;
88 }
89
90 consume_skb(skb);
91
92 do {
93 struct sk_buff *nskb = segs->next;
94 int err;
95
96 segs->next = NULL;
97 err = dst_output(segs);
98
99 if (err && ret == 0)
100 ret = err;
101 segs = nskb;
102 } while (segs);
103
104 return ret;
105}
106 59
107static int ip_forward_finish(struct sk_buff *skb) 60static int ip_forward_finish(struct sk_buff *skb)
108{ 61{
@@ -114,9 +67,6 @@ static int ip_forward_finish(struct sk_buff *skb)
114 if (unlikely(opt->optlen)) 67 if (unlikely(opt->optlen))
115 ip_forward_options(skb); 68 ip_forward_options(skb);
116 69
117 if (ip_gso_exceeds_dst_mtu(skb))
118 return ip_forward_finish_gso(skb);
119
120 return dst_output(skb); 70 return dst_output(skb);
121} 71}
122 72
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 1cbeba5edff9..a52f50187b54 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -211,6 +211,48 @@ static inline int ip_finish_output2(struct sk_buff *skb)
211 return -EINVAL; 211 return -EINVAL;
212} 212}
213 213
214static int ip_finish_output_gso(struct sk_buff *skb)
215{
216 netdev_features_t features;
217 struct sk_buff *segs;
218 int ret = 0;
219
220 /* common case: locally created skb or seglen is <= mtu */
221 if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
222 skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
223 return ip_finish_output2(skb);
224
225 /* Slowpath - GSO segment length is exceeding the dst MTU.
226 *
227 * This can happen in two cases:
228 * 1) TCP GRO packet, DF bit not set
229 * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
230 * from host network stack.
231 */
232 features = netif_skb_features(skb);
233 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
234 if (IS_ERR(segs)) {
235 kfree_skb(skb);
236 return -ENOMEM;
237 }
238
239 consume_skb(skb);
240
241 do {
242 struct sk_buff *nskb = segs->next;
243 int err;
244
245 segs->next = NULL;
246 err = ip_fragment(segs, ip_finish_output2);
247
248 if (err && ret == 0)
249 ret = err;
250 segs = nskb;
251 } while (segs);
252
253 return ret;
254}
255
214static int ip_finish_output(struct sk_buff *skb) 256static int ip_finish_output(struct sk_buff *skb)
215{ 257{
216#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) 258#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
@@ -220,10 +262,13 @@ static int ip_finish_output(struct sk_buff *skb)
220 return dst_output(skb); 262 return dst_output(skb);
221 } 263 }
222#endif 264#endif
223 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) 265 if (skb_is_gso(skb))
266 return ip_finish_output_gso(skb);
267
268 if (skb->len > ip_skb_dst_mtu(skb))
224 return ip_fragment(skb, ip_finish_output2); 269 return ip_fragment(skb, ip_finish_output2);
225 else 270
226 return ip_finish_output2(skb); 271 return ip_finish_output2(skb);
227} 272}
228 273
229int ip_mc_output(struct sock *sk, struct sk_buff *skb) 274int ip_mc_output(struct sock *sk, struct sk_buff *skb)