aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJerry Chu <hkchu@google.com>2014-01-07 13:23:19 -0500
committerDavid S. Miller <davem@davemloft.net>2014-01-07 16:21:31 -0500
commitbf5a755f5e9186406bbf50f4087100af5bd68e40 (patch)
treec971c1aafbcb999a65b5f088bf2627c48006072a
parentcdb3f4a31b64c3a1c6eef40bc01ebc9594c58a8c (diff)
net-gre-gro: Add GRE support to the GRO stack
This patch builds on top of commit 299603e8370a93dd5d8e8d800f0dff1ce2c53d36 ("net-gro: Prepare GRO stack for the upcoming tunneling support") to add the support of the standard GRE (RFC1701/RFC2784/RFC2890) to the GRO stack. It also serves as an example for supporting other encapsulation protocols in the GRO stack in the future. The patch supports version 0 and all the flags (key, csum, seq#) but will flush any pkt with the S (seq#) flag. This is because the S flag is not supported by GSO, and a GRO pkt may end up in the forwarding path, thus requiring GSO support to break it up correctly. Currently the "packet_offload" structure only contains L3 (ETH_P_IP/ETH_P_IPV6) GRO offload support so the encapped pkts are limited to IP pkts (i.e., w/o L2 hdr). But support for other protocol types can be easily added, as can support for GRE variations like NVGRE. The patch also supports csum offload. Specifically if the csum flag is on and the h/w is capable of checksumming the payload (CHECKSUM_COMPLETE), the code will take advantage of the csum computed by the h/w when validating the GRE csum. Note that commit 60769a5dcd8755715c7143b4571d5c44f01796f1 "ipv4: gre: add GRO capability" already introduces GRO capability to IPv4 GRE tunnels, using the gro_cells infrastructure. But GRO is done after GRE hdr has been removed (i.e., decapped). The following patch applies GRO when pkts first come in (before hitting the GRE tunnel code). There is some performance advantage for applying GRO as early as possible. Also this approach is transparent to other subsystems like Open vSwitch where GRE decap is handled outside of the IP stack hence making it harder for the gro_cells stuff to apply. On the other hand, some NICs are still not capable of hashing on the inner hdr of a GRE pkt (RSS). In that case the GRO processing of pkts from the same remote host will all happen on the same CPU and the performance may be suboptimal. I'm including some rough preliminary performance numbers below. 
Note that the performance will be highly dependent on traffic load and mix, as usual. Moreover it also depends on NIC offload features hence the following is by no means a comprehensive study. Local testing and tuning will be needed to decide the best setting. All tests spawned 50 copies of netperf TCP_STREAM and ran for 30 secs. (super_netperf 50 -H 192.168.1.18 -l 30) An IP GRE tunnel with only the key flag on (e.g., ip tunnel add gre1 mode gre local 10.246.17.18 remote 10.246.17.17 ttl 255 key 123) is configured. The GRO support for pkts AFTER decap is controlled through the device feature of the GRE device (e.g., ethtool -K gre1 gro on/off). 1.1 ethtool -K gre1 gro off; ethtool -K eth0 gro off thruput: 9.16Gbps CPU utilization: 19% 1.2 ethtool -K gre1 gro on; ethtool -K eth0 gro off thruput: 5.9Gbps CPU utilization: 15% 1.3 ethtool -K gre1 gro off; ethtool -K eth0 gro on thruput: 9.26Gbps CPU utilization: 12-13% 1.4 ethtool -K gre1 gro on; ethtool -K eth0 gro on thruput: 9.26Gbps CPU utilization: 10% The following tests were performed on a different NIC that is capable of csum offload. I.e., the h/w is capable of computing IP payload csum (CHECKSUM_COMPLETE). 
2.1 ethtool -K gre1 gro on (hence will use gro_cells) 2.1.1 ethtool -K eth0 gro off; csum offload disabled thruput: 8.53Gbps CPU utilization: 9% 2.1.2 ethtool -K eth0 gro off; csum offload enabled thruput: 8.97Gbps CPU utilization: 7-8% 2.1.3 ethtool -K eth0 gro on; csum offload disabled thruput: 8.83Gbps CPU utilization: 5-6% 2.1.4 ethtool -K eth0 gro on; csum offload enabled thruput: 8.98Gbps CPU utilization: 5% 2.2 ethtool -K gre1 gro off 2.2.1 ethtool -K eth0 gro off; csum offload disabled thruput: 5.93Gbps CPU utilization: 9% 2.2.2 ethtool -K eth0 gro off; csum offload enabled thruput: 5.62Gbps CPU utilization: 8% 2.2.3 ethtool -K eth0 gro on; csum offload disabled thruput: 7.69Gbps CPU utilization: 8% 2.2.4 ethtool -K eth0 gro on; csum offload enabled thruput: 8.96Gbps CPU utilization: 5-6% Signed-off-by: H.K. Jerry Chu <hkchu@google.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/linux/netdevice.h18
-rw-r--r--net/core/dev.c26
-rw-r--r--net/ipv4/af_inet.c10
-rw-r--r--net/ipv4/gre_offload.c160
-rw-r--r--net/ipv4/tcp_offload.c7
-rw-r--r--net/ipv6/ip6_offload.c2
6 files changed, 216 insertions, 7 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d9c961aa6a7f..a2a70cc70e7b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1632,7 +1632,10 @@ struct napi_gro_cb {
1632 int data_offset; 1632 int data_offset;
1633 1633
1634 /* This is non-zero if the packet cannot be merged with the new skb. */ 1634 /* This is non-zero if the packet cannot be merged with the new skb. */
1635 int flush; 1635 u16 flush;
1636
1637 /* Save the IP ID here and check when we get to the transport layer */
1638 u16 flush_id;
1636 1639
1637 /* Number of segments aggregated. */ 1640 /* Number of segments aggregated. */
1638 u16 count; 1641 u16 count;
@@ -1651,6 +1654,9 @@ struct napi_gro_cb {
1651 /* Used in ipv6_gro_receive() */ 1654 /* Used in ipv6_gro_receive() */
1652 int proto; 1655 int proto;
1653 1656
1657 /* used to support CHECKSUM_COMPLETE for tunneling protocols */
1658 __wsum csum;
1659
1654 /* used in skb_gro_receive() slow path */ 1660 /* used in skb_gro_receive() slow path */
1655 struct sk_buff *last; 1661 struct sk_buff *last;
1656}; 1662};
@@ -1900,6 +1906,14 @@ static inline void *skb_gro_network_header(struct sk_buff *skb)
1900 skb_network_offset(skb); 1906 skb_network_offset(skb);
1901} 1907}
1902 1908
1909static inline void skb_gro_postpull_rcsum(struct sk_buff *skb,
1910 const void *start, unsigned int len)
1911{
1912 if (skb->ip_summed == CHECKSUM_COMPLETE)
1913 NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum,
1914 csum_partial(start, len, 0));
1915}
1916
1903static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, 1917static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
1904 unsigned short type, 1918 unsigned short type,
1905 const void *daddr, const void *saddr, 1919 const void *daddr, const void *saddr,
@@ -2440,6 +2454,8 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
2440void napi_gro_flush(struct napi_struct *napi, bool flush_old); 2454void napi_gro_flush(struct napi_struct *napi, bool flush_old);
2441struct sk_buff *napi_get_frags(struct napi_struct *napi); 2455struct sk_buff *napi_get_frags(struct napi_struct *napi);
2442gro_result_t napi_gro_frags(struct napi_struct *napi); 2456gro_result_t napi_gro_frags(struct napi_struct *napi);
2457struct packet_offload *gro_find_receive_by_type(__be16 type);
2458struct packet_offload *gro_find_complete_by_type(__be16 type);
2443 2459
2444static inline void napi_free_frags(struct napi_struct *napi) 2460static inline void napi_free_frags(struct napi_struct *napi)
2445{ 2461{
diff --git a/net/core/dev.c b/net/core/dev.c
index b3c574a88026..ce01847793c0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3846,6 +3846,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
3846 3846
3847 skb_gro_reset_offset(skb); 3847 skb_gro_reset_offset(skb);
3848 gro_list_prepare(napi, skb); 3848 gro_list_prepare(napi, skb);
3849 NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
3849 3850
3850 rcu_read_lock(); 3851 rcu_read_lock();
3851 list_for_each_entry_rcu(ptype, head, list) { 3852 list_for_each_entry_rcu(ptype, head, list) {
@@ -3922,6 +3923,31 @@ normal:
3922 goto pull; 3923 goto pull;
3923} 3924}
3924 3925
3926struct packet_offload *gro_find_receive_by_type(__be16 type)
3927{
3928 struct list_head *offload_head = &offload_base;
3929 struct packet_offload *ptype;
3930
3931 list_for_each_entry_rcu(ptype, offload_head, list) {
3932 if (ptype->type != type || !ptype->callbacks.gro_receive)
3933 continue;
3934 return ptype;
3935 }
3936 return NULL;
3937}
3938
3939struct packet_offload *gro_find_complete_by_type(__be16 type)
3940{
3941 struct list_head *offload_head = &offload_base;
3942 struct packet_offload *ptype;
3943
3944 list_for_each_entry_rcu(ptype, offload_head, list) {
3945 if (ptype->type != type || !ptype->callbacks.gro_complete)
3946 continue;
3947 return ptype;
3948 }
3949 return NULL;
3950}
3925 3951
3926static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) 3952static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3927{ 3953{
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b8bc1a3d5cf1..6268a4751e64 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1391,9 +1391,15 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1391 NAPI_GRO_CB(p)->flush |= 1391 NAPI_GRO_CB(p)->flush |=
1392 (iph->ttl ^ iph2->ttl) | 1392 (iph->ttl ^ iph2->ttl) |
1393 (iph->tos ^ iph2->tos) | 1393 (iph->tos ^ iph2->tos) |
1394 (__force int)((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)) | 1394 ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
1395 ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
1396 1395
1396 /* Save the IP ID check to be included later when we get to
1397 * the transport layer so only the inner most IP ID is checked.
1398 * This is because some GSO/TSO implementations do not
1399 * correctly increment the IP ID for the outer hdrs.
1400 */
1401 NAPI_GRO_CB(p)->flush_id =
1402 ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
1397 NAPI_GRO_CB(p)->flush |= flush; 1403 NAPI_GRO_CB(p)->flush |= flush;
1398 } 1404 }
1399 1405
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 9138cfb10140..746a7b10d434 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -116,10 +116,170 @@ out:
116 return segs; 116 return segs;
117} 117}
118 118
119/* Compute the whole skb csum in s/w and store it, then verify GRO csum
120 * starting from gro_offset.
121 */
122static __sum16 gro_skb_checksum(struct sk_buff *skb)
123{
124 __sum16 sum;
125
126 skb->csum = skb_checksum(skb, 0, skb->len, 0);
127 NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum,
128 csum_partial(skb->data, skb_gro_offset(skb), 0));
129 sum = csum_fold(NAPI_GRO_CB(skb)->csum);
130 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) {
131 if (unlikely(!sum))
132 netdev_rx_csum_fault(skb->dev);
133 } else
134 skb->ip_summed = CHECKSUM_COMPLETE;
135
136 return sum;
137}
138
139static struct sk_buff **gre_gro_receive(struct sk_buff **head,
140 struct sk_buff *skb)
141{
142 struct sk_buff **pp = NULL;
143 struct sk_buff *p;
144 const struct gre_base_hdr *greh;
145 unsigned int hlen, grehlen;
146 unsigned int off;
147 int flush = 1;
148 struct packet_offload *ptype;
149 __be16 type;
150
151 off = skb_gro_offset(skb);
152 hlen = off + sizeof(*greh);
153 greh = skb_gro_header_fast(skb, off);
154 if (skb_gro_header_hard(skb, hlen)) {
155 greh = skb_gro_header_slow(skb, hlen, off);
156 if (unlikely(!greh))
157 goto out;
158 }
159
160 /* Only support version 0 and K (key), C (csum) flags. Note that
161 * although the support for the S (seq#) flag can be added easily
162 * for GRO, this is problematic for GSO hence can not be enabled
163 * here because a GRO pkt may end up in the forwarding path, thus
164 * requiring GSO support to break it up correctly.
165 */
166 if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0)
167 goto out;
168
169 type = greh->protocol;
170
171 rcu_read_lock();
172 ptype = gro_find_receive_by_type(type);
173 if (ptype == NULL)
174 goto out_unlock;
175
176 grehlen = GRE_HEADER_SECTION;
177
178 if (greh->flags & GRE_KEY)
179 grehlen += GRE_HEADER_SECTION;
180
181 if (greh->flags & GRE_CSUM)
182 grehlen += GRE_HEADER_SECTION;
183
184 hlen = off + grehlen;
185 if (skb_gro_header_hard(skb, hlen)) {
186 greh = skb_gro_header_slow(skb, hlen, off);
187 if (unlikely(!greh))
188 goto out_unlock;
189 }
190 if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */
191 __sum16 csum = 0;
192
193 if (skb->ip_summed == CHECKSUM_COMPLETE)
194 csum = csum_fold(NAPI_GRO_CB(skb)->csum);
195 /* Don't trust csum error calculated/reported by h/w */
196 if (skb->ip_summed == CHECKSUM_NONE || csum != 0)
197 csum = gro_skb_checksum(skb);
198
199 /* GRE CSUM is the 1's complement of the 1's complement sum
200 * of the GRE hdr plus payload so it should add up to 0xffff
201 * (and 0 after csum_fold()) just like the IPv4 hdr csum.
202 */
203 if (csum)
204 goto out_unlock;
205 }
206 flush = 0;
207
208 for (p = *head; p; p = p->next) {
209 const struct gre_base_hdr *greh2;
210
211 if (!NAPI_GRO_CB(p)->same_flow)
212 continue;
213
214 /* The following checks are needed to ensure only pkts
215 * from the same tunnel are considered for aggregation.
216 * The criteria for "the same tunnel" includes:
217 * 1) same version (we only support version 0 here)
218 * 2) same protocol (we only support ETH_P_IP for now)
219 * 3) same set of flags
220 * 4) same key if the key field is present.
221 */
222 greh2 = (struct gre_base_hdr *)(p->data + off);
223
224 if (greh2->flags != greh->flags ||
225 greh2->protocol != greh->protocol) {
226 NAPI_GRO_CB(p)->same_flow = 0;
227 continue;
228 }
229 if (greh->flags & GRE_KEY) {
230 /* compare keys */
231 if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) {
232 NAPI_GRO_CB(p)->same_flow = 0;
233 continue;
234 }
235 }
236 }
237
238 skb_gro_pull(skb, grehlen);
239
240 /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
241 skb_gro_postpull_rcsum(skb, greh, grehlen);
242
243 pp = ptype->callbacks.gro_receive(head, skb);
244
245out_unlock:
246 rcu_read_unlock();
247out:
248 NAPI_GRO_CB(skb)->flush |= flush;
249
250 return pp;
251}
252
253int gre_gro_complete(struct sk_buff *skb, int nhoff)
254{
255 struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff);
256 struct packet_offload *ptype;
257 unsigned int grehlen = sizeof(*greh);
258 int err = -ENOENT;
259 __be16 type;
260
261 type = greh->protocol;
262 if (greh->flags & GRE_KEY)
263 grehlen += GRE_HEADER_SECTION;
264
265 if (greh->flags & GRE_CSUM)
266 grehlen += GRE_HEADER_SECTION;
267
268 rcu_read_lock();
269 ptype = gro_find_complete_by_type(type);
270 if (ptype != NULL)
271 err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
272
273 rcu_read_unlock();
274 return err;
275}
276
119static const struct net_offload gre_offload = { 277static const struct net_offload gre_offload = {
120 .callbacks = { 278 .callbacks = {
121 .gso_send_check = gre_gso_send_check, 279 .gso_send_check = gre_gso_send_check,
122 .gso_segment = gre_gso_segment, 280 .gso_segment = gre_gso_segment,
281 .gro_receive = gre_gro_receive,
282 .gro_complete = gre_gro_complete,
123 }, 283 },
124}; 284};
125 285
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 2658a27f540d..771a3950d87a 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -197,7 +197,8 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
197 goto out_check_final; 197 goto out_check_final;
198 198
199found: 199found:
200 flush = NAPI_GRO_CB(p)->flush; 200 /* Include the IP ID check below from the inner most IP hdr */
201 flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
201 flush |= (__force int)(flags & TCP_FLAG_CWR); 202 flush |= (__force int)(flags & TCP_FLAG_CWR);
202 flush |= (__force int)((flags ^ tcp_flag_word(th2)) & 203 flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
203 ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); 204 ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
@@ -230,7 +231,7 @@ out_check_final:
230 pp = head; 231 pp = head;
231 232
232out: 233out:
233 NAPI_GRO_CB(skb)->flush |= flush; 234 NAPI_GRO_CB(skb)->flush |= (flush != 0);
234 235
235 return pp; 236 return pp;
236} 237}
@@ -280,7 +281,7 @@ static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *
280 if (NAPI_GRO_CB(skb)->flush) 281 if (NAPI_GRO_CB(skb)->flush)
281 goto skip_csum; 282 goto skip_csum;
282 283
283 wsum = skb->csum; 284 wsum = NAPI_GRO_CB(skb)->csum;
284 285
285 switch (skb->ip_summed) { 286 switch (skb->ip_summed) {
286 case CHECKSUM_NONE: 287 case CHECKSUM_NONE:
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 6fb4162fa785..1e8683b135bb 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -190,7 +190,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
190 unsigned int nlen; 190 unsigned int nlen;
191 unsigned int hlen; 191 unsigned int hlen;
192 unsigned int off; 192 unsigned int off;
193 int flush = 1; 193 u16 flush = 1;
194 int proto; 194 int proto;
195 __wsum csum; 195 __wsum csum;
196 196