aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2012-06-20 00:22:05 -0400
committerDavid S. Miller <davem@davemloft.net>2012-06-20 00:22:05 -0400
commit41063e9dd11956f2d285e12e4342e1d232ba0ea2 (patch)
treed4df2f51044b4724a4767f0498c3df2f606b5ad7 /net/ipv4
parentf9242b6b28d61295f2bf7e8adfb1060b382e5381 (diff)
ipv4: Early TCP socket demux.
Input packet processing for local sockets involves two major demuxes. One for the route and one for the socket. But we can optimize this down to one demux for certain kinds of local sockets. Currently we only do this for established TCP sockets, but it could at least in theory be expanded to other kinds of connections. If a TCP socket is established then its identity is fully specified. This means that whatever input route was used during the three-way handshake must work equally well for the rest of the connection since the keys will not change. Once we move to established state, we cache the receive packet's input route to use later. Like the existing cached route in sk->sk_dst_cache used for output packets, we have to check for route invalidations using dst->obsolete and dst->ops->check(). Early demux occurs outside of a socket locked section, so when a route invalidation occurs we defer the fixup of sk->sk_rx_dst until we are actually inside of established state packet processing and thus have the socket locked. Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c18
-rw-r--r--net/ipv4/ip_input.c39
-rw-r--r--net/ipv4/tcp_input.c16
-rw-r--r--net/ipv4/tcp_ipv4.c46
-rw-r--r--net/ipv4/tcp_minisocks.c2
5 files changed, 99 insertions, 22 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 85a3b1763136..07a02f6e9696 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
157 157
158 kfree(rcu_dereference_protected(inet->inet_opt, 1)); 158 kfree(rcu_dereference_protected(inet->inet_opt, 1));
159 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); 159 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
160 dst_release(sk->sk_rx_dst);
160 sk_refcnt_debug_dec(sk); 161 sk_refcnt_debug_dec(sk);
161} 162}
162EXPORT_SYMBOL(inet_sock_destruct); 163EXPORT_SYMBOL(inet_sock_destruct);
@@ -1518,14 +1519,15 @@ static const struct net_protocol igmp_protocol = {
1518#endif 1519#endif
1519 1520
1520static const struct net_protocol tcp_protocol = { 1521static const struct net_protocol tcp_protocol = {
1521 .handler = tcp_v4_rcv, 1522 .early_demux = tcp_v4_early_demux,
1522 .err_handler = tcp_v4_err, 1523 .handler = tcp_v4_rcv,
1523 .gso_send_check = tcp_v4_gso_send_check, 1524 .err_handler = tcp_v4_err,
1524 .gso_segment = tcp_tso_segment, 1525 .gso_send_check = tcp_v4_gso_send_check,
1525 .gro_receive = tcp4_gro_receive, 1526 .gso_segment = tcp_tso_segment,
1526 .gro_complete = tcp4_gro_complete, 1527 .gro_receive = tcp4_gro_receive,
1527 .no_policy = 1, 1528 .gro_complete = tcp4_gro_complete,
1528 .netns_ok = 1, 1529 .no_policy = 1,
1530 .netns_ok = 1,
1529}; 1531};
1530 1532
1531static const struct net_protocol udp_protocol = { 1533static const struct net_protocol udp_protocol = {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index c4fe1d271131..93b092c9a394 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -323,19 +323,32 @@ static int ip_rcv_finish(struct sk_buff *skb)
323 * how the packet travels inside Linux networking. 323 * how the packet travels inside Linux networking.
324 */ 324 */
325 if (skb_dst(skb) == NULL) { 325 if (skb_dst(skb) == NULL) {
326 int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, 326 const struct net_protocol *ipprot;
327 iph->tos, skb->dev); 327 int protocol = iph->protocol;
328 if (unlikely(err)) { 328 int err;
329 if (err == -EHOSTUNREACH) 329
330 IP_INC_STATS_BH(dev_net(skb->dev), 330 rcu_read_lock();
331 IPSTATS_MIB_INADDRERRORS); 331 ipprot = rcu_dereference(inet_protos[protocol]);
332 else if (err == -ENETUNREACH) 332 err = -ENOENT;
333 IP_INC_STATS_BH(dev_net(skb->dev), 333 if (ipprot && ipprot->early_demux)
334 IPSTATS_MIB_INNOROUTES); 334 err = ipprot->early_demux(skb);
335 else if (err == -EXDEV) 335 rcu_read_unlock();
336 NET_INC_STATS_BH(dev_net(skb->dev), 336
337 LINUX_MIB_IPRPFILTER); 337 if (err) {
338 goto drop; 338 err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
339 iph->tos, skb->dev);
340 if (unlikely(err)) {
341 if (err == -EHOSTUNREACH)
342 IP_INC_STATS_BH(dev_net(skb->dev),
343 IPSTATS_MIB_INADDRERRORS);
344 else if (err == -ENETUNREACH)
345 IP_INC_STATS_BH(dev_net(skb->dev),
346 IPSTATS_MIB_INNOROUTES);
347 else if (err == -EXDEV)
348 NET_INC_STATS_BH(dev_net(skb->dev),
349 LINUX_MIB_IPRPFILTER);
350 goto drop;
351 }
339 } 352 }
340 } 353 }
341 354
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b224eb8bce8b..8416f8a68e65 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5518,6 +5518,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5518 struct tcp_sock *tp = tcp_sk(sk); 5518 struct tcp_sock *tp = tcp_sk(sk);
5519 int res; 5519 int res;
5520 5520
5521 if (sk->sk_rx_dst) {
5522 struct dst_entry *dst = sk->sk_rx_dst;
5523 if (unlikely(dst->obsolete)) {
5524 if (dst->ops->check(dst, 0) == NULL) {
5525 dst_release(dst);
5526 sk->sk_rx_dst = NULL;
5527 }
5528 }
5529 }
5530 if (unlikely(sk->sk_rx_dst == NULL))
5531 sk->sk_rx_dst = dst_clone(skb_dst(skb));
5532
5521 /* 5533 /*
5522 * Header prediction. 5534 * Header prediction.
5523 * The code loosely follows the one in the famous 5535 * The code loosely follows the one in the famous
@@ -5729,8 +5741,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5729 5741
5730 tcp_set_state(sk, TCP_ESTABLISHED); 5742 tcp_set_state(sk, TCP_ESTABLISHED);
5731 5743
5732 if (skb != NULL) 5744 if (skb != NULL) {
5745 sk->sk_rx_dst = dst_clone(skb_dst(skb));
5733 security_inet_conn_established(sk, skb); 5746 security_inet_conn_established(sk, skb);
5747 }
5734 5748
5735 /* Make sure socket is routed, for correct metrics. */ 5749 /* Make sure socket is routed, for correct metrics. */
5736 icsk->icsk_af_ops->rebuild_header(sk); 5750 icsk->icsk_af_ops->rebuild_header(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fda2ca17135e..13857df1dae1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1671,6 +1671,52 @@ csum_err:
1671} 1671}
1672EXPORT_SYMBOL(tcp_v4_do_rcv); 1672EXPORT_SYMBOL(tcp_v4_do_rcv);
1673 1673
1674int tcp_v4_early_demux(struct sk_buff *skb)
1675{
1676 struct net *net = dev_net(skb->dev);
1677 const struct iphdr *iph;
1678 const struct tcphdr *th;
1679 struct sock *sk;
1680 int err;
1681
1682 err = -ENOENT;
1683 if (skb->pkt_type != PACKET_HOST)
1684 goto out_err;
1685
1686 if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1687 goto out_err;
1688
1689 iph = ip_hdr(skb);
1690 th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1691
1692 if (th->doff < sizeof(struct tcphdr) / 4)
1693 goto out_err;
1694
1695 if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
1696 goto out_err;
1697
1698 sk = __inet_lookup_established(net, &tcp_hashinfo,
1699 iph->saddr, th->source,
1700 iph->daddr, th->dest,
1701 skb->dev->ifindex);
1702 if (sk) {
1703 skb->sk = sk;
1704 skb->destructor = sock_edemux;
1705 if (sk->sk_state != TCP_TIME_WAIT) {
1706 struct dst_entry *dst = sk->sk_rx_dst;
1707 if (dst)
1708 dst = dst_check(dst, 0);
1709 if (dst) {
1710 skb_dst_set_noref(skb, dst);
1711 err = 0;
1712 }
1713 }
1714 }
1715
1716out_err:
1717 return err;
1718}
1719
1674/* 1720/*
1675 * From tcp_input.c 1721 * From tcp_input.c
1676 */ 1722 */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index cb015317c9f7..72b7c63b1a39 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -445,6 +445,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
445 struct tcp_sock *oldtp = tcp_sk(sk); 445 struct tcp_sock *oldtp = tcp_sk(sk);
446 struct tcp_cookie_values *oldcvp = oldtp->cookie_values; 446 struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
447 447
448 newsk->sk_rx_dst = dst_clone(skb_dst(skb));
449
448 /* TCP Cookie Transactions require space for the cookie pair, 450 /* TCP Cookie Transactions require space for the cookie pair,
449 * as it differs for each connection. There is no need to 451 * as it differs for each connection. There is no need to
450 * copy any s_data_payload stored at the original socket. 452 * copy any s_data_payload stored at the original socket.