aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorEric Dumazet <eric.dumazet@gmail.com>2010-04-08 19:03:29 -0400
committerDavid S. Miller <davem@davemloft.net>2010-04-13 04:41:33 -0400
commitb6c6712a42ca3f9fa7f4a3d7c40e3a9dd1fd9e03 (patch)
tree42032b4978874e8ffcf6c851d13324b8c8c7c113 /net
parent7a161ea92471087a1579239d7a58dd06eaa5601c (diff)
net: sk_dst_cache RCUification
With latest CONFIG_PROVE_RCU stuff, I felt more comfortable to make this work. sk->sk_dst_cache is currently protected by a rwlock (sk_dst_lock) This rwlock is readlocked for a very small amount of time, and dst entries are already freed after RCU grace period. This calls for RCU again :) This patch converts sk_dst_lock to a spinlock, and use RCU for readers. __sk_dst_get() is supposed to be called with rcu_read_lock() or if socket locked by user, so use appropriate rcu_dereference_check() condition (rcu_read_lock_held() || sock_owned_by_user(sk)) This patch avoids two atomic ops per tx packet on UDP connected sockets, for example, and permits sk_dst_lock to be much less dirtied. Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/core/dev.c2
-rw-r--r--net/core/sock.c8
-rw-r--r--net/dccp/timer.c4
-rw-r--r--net/decnet/af_decnet.c6
-rw-r--r--net/ipv4/af_inet.c2
-rw-r--r--net/ipv4/tcp_input.c4
-rw-r--r--net/ipv4/tcp_timer.c4
-rw-r--r--net/ipv6/ipv6_sockglue.c25
8 files changed, 28 insertions, 27 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 0eb79e35671f..ca4cdef74a1b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2015,7 +2015,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2015 if (dev->real_num_tx_queues > 1) 2015 if (dev->real_num_tx_queues > 1)
2016 queue_index = skb_tx_hash(dev, skb); 2016 queue_index = skb_tx_hash(dev, skb);
2017 2017
2018 if (sk && sk->sk_dst_cache) 2018 if (sk && rcu_dereference_check(sk->sk_dst_cache, 1))
2019 sk_tx_queue_set(sk, queue_index); 2019 sk_tx_queue_set(sk, queue_index);
2020 } 2020 }
2021 } 2021 }
diff --git a/net/core/sock.c b/net/core/sock.c
index c5812bbc2cc9..7effa1e689df 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -364,11 +364,11 @@ EXPORT_SYMBOL(sk_reset_txq);
364 364
365struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 365struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
366{ 366{
367 struct dst_entry *dst = sk->sk_dst_cache; 367 struct dst_entry *dst = __sk_dst_get(sk);
368 368
369 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 369 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
370 sk_tx_queue_clear(sk); 370 sk_tx_queue_clear(sk);
371 sk->sk_dst_cache = NULL; 371 rcu_assign_pointer(sk->sk_dst_cache, NULL);
372 dst_release(dst); 372 dst_release(dst);
373 return NULL; 373 return NULL;
374 } 374 }
@@ -1157,7 +1157,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1157 skb_queue_head_init(&newsk->sk_async_wait_queue); 1157 skb_queue_head_init(&newsk->sk_async_wait_queue);
1158#endif 1158#endif
1159 1159
1160 rwlock_init(&newsk->sk_dst_lock); 1160 spin_lock_init(&newsk->sk_dst_lock);
1161 rwlock_init(&newsk->sk_callback_lock); 1161 rwlock_init(&newsk->sk_callback_lock);
1162 lockdep_set_class_and_name(&newsk->sk_callback_lock, 1162 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1163 af_callback_keys + newsk->sk_family, 1163 af_callback_keys + newsk->sk_family,
@@ -1898,7 +1898,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
1898 } else 1898 } else
1899 sk->sk_sleep = NULL; 1899 sk->sk_sleep = NULL;
1900 1900
1901 rwlock_init(&sk->sk_dst_lock); 1901 spin_lock_init(&sk->sk_dst_lock);
1902 rwlock_init(&sk->sk_callback_lock); 1902 rwlock_init(&sk->sk_callback_lock);
1903 lockdep_set_class_and_name(&sk->sk_callback_lock, 1903 lockdep_set_class_and_name(&sk->sk_callback_lock,
1904 af_callback_keys + sk->sk_family, 1904 af_callback_keys + sk->sk_family,
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index bbfeb5eae46a..1a9aa05d4dc4 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -38,7 +38,7 @@ static int dccp_write_timeout(struct sock *sk)
38 38
39 if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { 39 if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
40 if (icsk->icsk_retransmits != 0) 40 if (icsk->icsk_retransmits != 0)
41 dst_negative_advice(&sk->sk_dst_cache, sk); 41 dst_negative_advice(sk);
42 retry_until = icsk->icsk_syn_retries ? 42 retry_until = icsk->icsk_syn_retries ?
43 : sysctl_dccp_request_retries; 43 : sysctl_dccp_request_retries;
44 } else { 44 } else {
@@ -63,7 +63,7 @@ static int dccp_write_timeout(struct sock *sk)
63 Golden words :-). 63 Golden words :-).
64 */ 64 */
65 65
66 dst_negative_advice(&sk->sk_dst_cache, sk); 66 dst_negative_advice(sk);
67 } 67 }
68 68
69 retry_until = sysctl_dccp_retries2; 69 retry_until = sysctl_dccp_retries2;
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 2b494fac9468..55e3b6b0061a 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -446,7 +446,7 @@ static void dn_destruct(struct sock *sk)
446 skb_queue_purge(&scp->other_xmit_queue); 446 skb_queue_purge(&scp->other_xmit_queue);
447 skb_queue_purge(&scp->other_receive_queue); 447 skb_queue_purge(&scp->other_receive_queue);
448 448
449 dst_release(xchg(&sk->sk_dst_cache, NULL)); 449 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
450} 450}
451 451
452static int dn_memory_pressure; 452static int dn_memory_pressure;
@@ -1105,7 +1105,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
1105 release_sock(sk); 1105 release_sock(sk);
1106 1106
1107 dst = skb_dst(skb); 1107 dst = skb_dst(skb);
1108 dst_release(xchg(&newsk->sk_dst_cache, dst)); 1108 sk_dst_set(newsk, dst);
1109 skb_dst_set(skb, NULL); 1109 skb_dst_set(skb, NULL);
1110 1110
1111 DN_SK(newsk)->state = DN_CR; 1111 DN_SK(newsk)->state = DN_CR;
@@ -1956,7 +1956,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
1956 } 1956 }
1957 1957
1958 if ((flags & MSG_TRYHARD) && sk->sk_dst_cache) 1958 if ((flags & MSG_TRYHARD) && sk->sk_dst_cache)
1959 dst_negative_advice(&sk->sk_dst_cache, sk); 1959 dst_negative_advice(sk);
1960 1960
1961 mss = scp->segsize_rem; 1961 mss = scp->segsize_rem;
1962 fctype = scp->services_rem & NSP_FC_MASK; 1962 fctype = scp->services_rem & NSP_FC_MASK;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a0beb32beaa3..193dcd6ed64f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -154,7 +154,7 @@ void inet_sock_destruct(struct sock *sk)
154 WARN_ON(sk->sk_forward_alloc); 154 WARN_ON(sk->sk_forward_alloc);
155 155
156 kfree(inet->opt); 156 kfree(inet->opt);
157 dst_release(sk->sk_dst_cache); 157 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
158 sk_refcnt_debug_dec(sk); 158 sk_refcnt_debug_dec(sk);
159} 159}
160EXPORT_SYMBOL(inet_sock_destruct); 160EXPORT_SYMBOL(inet_sock_destruct);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4000b10610b7..ae3ec15fb630 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3710,7 +3710,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3710 } 3710 }
3711 3711
3712 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) 3712 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
3713 dst_confirm(sk->sk_dst_cache); 3713 dst_confirm(__sk_dst_get(sk));
3714 3714
3715 return 1; 3715 return 1;
3716 3716
@@ -5833,7 +5833,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5833 if (tp->snd_una == tp->write_seq) { 5833 if (tp->snd_una == tp->write_seq) {
5834 tcp_set_state(sk, TCP_FIN_WAIT2); 5834 tcp_set_state(sk, TCP_FIN_WAIT2);
5835 sk->sk_shutdown |= SEND_SHUTDOWN; 5835 sk->sk_shutdown |= SEND_SHUTDOWN;
5836 dst_confirm(sk->sk_dst_cache); 5836 dst_confirm(__sk_dst_get(sk));
5837 5837
5838 if (!sock_flag(sk, SOCK_DEAD)) 5838 if (!sock_flag(sk, SOCK_DEAD))
5839 /* Wake up lingering close() */ 5839 /* Wake up lingering close() */
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 8a0ab2977f1f..c732be00606b 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -172,14 +172,14 @@ static int tcp_write_timeout(struct sock *sk)
172 172
173 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 173 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
174 if (icsk->icsk_retransmits) 174 if (icsk->icsk_retransmits)
175 dst_negative_advice(&sk->sk_dst_cache, sk); 175 dst_negative_advice(sk);
176 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 176 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
177 } else { 177 } else {
178 if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { 178 if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
179 /* Black hole detection */ 179 /* Black hole detection */
180 tcp_mtu_probing(icsk, sk); 180 tcp_mtu_probing(icsk, sk);
181 181
182 dst_negative_advice(&sk->sk_dst_cache, sk); 182 dst_negative_advice(sk);
183 } 183 }
184 184
185 retry_until = sysctl_tcp_retries2; 185 retry_until = sysctl_tcp_retries2;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 33f60fca7aa7..1160400e9dbd 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -114,9 +114,9 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
114 } 114 }
115 opt = xchg(&inet6_sk(sk)->opt, opt); 115 opt = xchg(&inet6_sk(sk)->opt, opt);
116 } else { 116 } else {
117 write_lock(&sk->sk_dst_lock); 117 spin_lock(&sk->sk_dst_lock);
118 opt = xchg(&inet6_sk(sk)->opt, opt); 118 opt = xchg(&inet6_sk(sk)->opt, opt);
119 write_unlock(&sk->sk_dst_lock); 119 spin_unlock(&sk->sk_dst_lock);
120 } 120 }
121 sk_dst_reset(sk); 121 sk_dst_reset(sk);
122 122
@@ -971,14 +971,13 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
971 case IPV6_MTU: 971 case IPV6_MTU:
972 { 972 {
973 struct dst_entry *dst; 973 struct dst_entry *dst;
974
974 val = 0; 975 val = 0;
975 lock_sock(sk); 976 rcu_read_lock();
976 dst = sk_dst_get(sk); 977 dst = __sk_dst_get(sk);
977 if (dst) { 978 if (dst)
978 val = dst_mtu(dst); 979 val = dst_mtu(dst);
979 dst_release(dst); 980 rcu_read_unlock();
980 }
981 release_sock(sk);
982 if (!val) 981 if (!val)
983 return -ENOTCONN; 982 return -ENOTCONN;
984 break; 983 break;
@@ -1066,12 +1065,14 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
1066 else 1065 else
1067 val = np->mcast_hops; 1066 val = np->mcast_hops;
1068 1067
1069 dst = sk_dst_get(sk); 1068 if (val < 0) {
1070 if (dst) { 1069 rcu_read_lock();
1071 if (val < 0) 1070 dst = __sk_dst_get(sk);
1071 if (dst)
1072 val = ip6_dst_hoplimit(dst); 1072 val = ip6_dst_hoplimit(dst);
1073 dst_release(dst); 1073 rcu_read_unlock();
1074 } 1074 }
1075
1075 if (val < 0) 1076 if (val < 0)
1076 val = sock_net(sk)->ipv6.devconf_all->hop_limit; 1077 val = sock_net(sk)->ipv6.devconf_all->hop_limit;
1077 break; 1078 break;