aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Eric Dumazet <eric.dumazet@gmail.com> 2010-04-08 19:03:29 -0400
committer David S. Miller <davem@davemloft.net> 2010-04-13 04:41:33 -0400
commit b6c6712a42ca3f9fa7f4a3d7c40e3a9dd1fd9e03 (patch)
tree 42032b4978874e8ffcf6c851d13324b8c8c7c113
parent 7a161ea92471087a1579239d7a58dd06eaa5601c (diff)
net: sk_dst_cache RCUification
With the latest CONFIG_PROVE_RCU stuff, I felt more comfortable making this work.

sk->sk_dst_cache is currently protected by a rwlock (sk_dst_lock). This rwlock is read-locked for a very small amount of time, and dst entries are already freed after an RCU grace period. This calls for RCU again :)

This patch converts sk_dst_lock to a spinlock and uses RCU for readers.

__sk_dst_get() is supposed to be called with rcu_read_lock() held or with the socket locked by the user, so use the appropriate rcu_dereference_check() condition (rcu_read_lock_held() || sock_owned_by_user(sk)).

This patch avoids two atomic ops per tx packet on connected UDP sockets, for example, and permits sk_dst_lock to be much less dirtied.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- include/net/dst.h 15
-rw-r--r-- include/net/ip6_route.h 4
-rw-r--r-- include/net/sock.h 47
-rw-r--r-- net/core/dev.c 2
-rw-r--r-- net/core/sock.c 8
-rw-r--r-- net/dccp/timer.c 4
-rw-r--r-- net/decnet/af_decnet.c 6
-rw-r--r-- net/ipv4/af_inet.c 2
-rw-r--r-- net/ipv4/tcp_input.c 4
-rw-r--r-- net/ipv4/tcp_timer.c 4
-rw-r--r-- net/ipv6/ipv6_sockglue.c 25
11 files changed, 60 insertions, 61 deletions
diff --git a/include/net/dst.h b/include/net/dst.h
index ce078cda6b74..aac5a5fcfda9 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -225,21 +225,6 @@ static inline void dst_confirm(struct dst_entry *dst)
225 neigh_confirm(dst->neighbour); 225 neigh_confirm(dst->neighbour);
226} 226}
227 227
228static inline void dst_negative_advice(struct dst_entry **dst_p,
229 struct sock *sk)
230{
231 struct dst_entry * dst = *dst_p;
232 if (dst && dst->ops->negative_advice) {
233 *dst_p = dst->ops->negative_advice(dst);
234
235 if (dst != *dst_p) {
236 extern void sk_reset_txq(struct sock *sk);
237
238 sk_reset_txq(sk);
239 }
240 }
241}
242
243static inline void dst_link_failure(struct sk_buff *skb) 228static inline void dst_link_failure(struct sk_buff *skb)
244{ 229{
245 struct dst_entry *dst = skb_dst(skb); 230 struct dst_entry *dst = skb_dst(skb);
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 68f67836e146..278312c95f96 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -152,9 +152,9 @@ static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst,
152static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, 152static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
153 struct in6_addr *daddr, struct in6_addr *saddr) 153 struct in6_addr *daddr, struct in6_addr *saddr)
154{ 154{
155 write_lock(&sk->sk_dst_lock); 155 spin_lock(&sk->sk_dst_lock);
156 __ip6_dst_store(sk, dst, daddr, saddr); 156 __ip6_dst_store(sk, dst, daddr, saddr);
157 write_unlock(&sk->sk_dst_lock); 157 spin_unlock(&sk->sk_dst_lock);
158} 158}
159 159
160static inline int ipv6_unicast_destination(struct sk_buff *skb) 160static inline int ipv6_unicast_destination(struct sk_buff *skb)
diff --git a/include/net/sock.h b/include/net/sock.h
index b4603cd54fcd..56df440a950b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -262,7 +262,7 @@ struct sock {
262#ifdef CONFIG_XFRM 262#ifdef CONFIG_XFRM
263 struct xfrm_policy *sk_policy[2]; 263 struct xfrm_policy *sk_policy[2];
264#endif 264#endif
265 rwlock_t sk_dst_lock; 265 spinlock_t sk_dst_lock;
266 atomic_t sk_rmem_alloc; 266 atomic_t sk_rmem_alloc;
267 atomic_t sk_wmem_alloc; 267 atomic_t sk_wmem_alloc;
268 atomic_t sk_omem_alloc; 268 atomic_t sk_omem_alloc;
@@ -1192,7 +1192,8 @@ extern unsigned long sock_i_ino(struct sock *sk);
1192static inline struct dst_entry * 1192static inline struct dst_entry *
1193__sk_dst_get(struct sock *sk) 1193__sk_dst_get(struct sock *sk)
1194{ 1194{
1195 return sk->sk_dst_cache; 1195 return rcu_dereference_check(sk->sk_dst_cache, rcu_read_lock_held() ||
1196 sock_owned_by_user(sk));
1196} 1197}
1197 1198
1198static inline struct dst_entry * 1199static inline struct dst_entry *
@@ -1200,50 +1201,62 @@ sk_dst_get(struct sock *sk)
1200{ 1201{
1201 struct dst_entry *dst; 1202 struct dst_entry *dst;
1202 1203
1203 read_lock(&sk->sk_dst_lock); 1204 rcu_read_lock();
1204 dst = sk->sk_dst_cache; 1205 dst = rcu_dereference(sk->sk_dst_cache);
1205 if (dst) 1206 if (dst)
1206 dst_hold(dst); 1207 dst_hold(dst);
1207 read_unlock(&sk->sk_dst_lock); 1208 rcu_read_unlock();
1208 return dst; 1209 return dst;
1209} 1210}
1210 1211
1212extern void sk_reset_txq(struct sock *sk);
1213
1214static inline void dst_negative_advice(struct sock *sk)
1215{
1216 struct dst_entry *ndst, *dst = __sk_dst_get(sk);
1217
1218 if (dst && dst->ops->negative_advice) {
1219 ndst = dst->ops->negative_advice(dst);
1220
1221 if (ndst != dst) {
1222 rcu_assign_pointer(sk->sk_dst_cache, ndst);
1223 sk_reset_txq(sk);
1224 }
1225 }
1226}
1227
1211static inline void 1228static inline void
1212__sk_dst_set(struct sock *sk, struct dst_entry *dst) 1229__sk_dst_set(struct sock *sk, struct dst_entry *dst)
1213{ 1230{
1214 struct dst_entry *old_dst; 1231 struct dst_entry *old_dst;
1215 1232
1216 sk_tx_queue_clear(sk); 1233 sk_tx_queue_clear(sk);
1217 old_dst = sk->sk_dst_cache; 1234 old_dst = rcu_dereference_check(sk->sk_dst_cache,
1218 sk->sk_dst_cache = dst; 1235 lockdep_is_held(&sk->sk_dst_lock));
1236 rcu_assign_pointer(sk->sk_dst_cache, dst);
1219 dst_release(old_dst); 1237 dst_release(old_dst);
1220} 1238}
1221 1239
1222static inline void 1240static inline void
1223sk_dst_set(struct sock *sk, struct dst_entry *dst) 1241sk_dst_set(struct sock *sk, struct dst_entry *dst)
1224{ 1242{
1225 write_lock(&sk->sk_dst_lock); 1243 spin_lock(&sk->sk_dst_lock);
1226 __sk_dst_set(sk, dst); 1244 __sk_dst_set(sk, dst);
1227 write_unlock(&sk->sk_dst_lock); 1245 spin_unlock(&sk->sk_dst_lock);
1228} 1246}
1229 1247
1230static inline void 1248static inline void
1231__sk_dst_reset(struct sock *sk) 1249__sk_dst_reset(struct sock *sk)
1232{ 1250{
1233 struct dst_entry *old_dst; 1251 __sk_dst_set(sk, NULL);
1234
1235 sk_tx_queue_clear(sk);
1236 old_dst = sk->sk_dst_cache;
1237 sk->sk_dst_cache = NULL;
1238 dst_release(old_dst);
1239} 1252}
1240 1253
1241static inline void 1254static inline void
1242sk_dst_reset(struct sock *sk) 1255sk_dst_reset(struct sock *sk)
1243{ 1256{
1244 write_lock(&sk->sk_dst_lock); 1257 spin_lock(&sk->sk_dst_lock);
1245 __sk_dst_reset(sk); 1258 __sk_dst_reset(sk);
1246 write_unlock(&sk->sk_dst_lock); 1259 spin_unlock(&sk->sk_dst_lock);
1247} 1260}
1248 1261
1249extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); 1262extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);
diff --git a/net/core/dev.c b/net/core/dev.c
index 0eb79e35671f..ca4cdef74a1b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2015,7 +2015,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2015 if (dev->real_num_tx_queues > 1) 2015 if (dev->real_num_tx_queues > 1)
2016 queue_index = skb_tx_hash(dev, skb); 2016 queue_index = skb_tx_hash(dev, skb);
2017 2017
2018 if (sk && sk->sk_dst_cache) 2018 if (sk && rcu_dereference_check(sk->sk_dst_cache, 1))
2019 sk_tx_queue_set(sk, queue_index); 2019 sk_tx_queue_set(sk, queue_index);
2020 } 2020 }
2021 } 2021 }
diff --git a/net/core/sock.c b/net/core/sock.c
index c5812bbc2cc9..7effa1e689df 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -364,11 +364,11 @@ EXPORT_SYMBOL(sk_reset_txq);
364 364
365struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 365struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
366{ 366{
367 struct dst_entry *dst = sk->sk_dst_cache; 367 struct dst_entry *dst = __sk_dst_get(sk);
368 368
369 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 369 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
370 sk_tx_queue_clear(sk); 370 sk_tx_queue_clear(sk);
371 sk->sk_dst_cache = NULL; 371 rcu_assign_pointer(sk->sk_dst_cache, NULL);
372 dst_release(dst); 372 dst_release(dst);
373 return NULL; 373 return NULL;
374 } 374 }
@@ -1157,7 +1157,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1157 skb_queue_head_init(&newsk->sk_async_wait_queue); 1157 skb_queue_head_init(&newsk->sk_async_wait_queue);
1158#endif 1158#endif
1159 1159
1160 rwlock_init(&newsk->sk_dst_lock); 1160 spin_lock_init(&newsk->sk_dst_lock);
1161 rwlock_init(&newsk->sk_callback_lock); 1161 rwlock_init(&newsk->sk_callback_lock);
1162 lockdep_set_class_and_name(&newsk->sk_callback_lock, 1162 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1163 af_callback_keys + newsk->sk_family, 1163 af_callback_keys + newsk->sk_family,
@@ -1898,7 +1898,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
1898 } else 1898 } else
1899 sk->sk_sleep = NULL; 1899 sk->sk_sleep = NULL;
1900 1900
1901 rwlock_init(&sk->sk_dst_lock); 1901 spin_lock_init(&sk->sk_dst_lock);
1902 rwlock_init(&sk->sk_callback_lock); 1902 rwlock_init(&sk->sk_callback_lock);
1903 lockdep_set_class_and_name(&sk->sk_callback_lock, 1903 lockdep_set_class_and_name(&sk->sk_callback_lock,
1904 af_callback_keys + sk->sk_family, 1904 af_callback_keys + sk->sk_family,
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index bbfeb5eae46a..1a9aa05d4dc4 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -38,7 +38,7 @@ static int dccp_write_timeout(struct sock *sk)
38 38
39 if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { 39 if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
40 if (icsk->icsk_retransmits != 0) 40 if (icsk->icsk_retransmits != 0)
41 dst_negative_advice(&sk->sk_dst_cache, sk); 41 dst_negative_advice(sk);
42 retry_until = icsk->icsk_syn_retries ? 42 retry_until = icsk->icsk_syn_retries ?
43 : sysctl_dccp_request_retries; 43 : sysctl_dccp_request_retries;
44 } else { 44 } else {
@@ -63,7 +63,7 @@ static int dccp_write_timeout(struct sock *sk)
63 Golden words :-). 63 Golden words :-).
64 */ 64 */
65 65
66 dst_negative_advice(&sk->sk_dst_cache, sk); 66 dst_negative_advice(sk);
67 } 67 }
68 68
69 retry_until = sysctl_dccp_retries2; 69 retry_until = sysctl_dccp_retries2;
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 2b494fac9468..55e3b6b0061a 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -446,7 +446,7 @@ static void dn_destruct(struct sock *sk)
446 skb_queue_purge(&scp->other_xmit_queue); 446 skb_queue_purge(&scp->other_xmit_queue);
447 skb_queue_purge(&scp->other_receive_queue); 447 skb_queue_purge(&scp->other_receive_queue);
448 448
449 dst_release(xchg(&sk->sk_dst_cache, NULL)); 449 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
450} 450}
451 451
452static int dn_memory_pressure; 452static int dn_memory_pressure;
@@ -1105,7 +1105,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
1105 release_sock(sk); 1105 release_sock(sk);
1106 1106
1107 dst = skb_dst(skb); 1107 dst = skb_dst(skb);
1108 dst_release(xchg(&newsk->sk_dst_cache, dst)); 1108 sk_dst_set(newsk, dst);
1109 skb_dst_set(skb, NULL); 1109 skb_dst_set(skb, NULL);
1110 1110
1111 DN_SK(newsk)->state = DN_CR; 1111 DN_SK(newsk)->state = DN_CR;
@@ -1956,7 +1956,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
1956 } 1956 }
1957 1957
1958 if ((flags & MSG_TRYHARD) && sk->sk_dst_cache) 1958 if ((flags & MSG_TRYHARD) && sk->sk_dst_cache)
1959 dst_negative_advice(&sk->sk_dst_cache, sk); 1959 dst_negative_advice(sk);
1960 1960
1961 mss = scp->segsize_rem; 1961 mss = scp->segsize_rem;
1962 fctype = scp->services_rem & NSP_FC_MASK; 1962 fctype = scp->services_rem & NSP_FC_MASK;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a0beb32beaa3..193dcd6ed64f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -154,7 +154,7 @@ void inet_sock_destruct(struct sock *sk)
154 WARN_ON(sk->sk_forward_alloc); 154 WARN_ON(sk->sk_forward_alloc);
155 155
156 kfree(inet->opt); 156 kfree(inet->opt);
157 dst_release(sk->sk_dst_cache); 157 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
158 sk_refcnt_debug_dec(sk); 158 sk_refcnt_debug_dec(sk);
159} 159}
160EXPORT_SYMBOL(inet_sock_destruct); 160EXPORT_SYMBOL(inet_sock_destruct);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4000b10610b7..ae3ec15fb630 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3710,7 +3710,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3710 } 3710 }
3711 3711
3712 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) 3712 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
3713 dst_confirm(sk->sk_dst_cache); 3713 dst_confirm(__sk_dst_get(sk));
3714 3714
3715 return 1; 3715 return 1;
3716 3716
@@ -5833,7 +5833,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5833 if (tp->snd_una == tp->write_seq) { 5833 if (tp->snd_una == tp->write_seq) {
5834 tcp_set_state(sk, TCP_FIN_WAIT2); 5834 tcp_set_state(sk, TCP_FIN_WAIT2);
5835 sk->sk_shutdown |= SEND_SHUTDOWN; 5835 sk->sk_shutdown |= SEND_SHUTDOWN;
5836 dst_confirm(sk->sk_dst_cache); 5836 dst_confirm(__sk_dst_get(sk));
5837 5837
5838 if (!sock_flag(sk, SOCK_DEAD)) 5838 if (!sock_flag(sk, SOCK_DEAD))
5839 /* Wake up lingering close() */ 5839 /* Wake up lingering close() */
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 8a0ab2977f1f..c732be00606b 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -172,14 +172,14 @@ static int tcp_write_timeout(struct sock *sk)
172 172
173 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 173 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
174 if (icsk->icsk_retransmits) 174 if (icsk->icsk_retransmits)
175 dst_negative_advice(&sk->sk_dst_cache, sk); 175 dst_negative_advice(sk);
176 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 176 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
177 } else { 177 } else {
178 if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { 178 if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
179 /* Black hole detection */ 179 /* Black hole detection */
180 tcp_mtu_probing(icsk, sk); 180 tcp_mtu_probing(icsk, sk);
181 181
182 dst_negative_advice(&sk->sk_dst_cache, sk); 182 dst_negative_advice(sk);
183 } 183 }
184 184
185 retry_until = sysctl_tcp_retries2; 185 retry_until = sysctl_tcp_retries2;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 33f60fca7aa7..1160400e9dbd 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -114,9 +114,9 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
114 } 114 }
115 opt = xchg(&inet6_sk(sk)->opt, opt); 115 opt = xchg(&inet6_sk(sk)->opt, opt);
116 } else { 116 } else {
117 write_lock(&sk->sk_dst_lock); 117 spin_lock(&sk->sk_dst_lock);
118 opt = xchg(&inet6_sk(sk)->opt, opt); 118 opt = xchg(&inet6_sk(sk)->opt, opt);
119 write_unlock(&sk->sk_dst_lock); 119 spin_unlock(&sk->sk_dst_lock);
120 } 120 }
121 sk_dst_reset(sk); 121 sk_dst_reset(sk);
122 122
@@ -971,14 +971,13 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
971 case IPV6_MTU: 971 case IPV6_MTU:
972 { 972 {
973 struct dst_entry *dst; 973 struct dst_entry *dst;
974
974 val = 0; 975 val = 0;
975 lock_sock(sk); 976 rcu_read_lock();
976 dst = sk_dst_get(sk); 977 dst = __sk_dst_get(sk);
977 if (dst) { 978 if (dst)
978 val = dst_mtu(dst); 979 val = dst_mtu(dst);
979 dst_release(dst); 980 rcu_read_unlock();
980 }
981 release_sock(sk);
982 if (!val) 981 if (!val)
983 return -ENOTCONN; 982 return -ENOTCONN;
984 break; 983 break;
@@ -1066,12 +1065,14 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
1066 else 1065 else
1067 val = np->mcast_hops; 1066 val = np->mcast_hops;
1068 1067
1069 dst = sk_dst_get(sk); 1068 if (val < 0) {
1070 if (dst) { 1069 rcu_read_lock();
1071 if (val < 0) 1070 dst = __sk_dst_get(sk);
1071 if (dst)
1072 val = ip6_dst_hoplimit(dst); 1072 val = ip6_dst_hoplimit(dst);
1073 dst_release(dst); 1073 rcu_read_unlock();
1074 } 1074 }
1075
1075 if (val < 0) 1076 if (val < 0)
1076 val = sock_net(sk)->ipv6.devconf_all->hop_limit; 1077 val = sock_net(sk)->ipv6.devconf_all->hop_limit;
1077 break; 1078 break;