about summary refs log tree commit diff stats
path: root/include/net/sock.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/net/sock.h')
-rw-r--r-- include/net/sock.h 192
1 files changed, 149 insertions, 43 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index 1ad6435f252..5697caf8cc7 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -159,7 +159,7 @@ struct sock_common {
159 * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings 159 * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
160 * @sk_lock: synchronizer 160 * @sk_lock: synchronizer
161 * @sk_rcvbuf: size of receive buffer in bytes 161 * @sk_rcvbuf: size of receive buffer in bytes
162 * @sk_sleep: sock wait queue 162 * @sk_wq: sock wait queue and async head
163 * @sk_dst_cache: destination cache 163 * @sk_dst_cache: destination cache
164 * @sk_dst_lock: destination cache lock 164 * @sk_dst_lock: destination cache lock
165 * @sk_policy: flow policy 165 * @sk_policy: flow policy
@@ -177,6 +177,7 @@ struct sock_common {
177 * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings 177 * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
178 * @sk_no_check: %SO_NO_CHECK setting, whether or not checkup packets 178 * @sk_no_check: %SO_NO_CHECK setting, whether or not checkup packets
179 * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) 179 * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
180 * @sk_route_nocaps: forbidden route capabilities (e.g. %NETIF_F_GSO_MASK)
180 * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) 181 * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
181 * @sk_gso_max_size: Maximum GSO segment size to build 182 * @sk_gso_max_size: Maximum GSO segment size to build
182 * @sk_lingertime: %SO_LINGER l_linger setting 183 * @sk_lingertime: %SO_LINGER l_linger setting
@@ -198,6 +199,7 @@ struct sock_common {
198 * @sk_rcvlowat: %SO_RCVLOWAT setting 199 * @sk_rcvlowat: %SO_RCVLOWAT setting
199 * @sk_rcvtimeo: %SO_RCVTIMEO setting 200 * @sk_rcvtimeo: %SO_RCVTIMEO setting
200 * @sk_sndtimeo: %SO_SNDTIMEO setting 201 * @sk_sndtimeo: %SO_SNDTIMEO setting
202 * @sk_rxhash: flow hash received from netif layer
201 * @sk_filter: socket filtering instructions 203 * @sk_filter: socket filtering instructions
202 * @sk_protinfo: private area, net family specific, when not using slab 204 * @sk_protinfo: private area, net family specific, when not using slab
203 * @sk_timer: sock cleanup timer 205 * @sk_timer: sock cleanup timer
@@ -255,14 +257,13 @@ struct sock {
255 struct sk_buff *head; 257 struct sk_buff *head;
256 struct sk_buff *tail; 258 struct sk_buff *tail;
257 int len; 259 int len;
258 int limit;
259 } sk_backlog; 260 } sk_backlog;
260 wait_queue_head_t *sk_sleep; 261 struct socket_wq *sk_wq;
261 struct dst_entry *sk_dst_cache; 262 struct dst_entry *sk_dst_cache;
262#ifdef CONFIG_XFRM 263#ifdef CONFIG_XFRM
263 struct xfrm_policy *sk_policy[2]; 264 struct xfrm_policy *sk_policy[2];
264#endif 265#endif
265 rwlock_t sk_dst_lock; 266 spinlock_t sk_dst_lock;
266 atomic_t sk_rmem_alloc; 267 atomic_t sk_rmem_alloc;
267 atomic_t sk_wmem_alloc; 268 atomic_t sk_wmem_alloc;
268 atomic_t sk_omem_alloc; 269 atomic_t sk_omem_alloc;
@@ -276,9 +277,13 @@ struct sock {
276 int sk_forward_alloc; 277 int sk_forward_alloc;
277 gfp_t sk_allocation; 278 gfp_t sk_allocation;
278 int sk_route_caps; 279 int sk_route_caps;
280 int sk_route_nocaps;
279 int sk_gso_type; 281 int sk_gso_type;
280 unsigned int sk_gso_max_size; 282 unsigned int sk_gso_max_size;
281 int sk_rcvlowat; 283 int sk_rcvlowat;
284#ifdef CONFIG_RPS
285 __u32 sk_rxhash;
286#endif
282 unsigned long sk_flags; 287 unsigned long sk_flags;
283 unsigned long sk_lingertime; 288 unsigned long sk_lingertime;
284 struct sk_buff_head sk_error_queue; 289 struct sk_buff_head sk_error_queue;
@@ -595,19 +600,32 @@ static inline int sk_stream_memory_free(struct sock *sk)
595/* OOB backlog add */ 600/* OOB backlog add */
596static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) 601static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
597{ 602{
598 if (!sk->sk_backlog.tail) { 603 /* don't leave the skb dst without a refcount: we are going to leave the rcu lock */
599 sk->sk_backlog.head = sk->sk_backlog.tail = skb; 604 skb_dst_force(skb);
600 } else { 605
606 if (!sk->sk_backlog.tail)
607 sk->sk_backlog.head = skb;
608 else
601 sk->sk_backlog.tail->next = skb; 609 sk->sk_backlog.tail->next = skb;
602 sk->sk_backlog.tail = skb; 610
603 } 611 sk->sk_backlog.tail = skb;
604 skb->next = NULL; 612 skb->next = NULL;
605} 613}
606 614
615/*
616 * Take into account size of receive queue and backlog queue
617 */
618static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb)
619{
620 unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);
621
622 return qsize + skb->truesize > sk->sk_rcvbuf;
623}
624
607/* The per-socket spinlock must be held here. */ 625/* The per-socket spinlock must be held here. */
608static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb) 626static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb)
609{ 627{
610 if (sk->sk_backlog.len >= max(sk->sk_backlog.limit, sk->sk_rcvbuf << 1)) 628 if (sk_rcvqueues_full(sk, skb))
611 return -ENOBUFS; 629 return -ENOBUFS;
612 630
613 __sk_add_backlog(sk, skb); 631 __sk_add_backlog(sk, skb);
@@ -620,6 +638,40 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
620 return sk->sk_backlog_rcv(sk, skb); 638 return sk->sk_backlog_rcv(sk, skb);
621} 639}
622 640
641static inline void sock_rps_record_flow(const struct sock *sk)
642{
643#ifdef CONFIG_RPS
644 struct rps_sock_flow_table *sock_flow_table;
645
646 rcu_read_lock();
647 sock_flow_table = rcu_dereference(rps_sock_flow_table);
648 rps_record_sock_flow(sock_flow_table, sk->sk_rxhash);
649 rcu_read_unlock();
650#endif
651}
652
653static inline void sock_rps_reset_flow(const struct sock *sk)
654{
655#ifdef CONFIG_RPS
656 struct rps_sock_flow_table *sock_flow_table;
657
658 rcu_read_lock();
659 sock_flow_table = rcu_dereference(rps_sock_flow_table);
660 rps_reset_sock_flow(sock_flow_table, sk->sk_rxhash);
661 rcu_read_unlock();
662#endif
663}
664
665static inline void sock_rps_save_rxhash(struct sock *sk, u32 rxhash)
666{
667#ifdef CONFIG_RPS
668 if (unlikely(sk->sk_rxhash != rxhash)) {
669 sock_rps_reset_flow(sk);
670 sk->sk_rxhash = rxhash;
671 }
672#endif
673}
674
623#define sk_wait_event(__sk, __timeo, __condition) \ 675#define sk_wait_event(__sk, __timeo, __condition) \
624 ({ int __rc; \ 676 ({ int __rc; \
625 release_sock(__sk); \ 677 release_sock(__sk); \
@@ -974,6 +1026,16 @@ extern void release_sock(struct sock *sk);
974 SINGLE_DEPTH_NESTING) 1026 SINGLE_DEPTH_NESTING)
975#define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) 1027#define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock))
976 1028
1029static inline void lock_sock_bh(struct sock *sk)
1030{
1031 spin_lock_bh(&sk->sk_lock.slock);
1032}
1033
1034static inline void unlock_sock_bh(struct sock *sk)
1035{
1036 spin_unlock_bh(&sk->sk_lock.slock);
1037}
1038
977extern struct sock *sk_alloc(struct net *net, int family, 1039extern struct sock *sk_alloc(struct net *net, int family,
978 gfp_t priority, 1040 gfp_t priority,
979 struct proto *prot); 1041 struct proto *prot);
@@ -1160,6 +1222,10 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock)
1160 sk->sk_socket = sock; 1222 sk->sk_socket = sock;
1161} 1223}
1162 1224
1225static inline wait_queue_head_t *sk_sleep(struct sock *sk)
1226{
1227 return &sk->sk_wq->wait;
1228}
1163/* Detach socket from process context. 1229/* Detach socket from process context.
1164 * Announce socket dead, detach it from wait queue and inode. 1230 * Announce socket dead, detach it from wait queue and inode.
1165 * Note that parent inode held reference count on this struct sock, 1231 * Note that parent inode held reference count on this struct sock,
@@ -1172,14 +1238,14 @@ static inline void sock_orphan(struct sock *sk)
1172 write_lock_bh(&sk->sk_callback_lock); 1238 write_lock_bh(&sk->sk_callback_lock);
1173 sock_set_flag(sk, SOCK_DEAD); 1239 sock_set_flag(sk, SOCK_DEAD);
1174 sk_set_socket(sk, NULL); 1240 sk_set_socket(sk, NULL);
1175 sk->sk_sleep = NULL; 1241 sk->sk_wq = NULL;
1176 write_unlock_bh(&sk->sk_callback_lock); 1242 write_unlock_bh(&sk->sk_callback_lock);
1177} 1243}
1178 1244
1179static inline void sock_graft(struct sock *sk, struct socket *parent) 1245static inline void sock_graft(struct sock *sk, struct socket *parent)
1180{ 1246{
1181 write_lock_bh(&sk->sk_callback_lock); 1247 write_lock_bh(&sk->sk_callback_lock);
1182 sk->sk_sleep = &parent->wait; 1248 rcu_assign_pointer(sk->sk_wq, parent->wq);
1183 parent->sk = sk; 1249 parent->sk = sk;
1184 sk_set_socket(sk, parent); 1250 sk_set_socket(sk, parent);
1185 security_sock_graft(sk, parent); 1251 security_sock_graft(sk, parent);
@@ -1192,7 +1258,9 @@ extern unsigned long sock_i_ino(struct sock *sk);
1192static inline struct dst_entry * 1258static inline struct dst_entry *
1193__sk_dst_get(struct sock *sk) 1259__sk_dst_get(struct sock *sk)
1194{ 1260{
1195 return sk->sk_dst_cache; 1261 return rcu_dereference_check(sk->sk_dst_cache, rcu_read_lock_held() ||
1262 sock_owned_by_user(sk) ||
1263 lockdep_is_held(&sk->sk_lock.slock));
1196} 1264}
1197 1265
1198static inline struct dst_entry * 1266static inline struct dst_entry *
@@ -1200,50 +1268,65 @@ sk_dst_get(struct sock *sk)
1200{ 1268{
1201 struct dst_entry *dst; 1269 struct dst_entry *dst;
1202 1270
1203 read_lock(&sk->sk_dst_lock); 1271 rcu_read_lock();
1204 dst = sk->sk_dst_cache; 1272 dst = rcu_dereference(sk->sk_dst_cache);
1205 if (dst) 1273 if (dst)
1206 dst_hold(dst); 1274 dst_hold(dst);
1207 read_unlock(&sk->sk_dst_lock); 1275 rcu_read_unlock();
1208 return dst; 1276 return dst;
1209} 1277}
1210 1278
1279extern void sk_reset_txq(struct sock *sk);
1280
1281static inline void dst_negative_advice(struct sock *sk)
1282{
1283 struct dst_entry *ndst, *dst = __sk_dst_get(sk);
1284
1285 if (dst && dst->ops->negative_advice) {
1286 ndst = dst->ops->negative_advice(dst);
1287
1288 if (ndst != dst) {
1289 rcu_assign_pointer(sk->sk_dst_cache, ndst);
1290 sk_reset_txq(sk);
1291 }
1292 }
1293}
1294
1211static inline void 1295static inline void
1212__sk_dst_set(struct sock *sk, struct dst_entry *dst) 1296__sk_dst_set(struct sock *sk, struct dst_entry *dst)
1213{ 1297{
1214 struct dst_entry *old_dst; 1298 struct dst_entry *old_dst;
1215 1299
1216 sk_tx_queue_clear(sk); 1300 sk_tx_queue_clear(sk);
1217 old_dst = sk->sk_dst_cache; 1301 /*
1218 sk->sk_dst_cache = dst; 1302 * This can be called while sk is owned by the caller only,
1303 * with no state that can be checked in a rcu_dereference_check() cond
1304 */
1305 old_dst = rcu_dereference_raw(sk->sk_dst_cache);
1306 rcu_assign_pointer(sk->sk_dst_cache, dst);
1219 dst_release(old_dst); 1307 dst_release(old_dst);
1220} 1308}
1221 1309
1222static inline void 1310static inline void
1223sk_dst_set(struct sock *sk, struct dst_entry *dst) 1311sk_dst_set(struct sock *sk, struct dst_entry *dst)
1224{ 1312{
1225 write_lock(&sk->sk_dst_lock); 1313 spin_lock(&sk->sk_dst_lock);
1226 __sk_dst_set(sk, dst); 1314 __sk_dst_set(sk, dst);
1227 write_unlock(&sk->sk_dst_lock); 1315 spin_unlock(&sk->sk_dst_lock);
1228} 1316}
1229 1317
1230static inline void 1318static inline void
1231__sk_dst_reset(struct sock *sk) 1319__sk_dst_reset(struct sock *sk)
1232{ 1320{
1233 struct dst_entry *old_dst; 1321 __sk_dst_set(sk, NULL);
1234
1235 sk_tx_queue_clear(sk);
1236 old_dst = sk->sk_dst_cache;
1237 sk->sk_dst_cache = NULL;
1238 dst_release(old_dst);
1239} 1322}
1240 1323
1241static inline void 1324static inline void
1242sk_dst_reset(struct sock *sk) 1325sk_dst_reset(struct sock *sk)
1243{ 1326{
1244 write_lock(&sk->sk_dst_lock); 1327 spin_lock(&sk->sk_dst_lock);
1245 __sk_dst_reset(sk); 1328 __sk_dst_reset(sk);
1246 write_unlock(&sk->sk_dst_lock); 1329 spin_unlock(&sk->sk_dst_lock);
1247} 1330}
1248 1331
1249extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); 1332extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);
@@ -1257,6 +1340,12 @@ static inline int sk_can_gso(const struct sock *sk)
1257 1340
1258extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst); 1341extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst);
1259 1342
1343static inline void sk_nocaps_add(struct sock *sk, int flags)
1344{
1345 sk->sk_route_nocaps |= flags;
1346 sk->sk_route_caps &= ~flags;
1347}
1348
1260static inline int skb_copy_to_page(struct sock *sk, char __user *from, 1349static inline int skb_copy_to_page(struct sock *sk, char __user *from,
1261 struct sk_buff *skb, struct page *page, 1350 struct sk_buff *skb, struct page *page,
1262 int off, int copy) 1351 int off, int copy)
@@ -1314,12 +1403,12 @@ static inline int sk_has_allocations(const struct sock *sk)
1314} 1403}
1315 1404
1316/** 1405/**
1317 * sk_has_sleeper - check if there are any waiting processes 1406 * wq_has_sleeper - check if there are any waiting processes
1318 * @sk: socket 1407 * @sk: struct socket_wq
1319 * 1408 *
1320 * Returns true if socket has waiting processes 1409 * Returns true if socket_wq has waiting processes
1321 * 1410 *
1322 * The purpose of the sk_has_sleeper and sock_poll_wait is to wrap the memory 1411 * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory
1323 * barrier call. They were added due to the race found within the tcp code. 1412 * barrier call. They were added due to the race found within the tcp code.
1324 * 1413 *
1325 * Consider the following tcp code paths: 1414 * Consider the following tcp code paths:
@@ -1332,9 +1421,10 @@ static inline int sk_has_allocations(const struct sock *sk)
1332 * ... ... 1421 * ... ...
1333 * tp->rcv_nxt check sock_def_readable 1422 * tp->rcv_nxt check sock_def_readable
1334 * ... { 1423 * ... {
1335 * schedule ... 1424 * schedule rcu_read_lock();
1336 * if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 1425 * wq = rcu_dereference(sk->sk_wq);
1337 * wake_up_interruptible(sk->sk_sleep) 1426 * if (wq && waitqueue_active(&wq->wait))
1427 * wake_up_interruptible(&wq->wait)
1338 * ... 1428 * ...
1339 * } 1429 * }
1340 * 1430 *
@@ -1343,19 +1433,18 @@ static inline int sk_has_allocations(const struct sock *sk)
1343 * could then end up calling schedule and sleep forever if there is no more 1433 * could then end up calling schedule and sleep forever if there is no more
1344 * data on the socket. 1434 * data on the socket.
1345 * 1435 *
1346 * The sk_has_sleeper is always called right after a call to read_lock, so we
1347 * can use smp_mb__after_lock barrier.
1348 */ 1436 */
1349static inline int sk_has_sleeper(struct sock *sk) 1437static inline bool wq_has_sleeper(struct socket_wq *wq)
1350{ 1438{
1439
1351 /* 1440 /*
1352 * We need to be sure we are in sync with the 1441 * We need to be sure we are in sync with the
1353 * add_wait_queue modifications to the wait queue. 1442 * add_wait_queue modifications to the wait queue.
1354 * 1443 *
1355 * This memory barrier is paired in the sock_poll_wait. 1444 * This memory barrier is paired in the sock_poll_wait.
1356 */ 1445 */
1357 smp_mb__after_lock(); 1446 smp_mb();
1358 return sk->sk_sleep && waitqueue_active(sk->sk_sleep); 1447 return wq && waitqueue_active(&wq->wait);
1359} 1448}
1360 1449
1361/** 1450/**
@@ -1364,7 +1453,7 @@ static inline int sk_has_sleeper(struct sock *sk)
1364 * @wait_address: socket wait queue 1453 * @wait_address: socket wait queue
1365 * @p: poll_table 1454 * @p: poll_table
1366 * 1455 *
1367 * See the comments in the sk_has_sleeper function. 1456 * See the comments in the wq_has_sleeper function.
1368 */ 1457 */
1369static inline void sock_poll_wait(struct file *filp, 1458static inline void sock_poll_wait(struct file *filp,
1370 wait_queue_head_t *wait_address, poll_table *p) 1459 wait_queue_head_t *wait_address, poll_table *p)
@@ -1375,7 +1464,7 @@ static inline void sock_poll_wait(struct file *filp,
1375 * We need to be sure we are in sync with the 1464 * We need to be sure we are in sync with the
1376 * socket flags modification. 1465 * socket flags modification.
1377 * 1466 *
1378 * This memory barrier is paired in the sk_has_sleeper. 1467 * This memory barrier is paired in the wq_has_sleeper.
1379 */ 1468 */
1380 smp_mb(); 1469 smp_mb();
1381 } 1470 }
@@ -1557,7 +1646,24 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
1557 sk->sk_stamp = kt; 1646 sk->sk_stamp = kt;
1558} 1647}
1559 1648
1560extern void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); 1649extern void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
1650 struct sk_buff *skb);
1651
1652static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
1653 struct sk_buff *skb)
1654{
1655#define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \
1656 (1UL << SOCK_RCVTSTAMP) | \
1657 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
1658 (1UL << SOCK_TIMESTAMPING_SOFTWARE) | \
1659 (1UL << SOCK_TIMESTAMPING_RAW_HARDWARE) | \
1660 (1UL << SOCK_TIMESTAMPING_SYS_HARDWARE))
1661
1662 if (sk->sk_flags & FLAGS_TS_OR_DROPS)
1663 __sock_recv_ts_and_drops(msg, sk, skb);
1664 else
1665 sk->sk_stamp = skb->tstamp;
1666}
1561 1667
1562/** 1668/**
1563 * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped 1669 * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped