Diffstat (limited to 'include/net/sock.h'):
 include/net/sock.h | 195 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 151 insertions(+), 44 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index 092b0551e77f..5697caf8cc76 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -51,6 +51,7 @@
 #include <linux/skbuff.h>	/* struct sk_buff */
 #include <linux/mm.h>
 #include <linux/security.h>
+#include <linux/slab.h>
 
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
@@ -73,7 +74,7 @@
 					printk(KERN_DEBUG msg); } while (0)
 #else
 /* Validate arguments and do nothing */
-static void inline int __attribute__ ((format (printf, 2, 3)))
+static inline void __attribute__ ((format (printf, 2, 3)))
 SOCK_DEBUG(struct sock *sk, const char *msg, ...)
 {
 }
@@ -158,7 +159,7 @@ struct sock_common {
  * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
  * @sk_lock: synchronizer
  * @sk_rcvbuf: size of receive buffer in bytes
- * @sk_sleep: sock wait queue
+ * @sk_wq: sock wait queue and async head
  * @sk_dst_cache: destination cache
  * @sk_dst_lock: destination cache lock
  * @sk_policy: flow policy
@@ -176,6 +177,7 @@ struct sock_common {
  *		   %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
  * @sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets
  * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
+ * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)
  * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
  * @sk_gso_max_size: Maximum GSO segment size to build
  * @sk_lingertime: %SO_LINGER l_linger setting
@@ -197,6 +199,7 @@ struct sock_common {
  * @sk_rcvlowat: %SO_RCVLOWAT setting
  * @sk_rcvtimeo: %SO_RCVTIMEO setting
  * @sk_sndtimeo: %SO_SNDTIMEO setting
+ * @sk_rxhash: flow hash received from netif layer
  * @sk_filter: socket filtering instructions
  * @sk_protinfo: private area, net family specific, when not using slab
  * @sk_timer: sock cleanup timer
@@ -254,14 +257,13 @@ struct sock {
 		struct sk_buff	*head;
 		struct sk_buff	*tail;
 		int		len;
-		int		limit;
 	} sk_backlog;
-	wait_queue_head_t	*sk_sleep;
+	struct socket_wq	*sk_wq;
 	struct dst_entry	*sk_dst_cache;
 #ifdef CONFIG_XFRM
 	struct xfrm_policy	*sk_policy[2];
 #endif
-	rwlock_t		sk_dst_lock;
+	spinlock_t		sk_dst_lock;
 	atomic_t		sk_rmem_alloc;
 	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
@@ -275,9 +277,13 @@ struct sock {
 	int			sk_forward_alloc;
 	gfp_t			sk_allocation;
 	int			sk_route_caps;
+	int			sk_route_nocaps;
 	int			sk_gso_type;
 	unsigned int		sk_gso_max_size;
 	int			sk_rcvlowat;
+#ifdef CONFIG_RPS
+	__u32			sk_rxhash;
+#endif
 	unsigned long		sk_flags;
 	unsigned long		sk_lingertime;
 	struct sk_buff_head	sk_error_queue;
@@ -594,19 +600,32 @@ static inline int sk_stream_memory_free(struct sock *sk)
 /* OOB backlog add */
 static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
-	if (!sk->sk_backlog.tail) {
-		sk->sk_backlog.head = sk->sk_backlog.tail = skb;
-	} else {
+	/* dont let skb dst not refcounted, we are going to leave rcu lock */
+	skb_dst_force(skb);
+
+	if (!sk->sk_backlog.tail)
+		sk->sk_backlog.head = skb;
+	else
 		sk->sk_backlog.tail->next = skb;
-		sk->sk_backlog.tail = skb;
-	}
+
+	sk->sk_backlog.tail = skb;
 	skb->next = NULL;
 }
 
+/*
+ * Take into account size of receive queue and backlog queue
+ */
+static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb)
+{
+	unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);
+
+	return qsize + skb->truesize > sk->sk_rcvbuf;
+}
+
 /* The per-socket spinlock must be held here. */
 static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
-	if (sk->sk_backlog.len >= max(sk->sk_backlog.limit, sk->sk_rcvbuf << 1))
+	if (sk_rcvqueues_full(sk, skb))
 		return -ENOBUFS;
 
 	__sk_add_backlog(sk, skb);
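
As a usage illustration (not part of this header), a protocol's softirq receive handler is expected to combine the new helpers roughly as the UDP/TCP receive paths do; my_proto_rcv() and my_do_rcv() below are hypothetical names, a sketch only:

	/* Hypothetical caller showing the intended early-drop + backlog pattern. */
	static int my_proto_rcv(struct sock *sk, struct sk_buff *skb)
	{
		int rc = 0;

		/* Early drop: rmem + backlog already exceed sk_rcvbuf. */
		if (sk_rcvqueues_full(sk, skb)) {
			kfree_skb(skb);
			return -ENOBUFS;
		}

		bh_lock_sock(sk);
		if (!sock_owned_by_user(sk)) {
			rc = my_do_rcv(sk, skb);	/* process immediately */
		} else if (sk_add_backlog(sk, skb)) {
			kfree_skb(skb);			/* backlog over limit */
			rc = -ENOBUFS;
		}
		bh_unlock_sock(sk);

		return rc;
	}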
@@ -619,6 +638,40 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 	return sk->sk_backlog_rcv(sk, skb);
 }
 
+static inline void sock_rps_record_flow(const struct sock *sk)
+{
+#ifdef CONFIG_RPS
+	struct rps_sock_flow_table *sock_flow_table;
+
+	rcu_read_lock();
+	sock_flow_table = rcu_dereference(rps_sock_flow_table);
+	rps_record_sock_flow(sock_flow_table, sk->sk_rxhash);
+	rcu_read_unlock();
+#endif
+}
+
+static inline void sock_rps_reset_flow(const struct sock *sk)
+{
+#ifdef CONFIG_RPS
+	struct rps_sock_flow_table *sock_flow_table;
+
+	rcu_read_lock();
+	sock_flow_table = rcu_dereference(rps_sock_flow_table);
+	rps_reset_sock_flow(sock_flow_table, sk->sk_rxhash);
+	rcu_read_unlock();
+#endif
+}
+
+static inline void sock_rps_save_rxhash(struct sock *sk, u32 rxhash)
+{
+#ifdef CONFIG_RPS
+	if (unlikely(sk->sk_rxhash != rxhash)) {
+		sock_rps_reset_flow(sk);
+		sk->sk_rxhash = rxhash;
+	}
+#endif
+}
+
 #define sk_wait_event(__sk, __timeo, __condition)			\
 	({	int __rc;						\
 		release_sock(__sk);					\
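
For context, a sketch of where these RPS helpers would typically be called; the real call sites live in protocol code, not in this header, and the function names below are illustrative only. The receive path saves the hash reported by the netif layer, and recvmsg()/sendmsg() record the consuming CPU so Receive Flow Steering can direct later packets there:

	/* Hypothetical protocol hooks showing the intended call sites. */
	static int my_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
	{
		sock_rps_save_rxhash(sk, skb->rxhash);	/* remember the flow hash */
		/* ... queue skb to the socket ... */
		return 0;
	}

	static int my_recvmsg(struct kiocb *iocb, struct socket *sock,
			      struct msghdr *msg, size_t size, int flags)
	{
		sock_rps_record_flow(sock->sk);		/* steer this flow to the consuming CPU */
		/* ... normal receive processing ... */
		return 0;
	}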
@@ -973,6 +1026,16 @@ extern void release_sock(struct sock *sk);
 				SINGLE_DEPTH_NESTING)
 #define bh_unlock_sock(__sk)	spin_unlock(&((__sk)->sk_lock.slock))
 
+static inline void lock_sock_bh(struct sock *sk)
+{
+	spin_lock_bh(&sk->sk_lock.slock);
+}
+
+static inline void unlock_sock_bh(struct sock *sk)
+{
+	spin_unlock_bh(&sk->sk_lock.slock);
+}
+
 extern struct sock		*sk_alloc(struct net *net, int family,
 					  gfp_t priority,
 					  struct proto *prot);
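
A hedged example of the intended use of the new helpers (hypothetical function, patterned on receive-queue cleanup): they take only the socket spinlock, so they are cheaper than lock_sock() when the caller only needs to touch state that the spinlock alone protects:

	static void my_purge_and_reclaim(struct sock *sk, struct sk_buff_head *list)
	{
		lock_sock_bh(sk);		/* spin_lock_bh(&sk->sk_lock.slock) */
		__skb_queue_purge(list);
		sk_mem_reclaim_partial(sk);
		unlock_sock_bh(sk);
	}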
@@ -1159,6 +1222,10 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 	sk->sk_socket = sock;
 }
 
+static inline wait_queue_head_t *sk_sleep(struct sock *sk)
+{
+	return &sk->sk_wq->wait;
+}
 /* Detach socket from process context.
  * Announce socket dead, detach it from wait queue and inode.
  * Note that parent inode held reference count on this struct sock,
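
Callers now go through the sk_sleep() accessor instead of dereferencing sk->sk_sleep directly; a minimal, hypothetical sleeper using it might look like this:

	static void my_wait_for_data(struct sock *sk, long timeo)
	{
		DEFINE_WAIT(wait);

		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (skb_queue_empty(&sk->sk_receive_queue))
			timeo = schedule_timeout(timeo);
		finish_wait(sk_sleep(sk), &wait);
	}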
@@ -1171,14 +1238,14 @@ static inline void sock_orphan(struct sock *sk)
 	write_lock_bh(&sk->sk_callback_lock);
 	sock_set_flag(sk, SOCK_DEAD);
 	sk_set_socket(sk, NULL);
-	sk->sk_sleep = NULL;
+	sk->sk_wq = NULL;
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
 static inline void sock_graft(struct sock *sk, struct socket *parent)
 {
 	write_lock_bh(&sk->sk_callback_lock);
-	sk->sk_sleep = &parent->wait;
+	rcu_assign_pointer(sk->sk_wq, parent->wq);
 	parent->sk = sk;
 	sk_set_socket(sk, parent);
 	security_sock_graft(sk, parent);
@@ -1191,7 +1258,9 @@ extern unsigned long sock_i_ino(struct sock *sk);
 static inline struct dst_entry *
 __sk_dst_get(struct sock *sk)
 {
-	return sk->sk_dst_cache;
+	return rcu_dereference_check(sk->sk_dst_cache, rcu_read_lock_held() ||
+						       sock_owned_by_user(sk) ||
+						       lockdep_is_held(&sk->sk_lock.slock));
 }
 
 static inline struct dst_entry *
@@ -1199,50 +1268,65 @@ sk_dst_get(struct sock *sk)
 {
 	struct dst_entry *dst;
 
-	read_lock(&sk->sk_dst_lock);
-	dst = sk->sk_dst_cache;
+	rcu_read_lock();
+	dst = rcu_dereference(sk->sk_dst_cache);
 	if (dst)
 		dst_hold(dst);
-	read_unlock(&sk->sk_dst_lock);
+	rcu_read_unlock();
 	return dst;
 }
 
+extern void sk_reset_txq(struct sock *sk);
+
+static inline void dst_negative_advice(struct sock *sk)
+{
+	struct dst_entry *ndst, *dst = __sk_dst_get(sk);
+
+	if (dst && dst->ops->negative_advice) {
+		ndst = dst->ops->negative_advice(dst);
+
+		if (ndst != dst) {
+			rcu_assign_pointer(sk->sk_dst_cache, ndst);
+			sk_reset_txq(sk);
+		}
+	}
+}
+
 static inline void
 __sk_dst_set(struct sock *sk, struct dst_entry *dst)
 {
 	struct dst_entry *old_dst;
 
 	sk_tx_queue_clear(sk);
-	old_dst = sk->sk_dst_cache;
-	sk->sk_dst_cache = dst;
+	/*
+	 * This can be called while sk is owned by the caller only,
+	 * with no state that can be checked in a rcu_dereference_check() cond
+	 */
+	old_dst = rcu_dereference_raw(sk->sk_dst_cache);
+	rcu_assign_pointer(sk->sk_dst_cache, dst);
 	dst_release(old_dst);
 }
 
 static inline void
 sk_dst_set(struct sock *sk, struct dst_entry *dst)
 {
-	write_lock(&sk->sk_dst_lock);
+	spin_lock(&sk->sk_dst_lock);
 	__sk_dst_set(sk, dst);
-	write_unlock(&sk->sk_dst_lock);
+	spin_unlock(&sk->sk_dst_lock);
 }
 
 static inline void
 __sk_dst_reset(struct sock *sk)
 {
-	struct dst_entry *old_dst;
-
-	sk_tx_queue_clear(sk);
-	old_dst = sk->sk_dst_cache;
-	sk->sk_dst_cache = NULL;
-	dst_release(old_dst);
+	__sk_dst_set(sk, NULL);
 }
 
 static inline void
 sk_dst_reset(struct sock *sk)
 {
-	write_lock(&sk->sk_dst_lock);
+	spin_lock(&sk->sk_dst_lock);
 	__sk_dst_reset(sk);
-	write_unlock(&sk->sk_dst_lock);
+	spin_unlock(&sk->sk_dst_lock);
 }
 
 extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);
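
With sk_dst_cache now RCU protected, read sides fall into the two patterns below (hypothetical caller, a sketch only): either take a reference via sk_dst_get() under RCU, or own the socket and rely on the pointer that __sk_dst_get() validates with rcu_dereference_check():

	static void my_use_route(struct sock *sk)
	{
		struct dst_entry *dst;

		/* Lockless reader: grab a reference under RCU. */
		dst = sk_dst_get(sk);
		if (dst) {
			/* ... use dst ... */
			dst_release(dst);
		}

		/* Socket owner: the cached pointer stays valid while owned. */
		lock_sock(sk);
		dst = __sk_dst_get(sk);
		/* ... use dst without taking an extra reference ... */
		release_sock(sk);
	}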
@@ -1256,6 +1340,12 @@ static inline int sk_can_gso(const struct sock *sk)
 
 extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst);
 
+static inline void sk_nocaps_add(struct sock *sk, int flags)
+{
+	sk->sk_route_nocaps |= flags;
+	sk->sk_route_caps &= ~flags;
+}
+
 static inline int skb_copy_to_page(struct sock *sk, char __user *from,
 				   struct sk_buff *skb, struct page *page,
 				   int off, int copy)
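
A short illustration (hypothetical helper): a feature that cannot tolerate segmentation offload on one particular socket masks the bits once, and sk_route_nocaps is meant to keep them masked across later route changes:

	static void my_disable_gso(struct sock *sk)
	{
		/* Forbid all GSO-related capabilities for this socket. */
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
	}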
@@ -1313,12 +1403,12 @@ static inline int sk_has_allocations(const struct sock *sk)
 }
 
 /**
- * sk_has_sleeper - check if there are any waiting processes
- * @sk: socket
+ * wq_has_sleeper - check if there are any waiting processes
+ * @sk: struct socket_wq
  *
- * Returns true if socket has waiting processes
+ * Returns true if socket_wq has waiting processes
  *
- * The purpose of the sk_has_sleeper and sock_poll_wait is to wrap the memory
+ * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory
  * barrier call. They were added due to the race found within the tcp code.
  *
  * Consider following tcp code paths:
@@ -1331,9 +1421,10 @@ static inline int sk_has_allocations(const struct sock *sk)
  * ...                 ...
  *   tp->rcv_nxt check   sock_def_readable
  *   ...                 {
- *   schedule               ...
- *                          if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- *                              wake_up_interruptible(sk->sk_sleep)
+ *   schedule               rcu_read_lock();
+ *                          wq = rcu_dereference(sk->sk_wq);
+ *                          if (wq && waitqueue_active(&wq->wait))
+ *                              wake_up_interruptible(&wq->wait)
  *                          ...
  *                       }
  *
@@ -1342,19 +1433,18 @@ static inline int sk_has_allocations(const struct sock *sk)
  * could then endup calling schedule and sleep forever if there are no more
  * data on the socket.
  *
- * The sk_has_sleeper is always called right after a call to read_lock, so we
- * can use smp_mb__after_lock barrier.
  */
-static inline int sk_has_sleeper(struct sock *sk)
+static inline bool wq_has_sleeper(struct socket_wq *wq)
 {
+
 	/*
 	 * We need to be sure we are in sync with the
 	 * add_wait_queue modifications to the wait queue.
 	 *
 	 * This memory barrier is paired in the sock_poll_wait.
 	 */
-	smp_mb__after_lock();
-	return sk->sk_sleep && waitqueue_active(sk->sk_sleep);
+	smp_mb();
+	return wq && waitqueue_active(&wq->wait);
 }
 
 /**
@@ -1363,7 +1453,7 @@ static inline int sk_has_sleeper(struct sock *sk)
  * @wait_address:   socket wait queue
  * @p:              poll_table
  *
- * See the comments in the sk_has_sleeper function.
+ * See the comments in the wq_has_sleeper function.
  */
 static inline void sock_poll_wait(struct file *filp,
 		wait_queue_head_t *wait_address, poll_table *p)
@@ -1374,7 +1464,7 @@ static inline void sock_poll_wait(struct file *filp,
 		 * We need to be sure we are in sync with the
 		 * socket flags modification.
 		 *
-		 * This memory barrier is paired in the sk_has_sleeper.
+		 * This memory barrier is paired in the wq_has_sleeper.
 		 */
 		smp_mb();
 	}
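
The wakeup side that pairs with wq_has_sleeper() and sock_poll_wait() now looks roughly like the path sketched in the comment above; a hypothetical data-ready callback, as a sketch only:

	static void my_data_ready(struct sock *sk, int len)
	{
		struct socket_wq *wq;

		rcu_read_lock();
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))			/* smp_mb() + waitqueue_active() */
			wake_up_interruptible(&wq->wait);
		rcu_read_unlock();
	}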
@@ -1556,7 +1646,24 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
 		sk->sk_stamp = kt;
 }
 
-extern void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb);
+extern void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
+				     struct sk_buff *skb);
+
+static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
+					   struct sk_buff *skb)
+{
+#define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL)			| \
+			   (1UL << SOCK_RCVTSTAMP)			| \
+			   (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)	| \
+			   (1UL << SOCK_TIMESTAMPING_SOFTWARE)		| \
+			   (1UL << SOCK_TIMESTAMPING_RAW_HARDWARE)	| \
+			   (1UL << SOCK_TIMESTAMPING_SYS_HARDWARE))
+
+	if (sk->sk_flags & FLAGS_TS_OR_DROPS)
+		__sock_recv_ts_and_drops(msg, sk, skb);
+	else
+		sk->sk_stamp = skb->tstamp;
+}
 
 /**
  * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
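
For reference, a hedged sketch of the recvmsg() side (hypothetical helper name): every delivered skb is handed to sock_recv_ts_and_drops(), which now falls back to a plain skb->tstamp copy unless one of the timestamping or SOCK_RXQ_OVFL flags is set:

	static int my_recvmsg_finish(struct sock *sk, struct msghdr *msg,
				     struct sk_buff *skb, int copied)
	{
		int err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);

		if (!err)
			sock_recv_ts_and_drops(msg, sk, skb);	/* cmsg timestamps / drop count */
		return err;
	}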