author	Willem de Bruijn <willemb@google.com>	2015-05-12 11:56:48 -0400
committer	David S. Miller <davem@davemloft.net>	2015-05-13 15:43:00 -0400
commit	2ccdbaa6d55b0656244ba57c4b56765a0af76c0a (patch)
tree	7e5c1abd185ea9738aedc982fafae5ae05cb61c8 /net/packet
parent	9954729bc3896998d53040c46a28830a3a3d5063 (diff)
packet: rollover lock contention avoidance
Rollover has to call packet_rcv_has_room on sockets in the fanout group to
find a socket to migrate to. This operation is expensive, especially if the
packet sockets use rings, in which case a lock has to be acquired.

Avoid pounding on the lock by all sockets by temporarily marking a socket as
"under memory pressure" when such pressure is detected. While the flag is
set, only the socket owner may call packet_rcv_has_room on the socket. Once
the owner detects normal conditions, it clears the flag. The socket is not
used as a rollover victim by any other socket in the meantime.

Under reasonably balanced load, each socket writer frequently calls
packet_rcv_has_room and clears its own pressure field. As a backup for when
the socket is rarely written to, also clear the flag on reading
(packet_recvmsg, packet_poll) if this can be done cheaply (i.e., without
calling packet_rcv_has_room). This is only for edge cases.

Tested:
  Ran bench_rollover: a process with 8 sockets in a single fanout group,
  each pinned to a single cpu that receives one nic recv interrupt. RPS
  and RFS are disabled. The benchmark uses packet rx_ring, which has to
  take a lock when determining whether a socket has room.

  Sent 3.5 Mpps of UDP traffic with sufficient entropy to spread uniformly
  across the packet sockets (and inserted an iptables rule to drop in
  PREROUTING to avoid protocol stack processing).

  Without this patch, all sockets try to migrate traffic to neighbors,
  causing lock contention when searching for a non-empty neighbor. The
  lock accounts for the top 9 entries:

  perf record -a -g sleep 5

  -  17.82%  bench_rollover  [kernel.kallsyms]  [k] _raw_spin_lock
     - _raw_spin_lock
        - 99.00% spin_lock
           + 81.77% packet_rcv_has_room.isra.41
           + 18.23% tpacket_rcv
        + 0.84% packet_rcv_has_room.isra.41
  +   5.20%  ksoftirqd/6     [kernel.kallsyms]  [k] _raw_spin_lock
  +   5.15%  ksoftirqd/1     [kernel.kallsyms]  [k] _raw_spin_lock
  +   5.14%  ksoftirqd/2     [kernel.kallsyms]  [k] _raw_spin_lock
  +   5.12%  ksoftirqd/7     [kernel.kallsyms]  [k] _raw_spin_lock
  +   5.12%  ksoftirqd/5     [kernel.kallsyms]  [k] _raw_spin_lock
  +   5.10%  ksoftirqd/4     [kernel.kallsyms]  [k] _raw_spin_lock
  +   4.66%  ksoftirqd/0     [kernel.kallsyms]  [k] _raw_spin_lock
  +   4.45%  ksoftirqd/3     [kernel.kallsyms]  [k] _raw_spin_lock
  +   1.55%  bench_rollover  [kernel.kallsyms]  [k] packet_rcv_has_room.isra.41

  On net-next with this patch, this lock contention is no longer a top
  entry. Most time is spent in the actual read function. Next up are
  other locks:

  +  15.52%  bench_rollover  bench_rollover     [.] reader
  +   4.68%  swapper         [kernel.kallsyms]  [k] memcpy_erms
  +   2.77%  swapper         [kernel.kallsyms]  [k] packet_lookup_frame.isra.51
  +   2.56%  ksoftirqd/1     [kernel.kallsyms]  [k] memcpy_erms
  +   2.16%  swapper         [kernel.kallsyms]  [k] tpacket_rcv
  +   1.93%  swapper         [kernel.kallsyms]  [k] mlx4_en_process_rx_cq

  Looking closer at the remaining _raw_spin_lock, the cost of probing in
  rollover is now comparable to the cost of taking the lock later in
  tpacket_rcv.

  -   1.51%  swapper  [kernel.kallsyms]  [k] _raw_spin_lock
     - _raw_spin_lock
        + 33.41% packet_rcv_has_room
        + 28.15% tpacket_rcv
        + 19.54% enqueue_to_backlog
        +  6.45% __free_pages_ok
        +  2.78% packet_rcv_fanout
        +  2.13% fanout_demux_rollover
        +  2.01% netif_receive_skb_internal

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
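For context, the setup the benchmark exercises looks roughly like the
following minimal userspace sketch. This is not the actual bench_rollover
tool (which is not part of this patch): the group id (42) and protocol
(ETH_P_IP) are arbitrary illustrative choices, plain PACKET_FANOUT_ROLLOVER
is used to keep the sketch short (the benchmark may instead combine a base
mode such as PACKET_FANOUT_CPU with PACKET_FANOUT_FLAG_ROLLOVER), and the
PACKET_RX_RING/mmap setup, CPU pinning and per-socket reader loop are
omitted.

/*
 * Minimal sketch (assumption: not the author's bench_rollover): open
 * eight AF_PACKET sockets and join them to one fanout group in
 * rollover mode.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

#define NUM_SOCKS	8
#define FANOUT_ID	42	/* arbitrary group id (assumption) */

int main(void)
{
	int fds[NUM_SOCKS];
	/* low 16 bits: group id, high 16 bits: fanout mode (and flags) */
	int fanout_arg = FANOUT_ID | (PACKET_FANOUT_ROLLOVER << 16);
	int i;

	for (i = 0; i < NUM_SOCKS; i++) {
		fds[i] = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_IP));
		if (fds[i] < 0) {
			perror("socket");
			exit(1);
		}
		/* every socket passing the same id joins the same group */
		if (setsockopt(fds[i], SOL_PACKET, PACKET_FANOUT,
			       &fanout_arg, sizeof(fanout_arg)) < 0) {
			perror("setsockopt PACKET_FANOUT");
			exit(1);
		}
	}

	/* ...spawn one pinned reader per socket and drain traffic here... */
	sleep(1);

	for (i = 0; i < NUM_SOCKS; i++)
		close(fds[i]);
	return 0;
}

With such a group, traffic that cannot be absorbed by the selected socket
is rolled over to a neighbor, which is exactly the packet_rcv_has_room
probing path this patch makes cheaper.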
Diffstat (limited to 'net/packet')
-rw-r--r--  net/packet/af_packet.c | 38
-rw-r--r--  net/packet/internal.h  |  1
2 files changed, 32 insertions(+), 7 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index ffa67205f698..3a383fd72f82 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1265,14 +1265,14 @@ static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
 	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
 }
 
-static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
 {
 	struct sock *sk = &po->sk;
 	int ret = ROOM_NONE;
 
 	if (po->prot_hook.func != tpacket_rcv) {
 		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
-					  - skb->truesize;
+					  - (skb ? skb->truesize : 0);
 		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
 			return ROOM_NORMAL;
 		else if (avail > 0)
@@ -1281,7 +1281,6 @@ static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
 			return ROOM_NONE;
 	}
 
-	spin_lock(&sk->sk_receive_queue.lock);
 	if (po->tp_version == TPACKET_V3) {
 		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
 			ret = ROOM_NORMAL;
@@ -1293,7 +1292,26 @@ static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
 		else if (__tpacket_has_room(po, 0))
 			ret = ROOM_LOW;
 	}
-	spin_unlock(&sk->sk_receive_queue.lock);
+
+	return ret;
+}
+
+static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+{
+	int ret;
+	bool has_room;
+
+	if (po->prot_hook.func == tpacket_rcv) {
+		spin_lock(&po->sk.sk_receive_queue.lock);
+		ret = __packet_rcv_has_room(po, skb);
+		spin_unlock(&po->sk.sk_receive_queue.lock);
+	} else {
+		ret = __packet_rcv_has_room(po, skb);
+	}
+
+	has_room = ret == ROOM_NORMAL;
+	if (po->pressure == has_room)
+		xchg(&po->pressure, !has_room);
 
 	return ret;
 }
@@ -1362,7 +1380,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
 					  unsigned int idx, bool try_self,
 					  unsigned int num)
 {
-	struct packet_sock *po;
+	struct packet_sock *po, *po_next;
 	unsigned int i, j;
 
 	po = pkt_sk(f->arr[idx]);
@@ -1371,8 +1389,9 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
 
 	i = j = min_t(int, po->rollover->sock, num - 1);
 	do {
-		if (i != idx &&
-		    packet_rcv_has_room(pkt_sk(f->arr[i]), skb) == ROOM_NORMAL) {
+		po_next = pkt_sk(f->arr[i]);
+		if (po_next != po && !po_next->pressure &&
+		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
 			if (i != j)
 				po->rollover->sock = i;
 			return i;
@@ -3000,6 +3019,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	if (skb == NULL)
 		goto out;
 
+	if (pkt_sk(sk)->pressure)
+		packet_rcv_has_room(pkt_sk(sk), NULL);
+
 	if (pkt_sk(sk)->has_vnet_hdr) {
 		struct virtio_net_hdr vnet_hdr = { 0 };
 
@@ -3755,6 +3777,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
 						TP_STATUS_KERNEL))
 			mask |= POLLIN | POLLRDNORM;
 	}
+	if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
+		xchg(&po->pressure, 0);
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
 	spin_lock_bh(&sk->sk_write_queue.lock);
 	if (po->tx_ring.pg_vec) {
diff --git a/net/packet/internal.h b/net/packet/internal.h
index a9d33a28a019..22d7d778c5b7 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -105,6 +105,7 @@ struct packet_sock {
 				auxdata:1,
 				origdev:1,
 				has_vnet_hdr:1;
+	int			pressure;
 	int			ifindex;	/* bound device */
 	__be16			num;
 	struct packet_rollover	*rollover;