author		Willem de Bruijn <willemb@google.com>	2015-05-12 11:56:47 -0400
committer	David S. Miller <davem@davemloft.net>	2015-05-13 15:42:59 -0400
commit		9954729bc3896998d53040c46a28830a3a3d5063 (patch)
tree		e49c1dbf376d7413915ab0f9d630676842f15bcd /net/packet
parent		0648ab70afe6c3bf2369a6d779b44a85121c063d (diff)
packet: rollover only to socket with headroom
Only migrate flows to sockets that have sufficient headroom, where
sufficient is defined as having at least 25% empty space.
The kernel has three different buffer types: a regular socket, a ring
with frames (TPACKET_V[12]) or a ring with blocks (TPACKET_V3). The
latter two do not expose a read pointer to the kernel, so headroom is
not computed easily. Each of the three therefore needs its own
implementation to estimate free space.
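
As a rough illustration of the 25% rule for the regular-socket case,
consider the standalone sketch below. The ROOM_* constants and the
avail arithmetic mirror the patch; room_classify() and the example
buffer sizes are hypothetical stand-ins for the in-kernel code, which
reads sk_rcvbuf and sk_rmem_alloc directly.

/* Sketch (not kernel code): classify receive-buffer headroom the way
 * packet_rcv_has_room() does for regular sockets. Room is "normal"
 * only if more than 25% (rcvbuf >> 2) of the buffer stays free after
 * queuing this packet. */
#include <stdio.h>

#define ROOM_POW_OFF 2		/* 25% == rcvbuf >> 2 */
#define ROOM_NONE    0x0
#define ROOM_LOW     0x1
#define ROOM_NORMAL  0x2

static int room_classify(int rcvbuf, int rmem_alloc, int truesize)
{
	int avail = rcvbuf - rmem_alloc - truesize;

	if (avail > (rcvbuf >> ROOM_POW_OFF))
		return ROOM_NORMAL;
	else if (avail > 0)
		return ROOM_LOW;
	return ROOM_NONE;
}

int main(void)
{
	/* 64 KB buffer, 40 KB queued, 2 KB packet: 22 KB free, above
	 * the 16 KB threshold -> ROOM_NORMAL (2). */
	printf("%d\n", room_classify(65536, 40960, 2048));
	/* 60 KB queued leaves only 2 KB free -> ROOM_LOW (1). */
	printf("%d\n", room_classify(65536, 61440, 2048));
	return 0;
}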
Tested:
Ran bench_rollover for 10 sec with 1.5 Mpps of single flow input.
bench_rollover has as many sockets as there are NIC receive queues
in the system. Each socket is owned by a process that is pinned to
one of the receive cpus. RFS is disabled. RPS is enabled with an
identity mapping (cpu x -> cpu x), to count drops with softnettop.
lpbb5:/export/hda3/willemb# ./bench_rollover -r -l 1000 -s
Press [Enter] to exit
cpu rx rx.k drop.k rollover r.huge r.failed
0 16 16 0 0 0 0
1 21 21 0 0 0 0
2 5227502 5227502 0 0 0 0
3 18 18 0 0 0 0
4 6083289 6083289 0 5227496 0 0
5 22 22 0 0 0 0
6 21 21 0 0 0 0
7 9 9 0 0 0 0
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/packet')
-rw-r--r--	net/packet/af_packet.c	76
1 files changed, 59 insertions, 17 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index ad3d9ff56541..ffa67205f698 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1234,27 +1234,68 @@ static void packet_free_pending(struct packet_sock *po)
 	free_percpu(po->tx_ring.pending_refcnt);
 }
 
-static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+#define ROOM_POW_OFF	2
+#define ROOM_NONE	0x0
+#define ROOM_LOW	0x1
+#define ROOM_NORMAL	0x2
+
+static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
 {
-	struct sock *sk = &po->sk;
-	bool has_room;
+	int idx, len;
+
+	len = po->rx_ring.frame_max + 1;
+	idx = po->rx_ring.head;
+	if (pow_off)
+		idx += len >> pow_off;
+	if (idx >= len)
+		idx -= len;
+	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
+}
 
-	if (po->prot_hook.func != tpacket_rcv)
-		return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
-			<= sk->sk_rcvbuf;
+static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
+{
+	int idx, len;
+
+	len = po->rx_ring.prb_bdqc.knum_blocks;
+	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
+	if (pow_off)
+		idx += len >> pow_off;
+	if (idx >= len)
+		idx -= len;
+	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
+}
+
+static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+{
+	struct sock *sk = &po->sk;
+	int ret = ROOM_NONE;
+
+	if (po->prot_hook.func != tpacket_rcv) {
+		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
+					  - skb->truesize;
+		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
+			return ROOM_NORMAL;
+		else if (avail > 0)
+			return ROOM_LOW;
+		else
+			return ROOM_NONE;
+	}
 
 	spin_lock(&sk->sk_receive_queue.lock);
-	if (po->tp_version == TPACKET_V3)
-		has_room = prb_lookup_block(po, &po->rx_ring,
-					    po->rx_ring.prb_bdqc.kactive_blk_num,
-					    TP_STATUS_KERNEL);
-	else
-		has_room = packet_lookup_frame(po, &po->rx_ring,
-					       po->rx_ring.head,
-					       TP_STATUS_KERNEL);
+	if (po->tp_version == TPACKET_V3) {
+		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
+			ret = ROOM_NORMAL;
+		else if (__tpacket_v3_has_room(po, 0))
+			ret = ROOM_LOW;
+	} else {
+		if (__tpacket_has_room(po, ROOM_POW_OFF))
+			ret = ROOM_NORMAL;
+		else if (__tpacket_has_room(po, 0))
+			ret = ROOM_LOW;
+	}
 	spin_unlock(&sk->sk_receive_queue.lock);
 
-	return has_room;
+	return ret;
 }
 
 static void packet_sock_destruct(struct sock *sk)
@@ -1325,12 +1366,13 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
 	unsigned int i, j;
 
 	po = pkt_sk(f->arr[idx]);
-	if (try_self && packet_rcv_has_room(po, skb))
+	if (try_self && packet_rcv_has_room(po, skb) != ROOM_NONE)
 		return idx;
 
 	i = j = min_t(int, po->rollover->sock, num - 1);
 	do {
-		if (i != idx && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
+		if (i != idx &&
+		    packet_rcv_has_room(pkt_sk(f->arr[i]), skb) == ROOM_NORMAL) {
 			if (i != j)
 				po->rollover->sock = i;
 			return i;
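
The ring buffers expose no read pointer, so the patch estimates
headroom differently: it probes the frame (or block) len >> pow_off
slots ahead of the current position and treats TP_STATUS_KERNEL there
as proof that at least a quarter of the ring is still free. A toy
sketch of that wraparound probe, assuming a fixed-size ring with
frame_free[] standing in for packet_lookup_frame()/prb_lookup_block()
(the harness is hypothetical):

#include <stdbool.h>
#include <stdio.h>

#define RING_LEN 16

/* 1 == slot still owned by the kernel (free), 0 == owned by userspace. */
static int frame_free[RING_LEN];

/* Sketch (not kernel code) of __tpacket_has_room(): look len >> pow_off
 * slots past head, wrapping at the end of the ring. */
static bool ring_has_room(int head, int pow_off)
{
	int idx = head;

	if (pow_off)
		idx += RING_LEN >> pow_off;
	if (idx >= RING_LEN)
		idx -= RING_LEN;	/* wrap around */
	return frame_free[idx];
}

int main(void)
{
	/* Slots 0..11 free, 12..15 still held by userspace. */
	for (int i = 0; i < RING_LEN; i++)
		frame_free[i] = (i < 12);

	printf("normal: %d\n", ring_has_room(10, 2));	/* slot 14 busy -> 0 */
	printf("low:    %d\n", ring_has_room(10, 0));	/* slot 10 free -> 1 */
	printf("wrap:   %d\n", ring_has_room(14, 2));	/* 14+4-16=2 free -> 1 */
	return 0;
}

With pow_off == 2 the probe lands RING_LEN / 4 slots ahead, so a single
lookup answers "is at least 25% of the ring free?" without walking the
ring.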