aboutsummaryrefslogtreecommitdiffstats
path: root/net/packet
diff options
context:
space:
mode:
authorWillem de Bruijn <willemb@google.com>2015-05-12 11:56:47 -0400
committerDavid S. Miller <davem@davemloft.net>2015-05-13 15:42:59 -0400
commit9954729bc3896998d53040c46a28830a3a3d5063 (patch)
treee49c1dbf376d7413915ab0f9d630676842f15bcd /net/packet
parent0648ab70afe6c3bf2369a6d779b44a85121c063d (diff)
packet: rollover only to socket with headroom
Only migrate flows to sockets that have sufficient headroom, where sufficient is defined as having at least 25% empty space. The kernel has three different buffer types: a regular socket, a ring with frames (TPACKET_V[12]) or a ring with blocks (TPACKET_V3). The latter two do not expose a read pointer to the kernel, so headroom is not computed easily. All three need a different implementation to estimate free space. Tested: Ran bench_rollover for 10 sec with 1.5 Mpps of single flow input. bench_rollover has as many sockets as there are NIC receive queues in the system. Each socket is owned by a process that is pinned to one of the receive cpus. RFS is disabled. RPS is enabled with an identity mapping (cpu x -> cpu x), to count drops with softnettop. lpbb5:/export/hda3/willemb# ./bench_rollover -r -l 1000 -s Press [Enter] to exit cpu rx rx.k drop.k rollover r.huge r.failed 0 16 16 0 0 0 0 1 21 21 0 0 0 0 2 5227502 5227502 0 0 0 0 3 18 18 0 0 0 0 4 6083289 6083289 0 5227496 0 0 5 22 22 0 0 0 0 6 21 21 0 0 0 0 7 9 9 0 0 0 0 Signed-off-by: Willem de Bruijn <willemb@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/packet')
-rw-r--r--net/packet/af_packet.c76
1 files changed, 59 insertions, 17 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index ad3d9ff56541..ffa67205f698 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1234,27 +1234,68 @@ static void packet_free_pending(struct packet_sock *po)
1234 free_percpu(po->tx_ring.pending_refcnt); 1234 free_percpu(po->tx_ring.pending_refcnt);
1235} 1235}
1236 1236
1237static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) 1237#define ROOM_POW_OFF 2
1238#define ROOM_NONE 0x0
1239#define ROOM_LOW 0x1
1240#define ROOM_NORMAL 0x2
1241
1242static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
1238{ 1243{
1239 struct sock *sk = &po->sk; 1244 int idx, len;
1240 bool has_room; 1245
1246 len = po->rx_ring.frame_max + 1;
1247 idx = po->rx_ring.head;
1248 if (pow_off)
1249 idx += len >> pow_off;
1250 if (idx >= len)
1251 idx -= len;
1252 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1253}
1241 1254
1242 if (po->prot_hook.func != tpacket_rcv) 1255static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1243 return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize) 1256{
1244 <= sk->sk_rcvbuf; 1257 int idx, len;
1258
1259 len = po->rx_ring.prb_bdqc.knum_blocks;
1260 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1261 if (pow_off)
1262 idx += len >> pow_off;
1263 if (idx >= len)
1264 idx -= len;
1265 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1266}
1267
1268static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1269{
1270 struct sock *sk = &po->sk;
1271 int ret = ROOM_NONE;
1272
1273 if (po->prot_hook.func != tpacket_rcv) {
1274 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1275 - skb->truesize;
1276 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1277 return ROOM_NORMAL;
1278 else if (avail > 0)
1279 return ROOM_LOW;
1280 else
1281 return ROOM_NONE;
1282 }
1245 1283
1246 spin_lock(&sk->sk_receive_queue.lock); 1284 spin_lock(&sk->sk_receive_queue.lock);
1247 if (po->tp_version == TPACKET_V3) 1285 if (po->tp_version == TPACKET_V3) {
1248 has_room = prb_lookup_block(po, &po->rx_ring, 1286 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1249 po->rx_ring.prb_bdqc.kactive_blk_num, 1287 ret = ROOM_NORMAL;
1250 TP_STATUS_KERNEL); 1288 else if (__tpacket_v3_has_room(po, 0))
1251 else 1289 ret = ROOM_LOW;
1252 has_room = packet_lookup_frame(po, &po->rx_ring, 1290 } else {
1253 po->rx_ring.head, 1291 if (__tpacket_has_room(po, ROOM_POW_OFF))
1254 TP_STATUS_KERNEL); 1292 ret = ROOM_NORMAL;
1293 else if (__tpacket_has_room(po, 0))
1294 ret = ROOM_LOW;
1295 }
1255 spin_unlock(&sk->sk_receive_queue.lock); 1296 spin_unlock(&sk->sk_receive_queue.lock);
1256 1297
1257 return has_room; 1298 return ret;
1258} 1299}
1259 1300
1260static void packet_sock_destruct(struct sock *sk) 1301static void packet_sock_destruct(struct sock *sk)
@@ -1325,12 +1366,13 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1325 unsigned int i, j; 1366 unsigned int i, j;
1326 1367
1327 po = pkt_sk(f->arr[idx]); 1368 po = pkt_sk(f->arr[idx]);
1328 if (try_self && packet_rcv_has_room(po, skb)) 1369 if (try_self && packet_rcv_has_room(po, skb) != ROOM_NONE)
1329 return idx; 1370 return idx;
1330 1371
1331 i = j = min_t(int, po->rollover->sock, num - 1); 1372 i = j = min_t(int, po->rollover->sock, num - 1);
1332 do { 1373 do {
1333 if (i != idx && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { 1374 if (i != idx &&
1375 packet_rcv_has_room(pkt_sk(f->arr[i]), skb) == ROOM_NORMAL) {
1334 if (i != j) 1376 if (i != j)
1335 po->rollover->sock = i; 1377 po->rollover->sock = i;
1336 return i; 1378 return i;