path: root/net/core/drop_monitor.c
author	David S. Miller <davem@davemloft.net>	2019-08-17 15:40:09 -0400
committer	David S. Miller <davem@davemloft.net>	2019-08-17 15:40:09 -0400
commit	83beee5a3aff0fb159b2fb4d0cac8f18a193417e (patch)
tree	ce77ccefee1384488408d9b9e49e2148359f30d9 /net/core/drop_monitor.c
parent	f77508308fa76d0efc60ebf3c906f467feb062cb (diff)
parent	95766451bfb82f972bf3fea93fc6e91a904cf624 (diff)
Merge branch 'drop_monitor-for-offloaded-paths'
Ido Schimmel says:

====================
Add drop monitor for offloaded data paths

Users have several ways to debug the kernel and understand why a packet
was dropped. For example, using drop monitor and perf. Both utilities
trace kfree_skb(), which is the function called when a packet is freed
as part of a failure. The information provided by these tools is
invaluable when trying to understand the cause of a packet loss.

In recent years, large portions of the kernel data path were offloaded
to capable devices. Today, it is possible to perform L2 and L3
forwarding in hardware, as well as tunneling (IP-in-IP and VXLAN).
Different TC classifiers and actions are also offloaded to capable
devices, at both ingress and egress.

However, when the data path is offloaded it is not possible to achieve
the same level of introspection, since packets are dropped by the
underlying device and never reach the kernel.

This patchset aims to solve this by allowing users to monitor packets
that the underlying device decided to drop, along with relevant
metadata such as the drop reason and ingress port.

The above is achieved by exposing a fundamental capability of devices
capable of data path offloading - packet trapping. In much the same way
as drop monitor registers its probe function with the kfree_skb()
tracepoint, the device is instructed to pass to the CPU (trap) packets
that it decided to drop in various places in the pipeline.

The configuration of the device to pass such packets to the CPU is
performed using devlink, as it is not specific to a port, but rather to
a device. In the future, we plan to control the policing of such
packets using devlink, in order not to overwhelm the CPU.

While devlink is used as the control path, the dropped packets are
passed along with metadata to drop monitor, which reports them to
userspace as netlink events. This allows users to use the same
interface for the monitoring of both software and hardware drops.

Logically, the solution looks as follows:

                                 Netlink event: Packet w/ metadata
                                                Or a summary of recent drops
                                       ^
                                       |
          Userspace                    |
         +---------------------------------------------------+
          Kernel                       |
                               +-------+--------+
                               |                |
                               |  drop_monitor  |
                               |                |
                               +-------^--------+
                                       |
                                       |
                                  +----+----+
                                  |         |      Kernel's Rx path
                                  | devlink |      (non-drop traps)
                                  |         |
                                  +----^----+           ^
                                       |                |
                                       +-----------+    |
                                                   |    |
                                           +-------+-------+
                                           |               |
                                           | Device driver |
                                           |               |
                                           +-------^-------+
          Kernel                                   |
         +---------------------------------------------------+
          Hardware                                 |
                                                   | Trapped packet
                                                   |
                                                +--+---+
                                                |      |
                                                | ASIC |
                                                |      |
                                                +------+

In order to reduce the patch count, this patchset only includes
integration with netdevsim. A follow-up patchset will add devlink-trap
support in mlxsw.

Patches #1-#7 extend drop monitor to also monitor hardware originated
drops.

Patches #8-#10 add the devlink-trap infrastructure.

Patches #11-#12 add devlink-trap support in netdevsim.

Patches #13-#16 add tests for the generic infrastructure over
netdevsim.
Example
=======

Instantiate netdevsim
---------------------

List supported traps
--------------------

netdevsim/netdevsim10:
  name source_mac_is_multicast type drop generic true action drop group l2_drops
  name vlan_tag_mismatch type drop generic true action drop group l2_drops
  name ingress_vlan_filter type drop generic true action drop group l2_drops
  name ingress_spanning_tree_filter type drop generic true action drop group l2_drops
  name port_list_is_empty type drop generic true action drop group l2_drops
  name port_loopback_filter type drop generic true action drop group l2_drops
  name fid_miss type exception generic false action trap group l2_drops
  name blackhole_route type drop generic true action drop group l3_drops
  name ttl_value_is_too_small type exception generic true action trap group l3_drops
  name tail_drop type drop generic true action drop group buffer_drops

Enable a trap
-------------

Query statistics
----------------

netdevsim/netdevsim10:
  name blackhole_route type drop generic true action trap group l3_drops
    stats:
        rx:
            bytes 7384 packets 52

Monitor dropped packets
-----------------------

dropwatch> set alertmode packet
Setting alert mode
Alert mode successfully set
dropwatch> set sw true
setting software drops monitoring to 1
dropwatch> set hw true
setting hardware drops monitoring to 1
dropwatch> start
Enabling monitoring...
Kernel monitoring activated.
Issue Ctrl-C to stop monitoring

drop at: ttl_value_is_too_small (l3_drops)
origin: hardware
input port ifindex: 55
input port name: eth0
timestamp: Mon Aug 12 10:52:20 2019 445911505 nsec
protocol: 0x800
length: 142
original length: 142

drop at: ip6_mc_input+0x8b8/0xef8 (0xffffffff9e2bb0e8)
origin: software
input port ifindex: 4
timestamp: Mon Aug 12 10:53:37 2019 024444587 nsec
protocol: 0x86dd
length: 110
original length: 110

Future plans
============
* Provide more drop reasons as well as more metadata
* Add dropmon support to libpcap, so that tcpdump/tshark could
  specifically listen on dropmon traffic, instead of capturing all
  netlink packets via nlmon interface

Changes in v3:
* Place test with the rest of the netdevsim tests
* Fix test to load netdevsim module
* Move devlink helpers from the test to devlink_lib.sh. Will be used by
  mlxsw tests
* Re-order netdevsim includes in alphabetical order
* Fix reverse xmas tree in netdevsim
* Remove double include in netdevsim

Changes in v2:
* Use drop monitor to report dropped packets instead of devlink
* Add drop monitor patches
* Add test cases
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
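As a rough illustration of what the dropwatch commands "set hw true" and "start" translate to on the wire, the following is a minimal userspace sketch using libnl-3. The family name "NET_DM" and the NET_DM_CMD_START / NET_DM_ATTR_SW_DROPS / NET_DM_ATTR_HW_DROPS constants come from the kernel UAPI header extended by this series; the program structure and error handling are illustrative only and are not taken from dropwatch itself.

/* sketch: enable software and hardware drop monitoring via generic netlink */
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/net_dropmon.h>

int main(void)
{
	struct nl_sock *sk;
	struct nl_msg *msg;
	int family;

	sk = nl_socket_alloc();
	if (!sk || genl_connect(sk))
		return 1;

	/* Resolve the drop monitor generic netlink family ("NET_DM"). */
	family = genl_ctrl_resolve(sk, "NET_DM");
	if (family < 0)
		return 1;

	msg = nlmsg_alloc();
	if (!msg)
		return 1;

	/* NET_DM_CMD_START with NET_DM_ATTR_HW_DROPS asks the kernel to
	 * start reporting hardware-originated drops; NET_DM_ATTR_SW_DROPS
	 * keeps software drops monitored as well. With neither flag the
	 * kernel falls back to software drops for backward compatibility.
	 */
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    NET_DM_CMD_START, 0);
	nla_put_flag(msg, NET_DM_ATTR_SW_DROPS);
	nla_put_flag(msg, NET_DM_ATTR_HW_DROPS);

	if (nl_send_auto(sk, msg) < 0)
		return 1;

	nlmsg_free(msg);
	nl_socket_free(sk);
	return 0;
}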
Diffstat (limited to 'net/core/drop_monitor.c')
-rw-r--r--	net/core/drop_monitor.c	724
1 file changed, 701 insertions(+), 23 deletions(-)
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 39e094907391..bfc024024aa3 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -26,6 +26,7 @@
 #include <linux/bitops.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <net/drop_monitor.h>
 #include <net/genetlink.h>
 #include <net/netevent.h>
 
@@ -43,6 +44,7 @@
  * netlink alerts
  */
 static int trace_state = TRACE_OFF;
+static bool monitor_hw;
 
 /* net_dm_mutex
  *
@@ -56,9 +58,26 @@ struct net_dm_stats {
 	struct u64_stats_sync syncp;
 };
 
+#define NET_DM_MAX_HW_TRAP_NAME_LEN 40
+
+struct net_dm_hw_entry {
+	char trap_name[NET_DM_MAX_HW_TRAP_NAME_LEN];
+	u32 count;
+};
+
+struct net_dm_hw_entries {
+	u32 num_entries;
+	struct net_dm_hw_entry entries[0];
+};
+
 struct per_cpu_dm_data {
-	spinlock_t		lock;	/* Protects 'skb' and 'send_timer' */
-	struct sk_buff		*skb;
+	spinlock_t		lock;	/* Protects 'skb', 'hw_entries' and
+					 * 'send_timer'
+					 */
+	union {
+		struct sk_buff			*skb;
+		struct net_dm_hw_entries	*hw_entries;
+	};
 	struct sk_buff_head	drop_queue;
 	struct work_struct	dm_alert_work;
 	struct timer_list	send_timer;
@@ -76,6 +95,7 @@ struct dm_hw_stat_delta {
 static struct genl_family net_drop_monitor_family;
 
 static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
+static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data);
 
 static int dm_hit_limit = 64;
 static int dm_delay = 1;
@@ -92,10 +112,16 @@ struct net_dm_alert_ops {
 	void (*napi_poll_probe)(void *ignore, struct napi_struct *napi,
 				int work, int budget);
 	void (*work_item_func)(struct work_struct *work);
+	void (*hw_work_item_func)(struct work_struct *work);
+	void (*hw_probe)(struct sk_buff *skb,
+			 const struct net_dm_hw_metadata *hw_metadata);
 };
 
 struct net_dm_skb_cb {
-	void *pc;
+	union {
+		struct net_dm_hw_metadata *hw_metadata;
+		void *pc;
+	};
 };
 
 #define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0]))
@@ -266,10 +292,190 @@ static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi,
 	rcu_read_unlock();
 }
 
+static struct net_dm_hw_entries *
+net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data)
+{
+	struct net_dm_hw_entries *hw_entries;
+	unsigned long flags;
+
+	hw_entries = kzalloc(struct_size(hw_entries, entries, dm_hit_limit),
+			     GFP_KERNEL);
+	if (!hw_entries) {
+		/* If the memory allocation failed, we try to perform another
+		 * allocation in 1/10 second. Otherwise, the probe function
+		 * will constantly bail out.
+		 */
+		mod_timer(&hw_data->send_timer, jiffies + HZ / 10);
+	}
+
+	spin_lock_irqsave(&hw_data->lock, flags);
+	swap(hw_data->hw_entries, hw_entries);
+	spin_unlock_irqrestore(&hw_data->lock, flags);
+
+	return hw_entries;
+}
+
+static int net_dm_hw_entry_put(struct sk_buff *msg,
+			       const struct net_dm_hw_entry *hw_entry)
+{
+	struct nlattr *attr;
+
+	attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRY);
+	if (!attr)
+		return -EMSGSIZE;
+
+	if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_entry->trap_name))
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, NET_DM_ATTR_HW_TRAP_COUNT, hw_entry->count))
+		goto nla_put_failure;
+
+	nla_nest_end(msg, attr);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(msg, attr);
+	return -EMSGSIZE;
+}
+
+static int net_dm_hw_entries_put(struct sk_buff *msg,
+				 const struct net_dm_hw_entries *hw_entries)
+{
+	struct nlattr *attr;
+	int i;
+
+	attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRIES);
+	if (!attr)
+		return -EMSGSIZE;
+
+	for (i = 0; i < hw_entries->num_entries; i++) {
+		int rc;
+
+		rc = net_dm_hw_entry_put(msg, &hw_entries->entries[i]);
+		if (rc)
+			goto nla_put_failure;
+	}
+
+	nla_nest_end(msg, attr);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(msg, attr);
+	return -EMSGSIZE;
+}
+
+static int
+net_dm_hw_summary_report_fill(struct sk_buff *msg,
+			      const struct net_dm_hw_entries *hw_entries)
+{
+	struct net_dm_alert_msg anc_hdr = { 0 };
+	void *hdr;
+	int rc;
+
+	hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0,
+			  NET_DM_CMD_ALERT);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	/* We need to put the ancillary header in order not to break user
+	 * space.
+	 */
+	if (nla_put(msg, NLA_UNSPEC, sizeof(anc_hdr), &anc_hdr))
+		goto nla_put_failure;
+
+	rc = net_dm_hw_entries_put(msg, hw_entries);
+	if (rc)
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static void net_dm_hw_summary_work(struct work_struct *work)
+{
+	struct net_dm_hw_entries *hw_entries;
+	struct per_cpu_dm_data *hw_data;
+	struct sk_buff *msg;
+	int rc;
+
+	hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
+
+	hw_entries = net_dm_hw_reset_per_cpu_data(hw_data);
+	if (!hw_entries)
+		return;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		goto out;
+
+	rc = net_dm_hw_summary_report_fill(msg, hw_entries);
+	if (rc) {
+		nlmsg_free(msg);
+		goto out;
+	}
+
+	genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL);
+
+out:
+	kfree(hw_entries);
+}
+
+static void
+net_dm_hw_summary_probe(struct sk_buff *skb,
+			const struct net_dm_hw_metadata *hw_metadata)
+{
+	struct net_dm_hw_entries *hw_entries;
+	struct net_dm_hw_entry *hw_entry;
+	struct per_cpu_dm_data *hw_data;
+	unsigned long flags;
+	int i;
+
+	hw_data = this_cpu_ptr(&dm_hw_cpu_data);
+	spin_lock_irqsave(&hw_data->lock, flags);
+	hw_entries = hw_data->hw_entries;
+
+	if (!hw_entries)
+		goto out;
+
+	for (i = 0; i < hw_entries->num_entries; i++) {
+		hw_entry = &hw_entries->entries[i];
+		if (!strncmp(hw_entry->trap_name, hw_metadata->trap_name,
+			     NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) {
+			hw_entry->count++;
+			goto out;
+		}
+	}
+	if (WARN_ON_ONCE(hw_entries->num_entries == dm_hit_limit))
+		goto out;
+
+	hw_entry = &hw_entries->entries[hw_entries->num_entries];
+	strlcpy(hw_entry->trap_name, hw_metadata->trap_name,
+		NET_DM_MAX_HW_TRAP_NAME_LEN - 1);
+	hw_entry->count = 1;
+	hw_entries->num_entries++;
+
+	if (!timer_pending(&hw_data->send_timer)) {
+		hw_data->send_timer.expires = jiffies + dm_delay * HZ;
+		add_timer(&hw_data->send_timer);
+	}
+
+out:
+	spin_unlock_irqrestore(&hw_data->lock, flags);
+}
+
 static const struct net_dm_alert_ops net_dm_alert_summary_ops = {
 	.kfree_skb_probe = trace_kfree_skb_hit,
 	.napi_poll_probe = trace_napi_poll_hit,
 	.work_item_func = send_dm_alert,
+	.hw_work_item_func = net_dm_hw_summary_work,
+	.hw_probe = net_dm_hw_summary_probe,
 };
 
 static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
@@ -323,7 +529,9 @@ static size_t net_dm_in_port_size(void)
 	/* NET_DM_ATTR_IN_PORT nest */
 	return nla_total_size(0) +
 	       /* NET_DM_ATTR_PORT_NETDEV_IFINDEX */
-	       nla_total_size(sizeof(u32));
+	       nla_total_size(sizeof(u32)) +
+	       /* NET_DM_ATTR_PORT_NETDEV_NAME */
+	       nla_total_size(IFNAMSIZ + 1);
 }
 
 #define NET_DM_MAX_SYMBOL_LEN 40
@@ -335,6 +543,8 @@ static size_t net_dm_packet_report_size(size_t payload_len)
 	size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize);
 
 	return NLMSG_ALIGN(size) +
+	       /* NET_DM_ATTR_ORIGIN */
+	       nla_total_size(sizeof(u16)) +
 	       /* NET_DM_ATTR_PC */
 	       nla_total_size(sizeof(u64)) +
 	       /* NET_DM_ATTR_SYMBOL */
@@ -351,7 +561,8 @@ static size_t net_dm_packet_report_size(size_t payload_len)
 	       nla_total_size(payload_len);
 }
 
-static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex)
+static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex,
+					    const char *name)
 {
 	struct nlattr *attr;
 
@@ -363,6 +574,9 @@ static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex)
 	    nla_put_u32(msg, NET_DM_ATTR_PORT_NETDEV_IFINDEX, ifindex))
 		goto nla_put_failure;
 
+	if (name && nla_put_string(msg, NET_DM_ATTR_PORT_NETDEV_NAME, name))
+		goto nla_put_failure;
+
 	nla_nest_end(msg, attr);
 
 	return 0;
@@ -387,6 +601,9 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
 	if (!hdr)
 		return -EMSGSIZE;
 
+	if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW))
+		goto nla_put_failure;
+
 	if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, pc, NET_DM_ATTR_PAD))
 		goto nla_put_failure;
 
@@ -394,7 +611,7 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
 	if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf))
 		goto nla_put_failure;
 
-	rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif);
+	rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif, NULL);
 	if (rc)
 		goto nla_put_failure;
 
@@ -481,10 +698,250 @@ static void net_dm_packet_work(struct work_struct *work)
 		net_dm_packet_report(skb);
 }
 
+static size_t
+net_dm_hw_packet_report_size(size_t payload_len,
+			     const struct net_dm_hw_metadata *hw_metadata)
+{
+	size_t size;
+
+	size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize);
+
+	return NLMSG_ALIGN(size) +
+	       /* NET_DM_ATTR_ORIGIN */
+	       nla_total_size(sizeof(u16)) +
+	       /* NET_DM_ATTR_HW_TRAP_GROUP_NAME */
+	       nla_total_size(strlen(hw_metadata->trap_group_name) + 1) +
+	       /* NET_DM_ATTR_HW_TRAP_NAME */
+	       nla_total_size(strlen(hw_metadata->trap_name) + 1) +
+	       /* NET_DM_ATTR_IN_PORT */
+	       net_dm_in_port_size() +
+	       /* NET_DM_ATTR_TIMESTAMP */
+	       nla_total_size(sizeof(struct timespec)) +
+	       /* NET_DM_ATTR_ORIG_LEN */
+	       nla_total_size(sizeof(u32)) +
+	       /* NET_DM_ATTR_PROTO */
+	       nla_total_size(sizeof(u16)) +
+	       /* NET_DM_ATTR_PAYLOAD */
+	       nla_total_size(payload_len);
+}
+
+static int net_dm_hw_packet_report_fill(struct sk_buff *msg,
+					struct sk_buff *skb, size_t payload_len)
+{
+	struct net_dm_hw_metadata *hw_metadata;
+	struct nlattr *attr;
+	struct timespec ts;
+	void *hdr;
+
+	hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+
+	hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0,
+			  NET_DM_CMD_PACKET_ALERT);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_HW))
+		goto nla_put_failure;
+
+	if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_GROUP_NAME,
+			   hw_metadata->trap_group_name))
+		goto nla_put_failure;
+
+	if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME,
+			   hw_metadata->trap_name))
+		goto nla_put_failure;
+
+	if (hw_metadata->input_dev) {
+		struct net_device *dev = hw_metadata->input_dev;
+		int rc;
+
+		rc = net_dm_packet_report_in_port_put(msg, dev->ifindex,
+						      dev->name);
+		if (rc)
+			goto nla_put_failure;
+	}
+
+	if (ktime_to_timespec_cond(skb->tstamp, &ts) &&
+	    nla_put(msg, NET_DM_ATTR_TIMESTAMP, sizeof(ts), &ts))
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len))
+		goto nla_put_failure;
+
+	if (!payload_len)
+		goto out;
+
+	if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol)))
+		goto nla_put_failure;
+
+	attr = skb_put(msg, nla_total_size(payload_len));
+	attr->nla_type = NET_DM_ATTR_PAYLOAD;
+	attr->nla_len = nla_attr_size(payload_len);
+	if (skb_copy_bits(skb, 0, nla_data(attr), payload_len))
+		goto nla_put_failure;
+
+out:
+	genlmsg_end(msg, hdr);
+
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static struct net_dm_hw_metadata *
+net_dm_hw_metadata_clone(const struct net_dm_hw_metadata *hw_metadata)
+{
+	struct net_dm_hw_metadata *n_hw_metadata;
+	const char *trap_group_name;
+	const char *trap_name;
+
+	n_hw_metadata = kmalloc(sizeof(*hw_metadata), GFP_ATOMIC);
+	if (!n_hw_metadata)
+		return NULL;
+
+	trap_group_name = kmemdup(hw_metadata->trap_group_name,
+				  strlen(hw_metadata->trap_group_name) + 1,
+				  GFP_ATOMIC | __GFP_ZERO);
+	if (!trap_group_name)
+		goto free_hw_metadata;
+	n_hw_metadata->trap_group_name = trap_group_name;
+
+	trap_name = kmemdup(hw_metadata->trap_name,
+			    strlen(hw_metadata->trap_name) + 1,
+			    GFP_ATOMIC | __GFP_ZERO);
+	if (!trap_name)
+		goto free_trap_group;
+	n_hw_metadata->trap_name = trap_name;
+
+	n_hw_metadata->input_dev = hw_metadata->input_dev;
+	if (n_hw_metadata->input_dev)
+		dev_hold(n_hw_metadata->input_dev);
+
+	return n_hw_metadata;
+
+free_trap_group:
+	kfree(trap_group_name);
+free_hw_metadata:
+	kfree(n_hw_metadata);
+	return NULL;
+}
+
+static void
+net_dm_hw_metadata_free(const struct net_dm_hw_metadata *hw_metadata)
+{
+	if (hw_metadata->input_dev)
+		dev_put(hw_metadata->input_dev);
+	kfree(hw_metadata->trap_name);
+	kfree(hw_metadata->trap_group_name);
+	kfree(hw_metadata);
+}
+
+static void net_dm_hw_packet_report(struct sk_buff *skb)
+{
+	struct net_dm_hw_metadata *hw_metadata;
+	struct sk_buff *msg;
+	size_t payload_len;
+	int rc;
+
+	if (skb->data > skb_mac_header(skb))
+		skb_push(skb, skb->data - skb_mac_header(skb));
+	else
+		skb_pull(skb, skb_mac_header(skb) - skb->data);
+
+	payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE);
+	if (net_dm_trunc_len)
+		payload_len = min_t(size_t, net_dm_trunc_len, payload_len);
+
+	hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+	msg = nlmsg_new(net_dm_hw_packet_report_size(payload_len, hw_metadata),
+			GFP_KERNEL);
+	if (!msg)
+		goto out;
+
+	rc = net_dm_hw_packet_report_fill(msg, skb, payload_len);
+	if (rc) {
+		nlmsg_free(msg);
+		goto out;
+	}
+
+	genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL);
+
+out:
+	net_dm_hw_metadata_free(NET_DM_SKB_CB(skb)->hw_metadata);
+	consume_skb(skb);
+}
+
+static void net_dm_hw_packet_work(struct work_struct *work)
+{
+	struct per_cpu_dm_data *hw_data;
+	struct sk_buff_head list;
+	struct sk_buff *skb;
+	unsigned long flags;
+
+	hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
+
+	__skb_queue_head_init(&list);
+
+	spin_lock_irqsave(&hw_data->drop_queue.lock, flags);
+	skb_queue_splice_tail_init(&hw_data->drop_queue, &list);
+	spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags);
+
+	while ((skb = __skb_dequeue(&list)))
+		net_dm_hw_packet_report(skb);
+}
+
+static void
+net_dm_hw_packet_probe(struct sk_buff *skb,
+		       const struct net_dm_hw_metadata *hw_metadata)
+{
+	struct net_dm_hw_metadata *n_hw_metadata;
+	ktime_t tstamp = ktime_get_real();
+	struct per_cpu_dm_data *hw_data;
+	struct sk_buff *nskb;
+	unsigned long flags;
+
+	nskb = skb_clone(skb, GFP_ATOMIC);
+	if (!nskb)
+		return;
+
+	n_hw_metadata = net_dm_hw_metadata_clone(hw_metadata);
+	if (!n_hw_metadata)
+		goto free;
+
+	NET_DM_SKB_CB(nskb)->hw_metadata = n_hw_metadata;
+	nskb->tstamp = tstamp;
+
+	hw_data = this_cpu_ptr(&dm_hw_cpu_data);
+
+	spin_lock_irqsave(&hw_data->drop_queue.lock, flags);
+	if (skb_queue_len(&hw_data->drop_queue) < net_dm_queue_len)
+		__skb_queue_tail(&hw_data->drop_queue, nskb);
+	else
+		goto unlock_free;
+	spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags);
+
+	schedule_work(&hw_data->dm_alert_work);
+
+	return;
+
+unlock_free:
+	spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags);
+	u64_stats_update_begin(&hw_data->stats.syncp);
+	hw_data->stats.dropped++;
+	u64_stats_update_end(&hw_data->stats.syncp);
+	net_dm_hw_metadata_free(n_hw_metadata);
+free:
+	consume_skb(nskb);
+}
+
 static const struct net_dm_alert_ops net_dm_alert_packet_ops = {
 	.kfree_skb_probe = net_dm_packet_trace_kfree_skb_hit,
 	.napi_poll_probe = net_dm_packet_trace_napi_poll_hit,
 	.work_item_func = net_dm_packet_work,
+	.hw_work_item_func = net_dm_hw_packet_work,
+	.hw_probe = net_dm_hw_packet_probe,
 };
 
 static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = {
@@ -492,6 +949,85 @@ static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = {
 	[NET_DM_ALERT_MODE_PACKET] = &net_dm_alert_packet_ops,
 };
 
+void net_dm_hw_report(struct sk_buff *skb,
+		      const struct net_dm_hw_metadata *hw_metadata)
+{
+	rcu_read_lock();
+
+	if (!monitor_hw)
+		goto out;
+
+	net_dm_alert_ops_arr[net_dm_alert_mode]->hw_probe(skb, hw_metadata);
+
+out:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(net_dm_hw_report);
+
+static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack)
+{
+	const struct net_dm_alert_ops *ops;
+	int cpu;
+
+	if (monitor_hw) {
+		NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already enabled");
+		return -EAGAIN;
+	}
+
+	ops = net_dm_alert_ops_arr[net_dm_alert_mode];
+
+	if (!try_module_get(THIS_MODULE)) {
+		NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module");
+		return -ENODEV;
+	}
+
+	for_each_possible_cpu(cpu) {
+		struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+		struct net_dm_hw_entries *hw_entries;
+
+		INIT_WORK(&hw_data->dm_alert_work, ops->hw_work_item_func);
+		timer_setup(&hw_data->send_timer, sched_send_work, 0);
+		hw_entries = net_dm_hw_reset_per_cpu_data(hw_data);
+		kfree(hw_entries);
+	}
+
+	monitor_hw = true;
+
+	return 0;
+}
+
+static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack)
+{
+	int cpu;
+
+	if (!monitor_hw)
+		NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already disabled");
+
+	monitor_hw = false;
+
+	/* After this call returns we are guaranteed that no CPU is processing
+	 * any hardware drops.
+	 */
+	synchronize_rcu();
+
+	for_each_possible_cpu(cpu) {
+		struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+		struct sk_buff *skb;
+
+		del_timer_sync(&hw_data->send_timer);
+		cancel_work_sync(&hw_data->dm_alert_work);
+		while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
+			struct net_dm_hw_metadata *hw_metadata;
+
+			hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+			net_dm_hw_metadata_free(hw_metadata);
+			consume_skb(skb);
+		}
+	}
+
+	module_put(THIS_MODULE);
+}
+
 static int net_dm_trace_on_set(struct netlink_ext_ack *extack)
 {
 	const struct net_dm_alert_ops *ops;
@@ -604,6 +1140,11 @@ static int set_all_monitor_traces(int state, struct netlink_ext_ack *extack)
 	return rc;
 }
 
+static bool net_dm_is_monitoring(void)
+{
+	return trace_state == TRACE_ON || monitor_hw;
+}
+
 static int net_dm_alert_mode_get_from_info(struct genl_info *info,
 					   enum net_dm_alert_mode *p_alert_mode)
 {
@@ -665,8 +1206,8 @@ static int net_dm_cmd_config(struct sk_buff *skb,
 	struct netlink_ext_ack *extack = info->extack;
 	int rc;
 
-	if (trace_state == TRACE_ON) {
-		NL_SET_ERR_MSG_MOD(extack, "Cannot configure drop monitor while tracing is on");
+	if (net_dm_is_monitoring()) {
+		NL_SET_ERR_MSG_MOD(extack, "Cannot configure drop monitor during monitoring");
 		return -EBUSY;
 	}
 
@@ -681,14 +1222,61 @@ static int net_dm_cmd_config(struct sk_buff *skb,
 	return 0;
 }
 
+static int net_dm_monitor_start(bool set_sw, bool set_hw,
+				struct netlink_ext_ack *extack)
+{
+	bool sw_set = false;
+	int rc;
+
+	if (set_sw) {
+		rc = set_all_monitor_traces(TRACE_ON, extack);
+		if (rc)
+			return rc;
+		sw_set = true;
+	}
+
+	if (set_hw) {
+		rc = net_dm_hw_monitor_start(extack);
+		if (rc)
+			goto err_monitor_hw;
+	}
+
+	return 0;
+
+err_monitor_hw:
+	if (sw_set)
+		set_all_monitor_traces(TRACE_OFF, extack);
+	return rc;
+}
+
+static void net_dm_monitor_stop(bool set_sw, bool set_hw,
+				struct netlink_ext_ack *extack)
+{
+	if (set_hw)
+		net_dm_hw_monitor_stop(extack);
+	if (set_sw)
+		set_all_monitor_traces(TRACE_OFF, extack);
+}
+
 static int net_dm_cmd_trace(struct sk_buff *skb,
 			    struct genl_info *info)
 {
+	bool set_sw = !!info->attrs[NET_DM_ATTR_SW_DROPS];
+	bool set_hw = !!info->attrs[NET_DM_ATTR_HW_DROPS];
+	struct netlink_ext_ack *extack = info->extack;
+
+	/* To maintain backward compatibility, we start / stop monitoring of
+	 * software drops if no flag is specified.
+	 */
+	if (!set_sw && !set_hw)
+		set_sw = true;
+
 	switch (info->genlhdr->cmd) {
 	case NET_DM_CMD_START:
-		return set_all_monitor_traces(TRACE_ON, info->extack);
+		return net_dm_monitor_start(set_sw, set_hw, extack);
 	case NET_DM_CMD_STOP:
-		return set_all_monitor_traces(TRACE_OFF, info->extack);
+		net_dm_monitor_stop(set_sw, set_hw, extack);
+		return 0;
 	}
 
 	return -EOPNOTSUPP;
@@ -785,6 +1373,50 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static void net_dm_hw_stats_read(struct net_dm_stats *stats)
+{
+	int cpu;
+
+	memset(stats, 0, sizeof(*stats));
+	for_each_possible_cpu(cpu) {
+		struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+		struct net_dm_stats *cpu_stats = &hw_data->stats;
+		unsigned int start;
+		u64 dropped;
+
+		do {
+			start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+			dropped = cpu_stats->dropped;
+		} while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+
+		stats->dropped += dropped;
+	}
+}
+
+static int net_dm_hw_stats_put(struct sk_buff *msg)
+{
+	struct net_dm_stats stats;
+	struct nlattr *attr;
+
+	net_dm_hw_stats_read(&stats);
+
+	attr = nla_nest_start(msg, NET_DM_ATTR_HW_STATS);
+	if (!attr)
+		return -EMSGSIZE;
+
+	if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED,
+			      stats.dropped, NET_DM_ATTR_PAD))
+		goto nla_put_failure;
+
+	nla_nest_end(msg, attr);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(msg, attr);
+	return -EMSGSIZE;
+}
+
 static int net_dm_stats_fill(struct sk_buff *msg, struct genl_info *info)
 {
 	void *hdr;
@@ -799,6 +1431,10 @@ static int net_dm_stats_fill(struct sk_buff *msg, struct genl_info *info)
 	if (rc)
 		goto nla_put_failure;
 
+	rc = net_dm_hw_stats_put(msg);
+	if (rc)
+		goto nla_put_failure;
+
 	genlmsg_end(msg, hdr);
 
 	return 0;
@@ -872,6 +1508,8 @@ static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = {
 	[NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 },
 	[NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 },
 	[NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 },
+	[NET_DM_ATTR_SW_DROPS]	= {. type = NLA_FLAG },
+	[NET_DM_ATTR_HW_DROPS]	= {. type = NLA_FLAG },
 };
 
 static const struct genl_ops dropmon_ops[] = {
@@ -934,9 +1572,57 @@ static struct notifier_block dropmon_net_notifier = {
 	.notifier_call = dropmon_net_event
 };
 
-static int __init init_net_drop_monitor(void)
+static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data)
+{
+	spin_lock_init(&data->lock);
+	skb_queue_head_init(&data->drop_queue);
+	u64_stats_init(&data->stats.syncp);
+}
+
+static void __net_dm_cpu_data_fini(struct per_cpu_dm_data *data)
+{
+	WARN_ON(!skb_queue_empty(&data->drop_queue));
+}
+
+static void net_dm_cpu_data_init(int cpu)
 {
 	struct per_cpu_dm_data *data;
+
+	data = &per_cpu(dm_cpu_data, cpu);
+	__net_dm_cpu_data_init(data);
+}
+
+static void net_dm_cpu_data_fini(int cpu)
+{
+	struct per_cpu_dm_data *data;
+
+	data = &per_cpu(dm_cpu_data, cpu);
+	/* At this point, we should have exclusive access
+	 * to this struct and can free the skb inside it.
+	 */
+	consume_skb(data->skb);
+	__net_dm_cpu_data_fini(data);
+}
+
+static void net_dm_hw_cpu_data_init(int cpu)
+{
+	struct per_cpu_dm_data *hw_data;
+
+	hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+	__net_dm_cpu_data_init(hw_data);
+}
+
+static void net_dm_hw_cpu_data_fini(int cpu)
+{
+	struct per_cpu_dm_data *hw_data;
+
+	hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+	kfree(hw_data->hw_entries);
+	__net_dm_cpu_data_fini(hw_data);
+}
+
+static int __init init_net_drop_monitor(void)
+{
 	int cpu, rc;
 
 	pr_info("Initializing network drop monitor service\n");
@@ -962,10 +1648,8 @@ static int __init init_net_drop_monitor(void)
 	rc = 0;
 
 	for_each_possible_cpu(cpu) {
-		data = &per_cpu(dm_cpu_data, cpu);
-		spin_lock_init(&data->lock);
-		skb_queue_head_init(&data->drop_queue);
-		u64_stats_init(&data->stats.syncp);
+		net_dm_cpu_data_init(cpu);
+		net_dm_hw_cpu_data_init(cpu);
 	}
 
 	goto out;
@@ -978,7 +1662,6 @@ out:
 
 static void exit_net_drop_monitor(void)
 {
-	struct per_cpu_dm_data *data;
 	int cpu;
 
 	BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
@@ -989,13 +1672,8 @@ static void exit_net_drop_monitor(void)
 	 */
 
 	for_each_possible_cpu(cpu) {
-		data = &per_cpu(dm_cpu_data, cpu);
-		/*
-		 * At this point, we should have exclusive access
-		 * to this struct and can free the skb inside it
-		 */
-		kfree_skb(data->skb);
-		WARN_ON(!skb_queue_empty(&data->drop_queue));
+		net_dm_hw_cpu_data_fini(cpu);
+		net_dm_cpu_data_fini(cpu);
 	}
 
 	BUG_ON(genl_unregister_family(&net_drop_monitor_family));
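The entry point this diff adds for reporting hardware drops is net_dm_hw_report(), exported above. A minimal caller sketch follows; in the full patchset the devlink-trap core is the component that invokes it, and the struct net_dm_hw_metadata layout shown here (trap_group_name, trap_name, input_dev) is inferred from the fields this file consumes, so treat it as illustrative rather than the authoritative definition in <net/drop_monitor.h>. The trap and group names in the initializer are just the example values from the commit message.

/* Hypothetical sketch of reporting a hardware-dropped packet to drop monitor. */
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/drop_monitor.h>

static void example_report_trapped_skb(struct sk_buff *skb,
					struct net_device *in_dev)
{
	struct net_dm_hw_metadata hw_metadata = {
		.trap_group_name = "l3_drops",		/* devlink trap group */
		.trap_name = "blackhole_route",		/* devlink trap name */
		.input_dev = in_dev,			/* ingress netdevice, may be NULL */
	};

	/* Hands the packet to drop monitor. In "packet" alert mode the skb is
	 * cloned and reported to userspace with its metadata
	 * (net_dm_hw_packet_probe()); in "summary" mode only a per-trap
	 * counter is incremented (net_dm_hw_summary_probe()).
	 */
	net_dm_hw_report(skb, &hw_metadata);
}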