author	Willem de Bruijn <willemb@google.com>	2013-05-20 00:02:32 -0400
committer	David S. Miller <davem@davemloft.net>	2013-05-20 16:48:04 -0400
commit	99bbc70741903c063b3ccad90a3e06fc55df9245 (patch)
tree	a3377d2461242bf1134464ce3fe6d69f82c907c2 /net
parent	4a5bddf7ea6b6c5916eccbc2fa1950555073ff48 (diff)
rps: selective flow shedding during softnet overflow
A cpu executing the network receive path sheds packets when its input queue
grows to netdev_max_backlog. A single high rate flow (such as a spoofed source
DoS) can exceed a single cpu's processing rate and will degrade throughput of
other flows hashed onto the same cpu.

This patch adds a more fine grained hashtable. If the netdev backlog is above a
threshold, IRQ cpus track the ratio of total traffic of each flow (using 4096
buckets, configurable). The ratio is measured by counting the number of packets
per flow over the last 256 packets from the source cpu. Any flow that occupies
a large fraction of this (set at 50%) will see packet drop while above the
threshold.

Tested:
Setup is a multi-threaded UDP echo server with network rx IRQ on cpu0, kernel
receive (RPS) on cpu0 and application threads on cpus 2--7 each handling 20k
req/s. Throughput halves when hit with a 400 kpps antagonist storm. With this
patch applied, antagonist overload is dropped and the server processes its
complete load.

The patch is effective when kernel receive processing is the bottleneck. The
above RPS scenario is an extreme case, but the same point is reached with RFS
and sufficient kernel processing (iptables, packet socket tap, ..).

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
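For illustration, a minimal userspace sketch of the accounting scheme described
above, as implemented by skb_flow_limit() in net/core/dev.c below; the struct
layout, names, and counter widths here are simplified stand-ins, not the
patch's actual sd_flow_limit definitions.

/*
 * Simplified model of the per-cpu flow accounting: a ring of the last
 * 256 packets' flow buckets plus per-bucket counts over that window.
 * Counters are widened to uint16_t so this standalone model cannot
 * overflow; the kernel code keeps the state per softnet_data instance.
 */
#include <stdbool.h>
#include <stdint.h>

#define FLOW_HISTORY	256	/* packets in the sliding window */
#define NUM_BUCKETS	4096	/* default netdev_flow_limit_table_len */

struct flow_limit_model {
	unsigned int history_head;		/* next slot in the ring */
	uint16_t history[FLOW_HISTORY];		/* bucket id per recent packet */
	uint16_t buckets[NUM_BUCKETS];		/* packets per bucket in window */
};

/* Returns true if this packet's flow exceeds 50% of the recent window. */
static bool flow_over_limit(struct flow_limit_model *fl, uint32_t rxhash)
{
	unsigned int new_flow = rxhash & (NUM_BUCKETS - 1);
	unsigned int old_flow = fl->history[fl->history_head];

	/* Slide the window: the oldest packet leaves its bucket... */
	fl->history[fl->history_head] = new_flow;
	fl->history_head = (fl->history_head + 1) & (FLOW_HISTORY - 1);
	if (fl->buckets[old_flow])
		fl->buckets[old_flow]--;

	/* ...and the new packet is charged to its bucket. */
	return ++fl->buckets[new_flow] > (FLOW_HISTORY / 2);
}

In the patch, enqueue_to_backlog() only consults this check once the backlog
has reached half of netdev_max_backlog, so flows on a lightly loaded cpu are
never penalized.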
Diffstat (limited to 'net')
-rw-r--r--	net/Kconfig	12
-rw-r--r--	net/core/dev.c	48
-rw-r--r--	net/core/net-procfs.c	16
-rw-r--r--	net/core/sysctl_net_core.c	104
4 files changed, 177 insertions(+), 3 deletions(-)
diff --git a/net/Kconfig b/net/Kconfig
index 2ddc9046868e..08de901415ee 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -259,6 +259,18 @@ config BPF_JIT
 	  packet sniffing (libpcap/tcpdump). Note : Admin should enable
 	  this feature changing /proc/sys/net/core/bpf_jit_enable
 
+config NET_FLOW_LIMIT
+	boolean
+	depends on RPS
+	default y
+	---help---
+	  The network stack has to drop packets when a receive processing CPU's
+	  backlog reaches netdev_max_backlog. If a few out of many active flows
+	  generate the vast majority of load, drop their traffic earlier to
+	  maintain capacity for the other flows. This feature provides servers
+	  with many clients some protection against DoS by a single (spoofed)
+	  flow that greatly exceeds average workload.
+
 menu "Network testing"
 
 config NET_PKTGEN
diff --git a/net/core/dev.c b/net/core/dev.c
index 18e9730cc4be..7229bc30e509 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3064,6 +3064,46 @@ static int rps_ipi_queued(struct softnet_data *sd)
 	return 0;
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+	struct softnet_data *sd;
+	unsigned int old_flow, new_flow;
+
+	if (qlen < (netdev_max_backlog >> 1))
+		return false;
+
+	sd = &__get_cpu_var(softnet_data);
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl) {
+		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
+		old_flow = fl->history[fl->history_head];
+		fl->history[fl->history_head] = new_flow;
+
+		fl->history_head++;
+		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+		if (likely(fl->buckets[old_flow]))
+			fl->buckets[old_flow]--;
+
+		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+			fl->count++;
+			rcu_read_unlock();
+			return true;
+		}
+	}
+	rcu_read_unlock();
+#endif
+	return false;
+}
+
 /*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
  * queue (may be a remote CPU queue).
@@ -3073,13 +3113,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 {
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int qlen;
 
 	sd = &per_cpu(softnet_data, cpu);
 
 	local_irq_save(flags);
 
 	rps_lock(sd);
-	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+	qlen = skb_queue_len(&sd->input_pkt_queue);
+	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -6269,6 +6311,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 		sd->backlog.gro_list = NULL;
 		sd->backlog.gro_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+		sd->flow_limit = NULL;
+#endif
 	}
 
 	dev_boot_phase = 0;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 569d355fec3e..2bf83299600a 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -146,11 +146,23 @@ static void softnet_seq_stop(struct seq_file *seq, void *v)
 static int softnet_seq_show(struct seq_file *seq, void *v)
 {
 	struct softnet_data *sd = v;
+	unsigned int flow_limit_count = 0;
 
-	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl)
+		flow_limit_count = fl->count;
+	rcu_read_unlock();
+#endif
+
+	seq_printf(seq,
+		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
-		   sd->cpu_collision, sd->received_rps);
+		   sd->cpu_collision, sd->received_rps, flow_limit_count);
 	return 0;
 }
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cfdb46ab3a7f..741db5fc7806 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -87,6 +87,96 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
 }
 #endif /* CONFIG_RPS */
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+static DEFINE_MUTEX(flow_limit_update_mutex);
+
+static int flow_limit_cpu_sysctl(ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos)
+{
+	struct sd_flow_limit *cur;
+	struct softnet_data *sd;
+	cpumask_var_t mask;
+	int i, len, ret = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (write) {
+		ret = cpumask_parse_user(buffer, *lenp, mask);
+		if (ret)
+			goto done;
+
+		mutex_lock(&flow_limit_update_mutex);
+		len = sizeof(*cur) + netdev_flow_limit_table_len;
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			cur = rcu_dereference_protected(sd->flow_limit,
+				     lockdep_is_held(&flow_limit_update_mutex));
+			if (cur && !cpumask_test_cpu(i, mask)) {
+				RCU_INIT_POINTER(sd->flow_limit, NULL);
+				synchronize_rcu();
+				kfree(cur);
+			} else if (!cur && cpumask_test_cpu(i, mask)) {
+				cur = kzalloc(len, GFP_KERNEL);
+				if (!cur) {
+					/* not unwinding previous changes */
+					ret = -ENOMEM;
+					goto write_unlock;
+				}
+				cur->num_buckets = netdev_flow_limit_table_len;
+				rcu_assign_pointer(sd->flow_limit, cur);
+			}
+		}
+write_unlock:
+		mutex_unlock(&flow_limit_update_mutex);
+	} else {
+		if (*ppos || !*lenp) {
+			*lenp = 0;
+			goto done;
+		}
+
+		cpumask_clear(mask);
+		rcu_read_lock();
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			if (rcu_dereference(sd->flow_limit))
+				cpumask_set_cpu(i, mask);
+		}
+		rcu_read_unlock();
+
+		len = cpumask_scnprintf(buffer, *lenp, mask);
+		*lenp = len + 1;
+		*ppos += len + 1;
+	}
+
+done:
+	free_cpumask_var(mask);
+	return ret;
+}
+
+static int flow_limit_table_len_sysctl(ctl_table *table, int write,
+				       void __user *buffer, size_t *lenp,
+				       loff_t *ppos)
+{
+	unsigned int old, *ptr;
+	int ret;
+
+	mutex_lock(&flow_limit_update_mutex);
+
+	ptr = table->data;
+	old = *ptr;
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write && !is_power_of_2(*ptr)) {
+		*ptr = old;
+		ret = -EINVAL;
+	}
+
+	mutex_unlock(&flow_limit_update_mutex);
+	return ret;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -180,6 +270,20 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= rps_sock_flow_sysctl
 	},
 #endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+	{
+		.procname	= "flow_limit_cpu_bitmap",
+		.mode		= 0644,
+		.proc_handler	= flow_limit_cpu_sysctl
+	},
+	{
+		.procname	= "flow_limit_table_len",
+		.data		= &netdev_flow_limit_table_len,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= flow_limit_table_len_sysctl
+	},
+#endif /* CONFIG_NET_FLOW_LIMIT */
 #endif /* CONFIG_NET */
 	{
 		.procname	= "netdev_budget",
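As a usage sketch (not part of the patch): with CONFIG_NET_FLOW_LIMIT built in,
flow limiting is enabled per RPS cpu by writing a hex cpumask to the new
flow_limit_cpu_bitmap sysctl. The mask value and the small helper below are
examples only.

/* Example only: enable flow limiting on cpus 0-3 (hex mask "f").
 * Writing the sysctl requires root; error handling kept minimal. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/core/flow_limit_cpu_bitmap", "w");

	if (!f) {
		perror("flow_limit_cpu_bitmap");
		return 1;
	}
	fputs("f\n", f);		/* cpus 0-3 */
	return fclose(f) ? 1 : 0;
}

flow_limit_table_len accepts only powers of two, since skb_flow_limit() selects
a bucket by masking the packet's rxhash with (num_buckets - 1).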