 include/linux/netdevice.h  |  17 ++
 net/Kconfig                |  12 ++
 net/core/dev.c             |  48 +++-
 net/core/net-procfs.c      |  16 ++-
 net/core/sysctl_net_core.c | 104 +++++++
 5 files changed, 194 insertions(+), 3 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a94a5a0ab122..7dd535d4b41e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1778,6 +1778,19 @@ static inline int unregister_gifconf(unsigned int family)
 	return register_gifconf(family, NULL);
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+#define FLOW_LIMIT_HISTORY	(1 << 8)	/* must be ^2 */
+struct sd_flow_limit {
+	u64		count;
+	unsigned int	num_buckets;
+	unsigned int	history_head;
+	u16		history[FLOW_LIMIT_HISTORY];
+	u8		buckets[];
+};
+
+extern int netdev_flow_limit_table_len;
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 /*
  * Incoming packets are placed on per-cpu queues
  */
@@ -1807,6 +1820,10 @@ struct softnet_data {
 	unsigned int		dropped;
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit	*flow_limit;
+#endif
 };
 
 static inline void input_queue_head_incr(struct softnet_data *sd)
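The footprint of this structure is modest: with FLOW_LIMIT_HISTORY = 256, the fixed fields take 8 + 4 + 4 + 512 = 528 bytes, and the flexible buckets[] array adds one byte per bucket, netdev_flow_limit_table_len bytes in total (4096 by default, set in net/core/dev.c below). A standalone sketch of that arithmetic, re-declaring the struct with stdint types so it compiles outside the kernel:

#include <stdint.h>
#include <stdio.h>

/* Standalone re-declaration of struct sd_flow_limit above (u64/u16/u8
 * spelled as stdint types) to show the per-CPU cost at default sizes. */
#define FLOW_LIMIT_HISTORY (1 << 8)

struct sd_flow_limit {
        uint64_t count;
        unsigned int num_buckets;
        unsigned int history_head;
        uint16_t history[FLOW_LIMIT_HISTORY];
        uint8_t buckets[];
};

int main(void)
{
        size_t table_len = 1 << 12;  /* default netdev_flow_limit_table_len */

        /* 528 fixed bytes + 4096 bucket bytes ~= 4.5 KiB per enabled CPU */
        printf("%zu bytes\n", sizeof(struct sd_flow_limit) + table_len);
        return 0;
}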
diff --git a/net/Kconfig b/net/Kconfig
index 2ddc9046868e..08de901415ee 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -259,6 +259,18 @@ config BPF_JIT
 	  packet sniffing (libpcap/tcpdump). Note : Admin should enable
 	  this feature changing /proc/sys/net/core/bpf_jit_enable
 
+config NET_FLOW_LIMIT
+	boolean
+	depends on RPS
+	default y
+	---help---
+	  The network stack has to drop packets when a receive processing CPU's
+	  backlog reaches netdev_max_backlog. If a few out of many active flows
+	  generate the vast majority of load, drop their traffic earlier to
+	  maintain capacity for the other flows. This feature provides servers
+	  with many clients some protection against DoS by a single (spoofed)
+	  flow that greatly exceeds average workload.
+
 menu "Network testing"
 
 config NET_PKTGEN
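In concrete terms (the thresholds appear in the net/core/dev.c changes below): flow limiting only engages once a CPU's input queue has grown to at least half of netdev_max_backlog, and a flow is then shed only while it accounts for more than half of the last 256 packets enqueued on that CPU, so the many well-behaved flows keep the remaining queue capacity.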
diff --git a/net/core/dev.c b/net/core/dev.c
index 18e9730cc4be..7229bc30e509 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3064,6 +3064,46 @@ static int rps_ipi_queued(struct softnet_data *sd)
 	return 0;
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+	struct softnet_data *sd;
+	unsigned int old_flow, new_flow;
+
+	if (qlen < (netdev_max_backlog >> 1))
+		return false;
+
+	sd = &__get_cpu_var(softnet_data);
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl) {
+		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
+		old_flow = fl->history[fl->history_head];
+		fl->history[fl->history_head] = new_flow;
+
+		fl->history_head++;
+		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+		if (likely(fl->buckets[old_flow]))
+			fl->buckets[old_flow]--;
+
+		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+			fl->count++;
+			rcu_read_unlock();
+			return true;
+		}
+	}
+	rcu_read_unlock();
+#endif
+	return false;
+}
+
 /*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
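The loop above is a fixed-size heavy-hitter filter: history is a ring buffer holding the hash bucket of each of the last FLOW_LIMIT_HISTORY (256) enqueued packets, and buckets counts how many packets in that window fell into each bucket; a packet is dropped while its flow's bucket covers more than half the window. A minimal userspace model of the same scheme, using the patch defaults and hypothetical names (flow_over_limit is not a kernel function):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HISTORY     256     /* FLOW_LIMIT_HISTORY */
#define NUM_BUCKETS 4096    /* default netdev_flow_limit_table_len */

struct flow_limit {
        unsigned int history_head;
        uint16_t history[HISTORY];     /* bucket of each recent packet */
        uint8_t buckets[NUM_BUCKETS];  /* per-bucket packet count */
};

/* Mirrors the kernel loop: expire the oldest sample, record the newest,
 * and flag the flow once it owns more than half of the window. */
static bool flow_over_limit(struct flow_limit *fl, uint32_t rxhash)
{
        unsigned int new_flow = rxhash & (NUM_BUCKETS - 1);
        unsigned int old_flow = fl->history[fl->history_head];

        fl->history[fl->history_head] = new_flow;
        fl->history_head = (fl->history_head + 1) & (HISTORY - 1);

        if (fl->buckets[old_flow])     /* guard against stale zero entries */
                fl->buckets[old_flow]--;

        return ++fl->buckets[new_flow] > HISTORY / 2;
}

int main(void)
{
        static struct flow_limit fl;   /* zero-initialized, like kzalloc() */
        uint32_t i;

        /* One flow sending every packet crosses the threshold as soon as
         * its bucket count exceeds HISTORY / 2 = 128. */
        for (i = 1; i <= HISTORY; i++) {
                if (flow_over_limit(&fl, 0x12345678)) {
                        printf("first drop at packet %u\n", i); /* 129 */
                        break;
                }
        }
        return 0;
}

Because one sample expires for every sample added, the filter forgets a hot flow within one window (256 packets) once it slows down. Detection keys on skb_get_rxhash(), so distinct flows can collide in a bucket; growing the table via the sysctl added below makes such collisions rarer.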
@@ -3073,13 +3113,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 {
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int qlen;
 
 	sd = &per_cpu(softnet_data, cpu);
 
 	local_irq_save(flags);
 
 	rps_lock(sd);
-	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+	qlen = skb_queue_len(&sd->input_pkt_queue);
+	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
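Note that qlen is sampled once under rps_lock and reused for both checks, so the backlog bound and the flow-limit decision see a consistent queue length; and because && short-circuits, skb_flow_limit() is not consulted at all once the queue has passed netdev_max_backlog, where every packet is dropped anyway.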
@@ -6269,6 +6311,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 		sd->backlog.gro_list = NULL;
 		sd->backlog.gro_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+		sd->flow_limit = NULL;
+#endif
 	}
 
 	dev_boot_phase = 0;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 569d355fec3e..2bf83299600a 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -146,11 +146,23 @@ static void softnet_seq_stop(struct seq_file *seq, void *v)
 static int softnet_seq_show(struct seq_file *seq, void *v)
 {
 	struct softnet_data *sd = v;
+	unsigned int flow_limit_count = 0;
 
-	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl)
+		flow_limit_count = fl->count;
+	rcu_read_unlock();
+#endif
+
+	seq_printf(seq,
+		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
-		   sd->cpu_collision, sd->received_rps);
+		   sd->cpu_collision, sd->received_rps, flow_limit_count);
 	return 0;
 }
 
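After this change each row of /proc/net/softnet_stat carries eleven 8-digit hex fields rather than ten; the new final column is that CPU's cumulative count of packets dropped by the flow limit (fl->count), or zero where the feature is compiled out or disabled. Scripts that parse the file positionally should key on the field count.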
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cfdb46ab3a7f..741db5fc7806 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -87,6 +87,96 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
 }
 #endif /* CONFIG_RPS */
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+static DEFINE_MUTEX(flow_limit_update_mutex);
+
+static int flow_limit_cpu_sysctl(ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos)
+{
+	struct sd_flow_limit *cur;
+	struct softnet_data *sd;
+	cpumask_var_t mask;
+	int i, len, ret = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (write) {
+		ret = cpumask_parse_user(buffer, *lenp, mask);
+		if (ret)
+			goto done;
+
+		mutex_lock(&flow_limit_update_mutex);
+		len = sizeof(*cur) + netdev_flow_limit_table_len;
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			cur = rcu_dereference_protected(sd->flow_limit,
+				     lockdep_is_held(&flow_limit_update_mutex));
+			if (cur && !cpumask_test_cpu(i, mask)) {
+				RCU_INIT_POINTER(sd->flow_limit, NULL);
+				synchronize_rcu();
+				kfree(cur);
+			} else if (!cur && cpumask_test_cpu(i, mask)) {
+				cur = kzalloc(len, GFP_KERNEL);
+				if (!cur) {
+					/* not unwinding previous changes */
+					ret = -ENOMEM;
+					goto write_unlock;
+				}
+				cur->num_buckets = netdev_flow_limit_table_len;
+				rcu_assign_pointer(sd->flow_limit, cur);
+			}
+		}
+write_unlock:
+		mutex_unlock(&flow_limit_update_mutex);
+	} else {
+		if (*ppos || !*lenp) {
+			*lenp = 0;
+			goto done;
+		}
+
+		cpumask_clear(mask);
+		rcu_read_lock();
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			if (rcu_dereference(sd->flow_limit))
+				cpumask_set_cpu(i, mask);
+		}
+		rcu_read_unlock();
+
+		len = cpumask_scnprintf(buffer, *lenp, mask);
+		*lenp = len + 1;
+		*ppos += len + 1;
+	}
+
+done:
+	free_cpumask_var(mask);
+	return ret;
+}
+
+static int flow_limit_table_len_sysctl(ctl_table *table, int write,
+				       void __user *buffer, size_t *lenp,
+				       loff_t *ppos)
+{
+	unsigned int old, *ptr;
+	int ret;
+
+	mutex_lock(&flow_limit_update_mutex);
+
+	ptr = table->data;
+	old = *ptr;
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write && !is_power_of_2(*ptr)) {
+		*ptr = old;
+		ret = -EINVAL;
+	}
+
+	mutex_unlock(&flow_limit_update_mutex);
+	return ret;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
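The writer side above follows the standard RCU publish/retire pattern: updates serialize on flow_limit_update_mutex, a new table is published with rcu_assign_pointer(), and a retired table is first unlinked with RCU_INIT_POINTER(..., NULL) and only kfree()d after synchronize_rcu(), so a CPU concurrently reading the pointer in skb_flow_limit() can never touch freed memory. One consequence worth noting: flow_limit_table_len_sysctl() only changes the integer, so a new length applies only to tables allocated afterwards; resizing an already-enabled CPU means clearing and re-setting its bit in the bitmap.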
@@ -180,6 +270,20 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= rps_sock_flow_sysctl
 	},
 #endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+	{
+		.procname	= "flow_limit_cpu_bitmap",
+		.mode		= 0644,
+		.proc_handler	= flow_limit_cpu_sysctl
+	},
+	{
+		.procname	= "flow_limit_table_len",
+		.data		= &netdev_flow_limit_table_len,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= flow_limit_table_len_sysctl
+	},
+#endif /* CONFIG_NET_FLOW_LIMIT */
 #endif /* CONFIG_NET */
 	{
 		.procname	= "netdev_budget",
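Taken together, the knobs are used as follows (an illustrative invocation, not part of the patch): write a hex CPU mask to /proc/sys/net/core/flow_limit_cpu_bitmap, e.g. "f" to enable flow limiting on CPUs 0-3 (the mask is parsed by cpumask_parse_user() above), after optionally setting /proc/sys/net/core/flow_limit_table_len. The length must be a power of two because skb_flow_limit() selects buckets with hash & (num_buckets - 1); flow_limit_table_len_sysctl() rejects anything else with -EINVAL.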