drop_monitor: don't sleep in atomic context

drop_monitor calls several sleeping functions while in atomic context.

BUG: sleeping function called from invalid context at mm/slub.c:943
in_atomic(): 1, irqs_disabled(): 0, pid: 2103, name: kworker/0:2
Pid: 2103, comm: kworker/0:2 Not tainted 3.5.0-rc1+ #55
Call Trace:
 [<ffffffff810697ca>] __might_sleep+0xca/0xf0
 [<ffffffff811345a3>] kmem_cache_alloc_node+0x1b3/0x1c0
 [<ffffffff8105578c>] ? queue_delayed_work_on+0x11c/0x130
 [<ffffffff815343fb>] __alloc_skb+0x4b/0x230
 [<ffffffffa00b0360>] ? reset_per_cpu_data+0x160/0x160 [drop_monitor]
 [<ffffffffa00b022f>] reset_per_cpu_data+0x2f/0x160 [drop_monitor]
 [<ffffffffa00b03ab>] send_dm_alert+0x4b/0xb0 [drop_monitor]
 [<ffffffff810568e0>] process_one_work+0x130/0x4c0
 [<ffffffff81058249>] worker_thread+0x159/0x360
 [<ffffffff810580f0>] ? manage_workers.isra.27+0x240/0x240
 [<ffffffff8105d403>] kthread+0x93/0xa0
 [<ffffffff816be6d4>] kernel_thread_helper+0x4/0x10
 [<ffffffff8105d370>] ? kthread_freezable_should_stop+0x80/0x80
 [<ffffffff816be6d0>] ? gs_change+0xb/0xb

Rework the logic to call the sleeping functions in the right context.

Use the standard timer/workqueue API to let the system choose any cpu to
perform the allocation and netlink send.

Also avoid a loop if reset_per_cpu_data() cannot allocate memory: use
mod_timer() to wait 1/10 second before the next try.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Reviewed-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
author: Eric Dumazet <edumazet@google.com> 2012-06-03 20:18:19 -0400
committer: David S. Miller <davem@davemloft.net> 2012-06-04 11:42:01 -0400
commit: bec4596b4e6770c7037f21f6bd27567b152dc0d6 (patch)
tree: efd668e2e2868d0059e1e02de2c0849ba65b9f62
parent: f8f5701bdaf9134b1f90e5044a82c66324d2073f (diff)
1 files changed, 33 insertions, 69 deletions
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index ea5fb9fcc3f..d23b6682f4e 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -36,9 +36,6 @@
 #define TRACE_ON 1
 #define TRACE_OFF 0
-static void send_dm_alert(struct work_struct *unused);
 /*
 * Globals, our netlink socket pointer
 * and the work handle that will send up
@@ -48,11 +45,10 @@ static int trace_state = TRACE_OFF;
 static DEFINE_MUTEX(trace_state_mutex);
 struct per_cpu_dm_data {
-        struct work_struct dm_alert_work;
+        spinlock_t              lock;
-        struct sk_buff __rcu *skb;
+        struct sk_buff          *skb;
-        atomic_t dm_hit_count;
+        struct work_struct      dm_alert_work;
-        struct timer_list send_timer;
+        struct timer_list       send_timer;
-        int cpu;
 };
 struct dm_hw_stat_delta {
@@ -78,13 +74,13 @@ static int dm_delay = 1;
 static unsigned long dm_hw_check_delta = 2*HZ;
 static LIST_HEAD(hw_stats_list);
-static void reset_per_cpu_data(struct per_cpu_dm_data *data)
+static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
 {
        size_t al;
        struct net_dm_alert_msg *msg;
        struct nlattr *nla;
        struct sk_buff *skb;
-        struct sk_buff *oskb = rcu_dereference_protected(data->skb, 1);
+        unsigned long flags;
        al = sizeof(struct net_dm_alert_msg);
        al += dm_hit_limit * sizeof(struct net_dm_drop_point);
@@ -99,65 +95,40 @@ static void reset_per_cpu_data(struct per_cpu_dm_data *data)
                                  sizeof(struct net_dm_alert_msg));
                msg = nla_data(nla);
                memset(msg, 0, al);
-        } else
+        } else {
-                schedule_work_on(data->cpu, &data->dm_alert_work);
+                mod_timer(&data->send_timer, jiffies + HZ / 10);
-        /*
-         * Don't need to lock this, since we are guaranteed to only
-         * run this on a single cpu at a time.
-         * Note also that we only update data->skb if the old and new skb
-         * pointers don't match.  This ensures that we don't continually call
-         * synchornize_rcu if we repeatedly fail to alloc a new netlink message.
-         */
-        if (skb != oskb) {
-                rcu_assign_pointer(data->skb, skb);
-                synchronize_rcu();
-                atomic_set(&data->dm_hit_count, dm_hit_limit);
        }
+        spin_lock_irqsave(&data->lock, flags);
+        swap(data->skb, skb);
+        spin_unlock_irqrestore(&data->lock, flags);
+        return skb;
 }
-static void send_dm_alert(struct work_struct *unused)
+static void send_dm_alert(struct work_struct *work)
 {
        struct sk_buff *skb;
-        struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);
+        struct per_cpu_dm_data *data;
-        WARN_ON_ONCE(data->cpu != smp_processor_id());
+        data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
-        /*
+        skb = reset_per_cpu_data(data);
-         * Grab the skb we're about to send
-         */
-        skb = rcu_dereference_protected(data->skb, 1);
-        /*
-         * Replace it with a new one
-         */
-        reset_per_cpu_data(data);
-        /*
-         * Ship it!
-         */
        if (skb)
                genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL);
-        put_cpu_var(dm_cpu_data);
 }
 /*
 * This is the timer function to delay the sending of an alert
 * in the event that more drops will arrive during the
- * hysteresis period.  Note that it operates under the timer interrupt
+ * hysteresis period.
- * so we don't need to disable preemption here
 */
-static void sched_send_work(unsigned long unused)
+static void sched_send_work(unsigned long _data)
 {
-        struct per_cpu_dm_data *data =  &get_cpu_var(dm_cpu_data);
+        struct per_cpu_dm_data *data = (struct per_cpu_dm_data *)_data;
-        schedule_work_on(smp_processor_id(), &data->dm_alert_work);
-        put_cpu_var(dm_cpu_data);
+        schedule_work(&data->dm_alert_work);
 }
 static void trace_drop_common(struct sk_buff *skb, void *location)
@@ -167,33 +138,28 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
        struct nlattr *nla;
        int i;
        struct sk_buff *dskb;
-        struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);
+        struct per_cpu_dm_data *data;
+        unsigned long flags;
-        rcu_read_lock();
+        local_irq_save(flags);
-        dskb = rcu_dereference(data->skb);
+        data = &__get_cpu_var(dm_cpu_data);
+        spin_lock(&data->lock);
+        dskb = data->skb;
        if (!dskb)
                goto out;
-        if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) {
-                /*
-                 * we're already at zero, discard this hit
-                 */
-                goto out;
-        }
        nlh = (struct nlmsghdr *)dskb->data;
        nla = genlmsg_data(nlmsg_data(nlh));
        msg = nla_data(nla);
        for (i = 0; i < msg->entries; i++) {
                if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
                        msg->points[i].count++;
-                        atomic_inc(&data->dm_hit_count);
                        goto out;
                }
        }
+        if (msg->entries == dm_hit_limit)
+                goto out;
        /*
         * We need to create a new entry
         */
@@ -205,13 +171,11 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
        if (!timer_pending(&data->send_timer)) {
                data->send_timer.expires = jiffies + dm_delay * HZ;
-                add_timer_on(&data->send_timer, smp_processor_id());
+                add_timer(&data->send_timer);
        }
 out:
-        rcu_read_unlock();
+        spin_unlock_irqrestore(&data->lock, flags);
-        put_cpu_var(dm_cpu_data);
-        return;
 }
 static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
@@ -418,11 +382,11 @@ static int __init init_net_drop_monitor(void)
        for_each_possible_cpu(cpu) {
                data = &per_cpu(dm_cpu_data, cpu);
-                data->cpu = cpu;
                INIT_WORK(&data->dm_alert_work, send_dm_alert);
                init_timer(&data->send_timer);
-                data->send_timer.data = cpu;
+                data->send_timer.data = (unsigned long)data;
                data->send_timer.function = sched_send_work;
+                spin_lock_init(&data->lock);
                reset_per_cpu_data(data);
        }
author	Eric Dumazet <edumazet@google.com>	2012-06-03 20:18:19 -0400
committer	David S. Miller <davem@davemloft.net>	2012-06-04 11:42:01 -0400
commit	bec4596b4e6770c7037f21f6bd27567b152dc0d6 (patch)
tree	efd668e2e2868d0059e1e02de2c0849ba65b9f62
parent	f8f5701bdaf9134b1f90e5044a82c66324d2073f (diff)

diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index ea5fb9fcc3f..d23b6682f4e 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c
@@ -36,9 +36,6 @@
36	#define TRACE_ON 1	36	#define TRACE_ON 1
37	#define TRACE_OFF 0	37	#define TRACE_OFF 0
38		38
39	static void send_dm_alert(struct work_struct *unused);
40
41
42	/*	39	/*
43	* Globals, our netlink socket pointer	40	* Globals, our netlink socket pointer
44	* and the work handle that will send up	41	* and the work handle that will send up
@@ -48,11 +45,10 @@ static int trace_state = TRACE_OFF;
48	static DEFINE_MUTEX(trace_state_mutex);	45	static DEFINE_MUTEX(trace_state_mutex);
49		46
50	struct per_cpu_dm_data {	47	struct per_cpu_dm_data {
51	struct work_struct dm_alert_work;	48	spinlock_t lock;
52	struct sk_buff __rcu *skb;	49	struct sk_buff *skb;
53	atomic_t dm_hit_count;	50	struct work_struct dm_alert_work;
54	struct timer_list send_timer;	51	struct timer_list send_timer;
55	int cpu;
56	};	52	};
57		53
58	struct dm_hw_stat_delta {	54	struct dm_hw_stat_delta {
@@ -78,13 +74,13 @@ static int dm_delay = 1;
78	static unsigned long dm_hw_check_delta = 2*HZ;	74	static unsigned long dm_hw_check_delta = 2*HZ;
79	static LIST_HEAD(hw_stats_list);	75	static LIST_HEAD(hw_stats_list);
80		76
81	static void reset_per_cpu_data(struct per_cpu_dm_data *data)	77	static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
82	{	78	{
83	size_t al;	79	size_t al;
84	struct net_dm_alert_msg *msg;	80	struct net_dm_alert_msg *msg;
85	struct nlattr *nla;	81	struct nlattr *nla;
86	struct sk_buff *skb;	82	struct sk_buff *skb;
87	struct sk_buff *oskb = rcu_dereference_protected(data->skb, 1);	83	unsigned long flags;
88		84
89	al = sizeof(struct net_dm_alert_msg);	85	al = sizeof(struct net_dm_alert_msg);
90	al += dm_hit_limit * sizeof(struct net_dm_drop_point);	86	al += dm_hit_limit * sizeof(struct net_dm_drop_point);
@@ -99,65 +95,40 @@ static void reset_per_cpu_data(struct per_cpu_dm_data *data)
99	sizeof(struct net_dm_alert_msg));	95	sizeof(struct net_dm_alert_msg));
100	msg = nla_data(nla);	96	msg = nla_data(nla);
101	memset(msg, 0, al);	97	memset(msg, 0, al);
102	} else	98	} else {
103	schedule_work_on(data->cpu, &data->dm_alert_work);	99	mod_timer(&data->send_timer, jiffies + HZ / 10);
104
105	/*
106	* Don't need to lock this, since we are guaranteed to only
107	* run this on a single cpu at a time.
108	* Note also that we only update data->skb if the old and new skb
109	* pointers don't match. This ensures that we don't continually call
110	* synchornize_rcu if we repeatedly fail to alloc a new netlink message.
111	*/
112	if (skb != oskb) {
113	rcu_assign_pointer(data->skb, skb);
114
115	synchronize_rcu();
116
117	atomic_set(&data->dm_hit_count, dm_hit_limit);
118	}	100	}
119		101
		102	spin_lock_irqsave(&data->lock, flags);
		103	swap(data->skb, skb);
		104	spin_unlock_irqrestore(&data->lock, flags);
		105
		106	return skb;
120	}	107	}
121		108
122	static void send_dm_alert(struct work_struct *unused)	109	static void send_dm_alert(struct work_struct *work)
123	{	110	{
124	struct sk_buff *skb;	111	struct sk_buff *skb;
125	struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);	112	struct per_cpu_dm_data *data;
126		113
127	WARN_ON_ONCE(data->cpu != smp_processor_id());	114	data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
128		115
129	/*	116	skb = reset_per_cpu_data(data);
130	* Grab the skb we're about to send
131	*/
132	skb = rcu_dereference_protected(data->skb, 1);
133
134	/*
135	* Replace it with a new one
136	*/
137	reset_per_cpu_data(data);
138		117
139	/*
140	* Ship it!
141	*/
142	if (skb)	118	if (skb)
143	genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL);	119	genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL);
144
145	put_cpu_var(dm_cpu_data);
146	}	120	}
147		121
148	/*	122	/*
149	* This is the timer function to delay the sending of an alert	123	* This is the timer function to delay the sending of an alert
150	* in the event that more drops will arrive during the	124	* in the event that more drops will arrive during the
151	* hysteresis period. Note that it operates under the timer interrupt	125	* hysteresis period.
152	* so we don't need to disable preemption here
153	*/	126	*/
154	static void sched_send_work(unsigned long unused)	127	static void sched_send_work(unsigned long _data)
155	{	128	{
156	struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);	129	struct per_cpu_dm_data *data = (struct per_cpu_dm_data *)_data;
157
158	schedule_work_on(smp_processor_id(), &data->dm_alert_work);
159		130
160	put_cpu_var(dm_cpu_data);	131	schedule_work(&data->dm_alert_work);
161	}	132	}
162		133
163	static void trace_drop_common(struct sk_buff *skb, void *location)	134	static void trace_drop_common(struct sk_buff *skb, void *location)
@@ -167,33 +138,28 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
167	struct nlattr *nla;	138	struct nlattr *nla;
168	int i;	139	int i;
169	struct sk_buff *dskb;	140	struct sk_buff *dskb;
170	struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data);	141	struct per_cpu_dm_data *data;
171		142	unsigned long flags;
172		143
173	rcu_read_lock();	144	local_irq_save(flags);
174	dskb = rcu_dereference(data->skb);	145	data = &__get_cpu_var(dm_cpu_data);
		146	spin_lock(&data->lock);
		147	dskb = data->skb;
175		148
176	if (!dskb)	149	if (!dskb)
177	goto out;	150	goto out;
178		151
179	if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) {
180	/*
181	* we're already at zero, discard this hit
182	*/
183	goto out;
184	}
185
186	nlh = (struct nlmsghdr *)dskb->data;	152	nlh = (struct nlmsghdr *)dskb->data;
187	nla = genlmsg_data(nlmsg_data(nlh));	153	nla = genlmsg_data(nlmsg_data(nlh));
188	msg = nla_data(nla);	154	msg = nla_data(nla);
189	for (i = 0; i < msg->entries; i++) {	155	for (i = 0; i < msg->entries; i++) {
190	if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {	156	if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
191	msg->points[i].count++;	157	msg->points[i].count++;
192	atomic_inc(&data->dm_hit_count);
193	goto out;	158	goto out;
194	}	159	}
195	}	160	}
196		161	if (msg->entries == dm_hit_limit)
		162	goto out;
197	/*	163	/*
198	* We need to create a new entry	164	* We need to create a new entry
199	*/	165	*/
@@ -205,13 +171,11 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
205		171
206	if (!timer_pending(&data->send_timer)) {	172	if (!timer_pending(&data->send_timer)) {
207	data->send_timer.expires = jiffies + dm_delay * HZ;	173	data->send_timer.expires = jiffies + dm_delay * HZ;
208	add_timer_on(&data->send_timer, smp_processor_id());	174	add_timer(&data->send_timer);
209	}	175	}
210		176
211	out:	177	out:
212	rcu_read_unlock();	178	spin_unlock_irqrestore(&data->lock, flags);
213	put_cpu_var(dm_cpu_data);
214	return;
215	}	179	}
216		180
217	static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)	181	static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
@@ -418,11 +382,11 @@ static int __init init_net_drop_monitor(void)
418		382
419	for_each_possible_cpu(cpu) {	383	for_each_possible_cpu(cpu) {
420	data = &per_cpu(dm_cpu_data, cpu);	384	data = &per_cpu(dm_cpu_data, cpu);
421	data->cpu = cpu;
422	INIT_WORK(&data->dm_alert_work, send_dm_alert);	385	INIT_WORK(&data->dm_alert_work, send_dm_alert);
423	init_timer(&data->send_timer);	386	init_timer(&data->send_timer);
424	data->send_timer.data = cpu;	387	data->send_timer.data = (unsigned long)data;
425	data->send_timer.function = sched_send_work;	388	data->send_timer.function = sched_send_work;
		389	spin_lock_init(&data->lock);
426	reset_per_cpu_data(data);	390	reset_per_cpu_data(data);
427	}	391	}
428		392