-rw-r--r--   Documentation/kernel-parameters.txt   11
-rw-r--r--   include/linux/workqueue.h               9
-rw-r--r--   kernel/workqueue.c                     74
-rw-r--r--   lib/Kconfig.debug                      15
4 files changed, 98 insertions, 11 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 551ecf09c8dd..9a53c929f017 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -4235,6 +4235,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			The default value of this parameter is determined by
 			the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT.
 
+	workqueue.debug_force_rr_cpu
+			Workqueue used to implicitly guarantee that work
+			items queued without explicit CPU specified are put
+			on the local CPU.  This guarantee is no longer true
+			and while local CPU is still preferred work items
+			may be put on foreign CPUs.  This debug option
+			forces round-robin CPU selection to flush out
+			usages which depend on the now broken guarantee.
+			When enabled, memory and cache locality will be
+			impacted.
+
 	x2apic_phys	[X86-64,APIC] Use x2apic physical mode instead of
 			default x2apic cluster mode on platforms
 			supporting x2apic.
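
Note: besides the boot parameter above, the knob is a writable (0644) module
parameter, so it can be flipped at runtime.  A minimal userspace sketch,
assuming the conventional sysfs path that module_param_named() creates for
built-in workqueue code:

	#include <stdio.h>

	int main(void)
	{
		/* Path assumed from module_param_named(debug_force_rr_cpu, ...)
		 * in the built-in kernel/workqueue.c. */
		const char *path =
			"/sys/module/workqueue/parameters/debug_force_rr_cpu";
		FILE *f = fopen(path, "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fputs("Y\n", f);	/* "N" turns forced round-robin back off */
		fclose(f);
		return 0;
	}

Booting with workqueue.debug_force_rr_cpu=1 has the same effect without
touching sysfs.
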
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 0e32bc71245e..ca73c503b92a 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -311,6 +311,7 @@ enum {
 
 	__WQ_DRAINING		= 1 << 16, /* internal: workqueue is draining */
 	__WQ_ORDERED		= 1 << 17, /* internal: workqueue is ordered */
+	__WQ_LEGACY		= 1 << 18, /* internal: create*_workqueue() */
 
 	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
 	WQ_MAX_UNBOUND_PER_CPU	= 4,	  /* 4 * #cpus for unbound wq */
@@ -411,12 +412,12 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active,
 	alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args)
 
 #define create_workqueue(name)						\
-	alloc_workqueue("%s", WQ_MEM_RECLAIM, 1, (name))
+	alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))
 #define create_freezable_workqueue(name)				\
-	alloc_workqueue("%s", WQ_FREEZABLE | WQ_UNBOUND | WQ_MEM_RECLAIM, \
-			1, (name))
+	alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND |	\
+			WQ_MEM_RECLAIM, 1, (name))
 #define create_singlethread_workqueue(name)				\
-	alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM, name)
+	alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
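
Note: to see what the updated wrappers now hand to alloc_workqueue(), here is
a small userspace sketch with alloc_workqueue() mocked out.  __WQ_LEGACY's
value comes from this patch; WQ_MEM_RECLAIM's value (1 << 3) is an assumption
about the surrounding enum, and the workqueue name is made up.

	#include <stdio.h>

	#define WQ_MEM_RECLAIM	(1 << 3)	/* assumed value from the header */
	#define __WQ_LEGACY	(1 << 18)	/* new internal flag from this patch */

	/* Stand-in for the kernel's alloc_workqueue(); it only reports the
	 * flags the wrapper macro passes down. */
	static void *alloc_workqueue(const char *fmt, unsigned int flags,
				     int max_active, const char *name)
	{
		(void)fmt;
		printf("%s: flags=%#x max_active=%d legacy=%s\n", name, flags,
		       max_active, (flags & __WQ_LEGACY) ? "yes" : "no");
		return NULL;
	}

	/* Mirrors the patched wrapper: __WQ_LEGACY is ORed in so the core can
	 * recognise workqueues created through the old interface. */
	#define create_workqueue(name) \
		alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))

	int main(void)
	{
		create_workqueue("example");	/* prints flags=0x40008, legacy=yes */
		return 0;
	}
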
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 61a0264e28f9..7ff5dc7d2ac5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -301,7 +301,23 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
-static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
+/* PL: allowable cpus for unbound wqs and work items */
+static cpumask_var_t wq_unbound_cpumask;
+
+/* CPU where unbound work was last round robin scheduled from this CPU */
+static DEFINE_PER_CPU(int, wq_rr_cpu_last);
+
+/*
+ * Local execution of unbound work items is no longer guaranteed.  The
+ * following always forces round-robin CPU selection on unbound work items
+ * to uncover usages which depend on it.
+ */
+#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
+static bool wq_debug_force_rr_cpu = true;
+#else
+static bool wq_debug_force_rr_cpu = false;
+#endif
+module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
 
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
@@ -570,6 +586,16 @@ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
 						  int node)
 {
 	assert_rcu_or_wq_mutex_or_pool_mutex(wq);
+
+	/*
+	 * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
+	 * delayed item is pending.  The plan is to keep CPU -> NODE
+	 * mapping valid and stable across CPU on/offlines.  Once that
+	 * happens, this workaround can be removed.
+	 */
+	if (unlikely(node == NUMA_NO_NODE))
+		return wq->dfl_pwq;
+
 	return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
 }
 
@@ -1298,6 +1324,39 @@ static bool is_chained_work(struct workqueue_struct *wq)
 	return worker && worker->current_pwq->wq == wq;
 }
 
+/*
+ * When queueing an unbound work item to a wq, prefer local CPU if allowed
+ * by wq_unbound_cpumask.  Otherwise, round robin among the allowed ones to
+ * avoid perturbing sensitive tasks.
+ */
+static int wq_select_unbound_cpu(int cpu)
+{
+	static bool printed_dbg_warning;
+	int new_cpu;
+
+	if (likely(!wq_debug_force_rr_cpu)) {
+		if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
+			return cpu;
+	} else if (!printed_dbg_warning) {
+		pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n");
+		printed_dbg_warning = true;
+	}
+
+	if (cpumask_empty(wq_unbound_cpumask))
+		return cpu;
+
+	new_cpu = __this_cpu_read(wq_rr_cpu_last);
+	new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
+	if (unlikely(new_cpu >= nr_cpu_ids)) {
+		new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
+		if (unlikely(new_cpu >= nr_cpu_ids))
+			return cpu;
+	}
+	__this_cpu_write(wq_rr_cpu_last, new_cpu);
+
+	return new_cpu;
+}
+
 static void __queue_work(int cpu, struct workqueue_struct *wq,
 			 struct work_struct *work)
 {
@@ -1323,7 +1382,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
 		return;
 retry:
 	if (req_cpu == WORK_CPU_UNBOUND)
-		cpu = raw_smp_processor_id();
+		cpu = wq_select_unbound_cpu(raw_smp_processor_id());
 
 	/* pwq which will be used unless @work is executing elsewhere */
 	if (!(wq->flags & WQ_UNBOUND))
@@ -1464,13 +1523,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 	timer_stats_timer_set_start_info(&dwork->timer);
 
 	dwork->wq = wq;
-	/* timer isn't guaranteed to run in this cpu, record earlier */
-	if (cpu == WORK_CPU_UNBOUND)
-		cpu = raw_smp_processor_id();
 	dwork->cpu = cpu;
 	timer->expires = jiffies + delay;
 
-	add_timer_on(timer, cpu);
+	if (unlikely(cpu != WORK_CPU_UNBOUND))
+		add_timer_on(timer, cpu);
+	else
+		add_timer(timer);
 }
 
 /**
@@ -2355,7 +2414,8 @@ static void check_flush_dependency(struct workqueue_struct *target_wq,
 	WARN_ONCE(current->flags & PF_MEMALLOC,
 		  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf",
 		  current->pid, current->comm, target_wq->name, target_func);
-	WARN_ONCE(worker && (worker->current_pwq->wq->flags & WQ_MEM_RECLAIM),
+	WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
+			      (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
 		  "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf",
 		  worker->current_pwq->wq->name, worker->current_func,
 		  target_wq->name, target_func);
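
Note: for reference, a standalone userspace sketch of the round-robin walk in
wq_select_unbound_cpu() above.  cpumasks are modeled as plain bitmasks, the
CPU layout is made up, and the per-CPU wq_rr_cpu_last plus the local-CPU
preference and wq_debug_force_rr_cpu check are collapsed into a single
variable; only the skip-offline-and-wrap behaviour of the
cpumask_next_and()/cpumask_first_and() sequence is shown.

	#include <stdio.h>

	#define NR_CPUS 8

	/* Analogue of cpumask_next_and(): first CPU after 'prev' set in both
	 * masks, or NR_CPUS if there is none. */
	static int next_cpu_and(int prev, unsigned int a, unsigned int b)
	{
		for (int cpu = prev + 1; cpu < NR_CPUS; cpu++)
			if ((a & b) & (1u << cpu))
				return cpu;
		return NR_CPUS;
	}

	int main(void)
	{
		unsigned int wq_unbound_cpumask = 0x0f;	/* CPUs 0-3 allowed */
		unsigned int cpu_online_mask    = 0x0d;	/* CPU 1 offline */
		int last = -1;				/* models wq_rr_cpu_last */

		for (int i = 0; i < 6; i++) {
			int cpu = next_cpu_and(last, wq_unbound_cpumask,
					       cpu_online_mask);
			if (cpu >= NR_CPUS)	/* wrap, like cpumask_first_and() */
				cpu = next_cpu_and(-1, wq_unbound_cpumask,
						   cpu_online_mask);
			last = cpu;
			printf("queue %d -> CPU %d\n", i, cpu);
		}
		return 0;
	}

This walks CPUs 0, 2, 3, then wraps back to 0, skipping the offline CPU, which
is the spreading behaviour the debug option forces for every unbound queueing.
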
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ecb9e75614bf..8bfd1aca7a3d 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1400,6 +1400,21 @@ config RCU_EQS_DEBUG
 
 endmenu # "RCU Debugging"
 
+config DEBUG_WQ_FORCE_RR_CPU
+	bool "Force round-robin CPU selection for unbound work items"
+	depends on DEBUG_KERNEL
+	default n
+	help
+	  Workqueue used to implicitly guarantee that work items queued
+	  without explicit CPU specified are put on the local CPU.  This
+	  guarantee is no longer true and while local CPU is still
+	  preferred work items may be put on foreign CPUs.  Kernel
+	  parameter "workqueue.debug_force_rr_cpu" is added to force
+	  round-robin CPU selection to flush out usages which depend on the
+	  now broken guarantee.  This config option enables the debug
+	  feature by default.  When enabled, memory and cache locality will
+	  be impacted.
+
 config DEBUG_BLOCK_EXT_DEVT
 	bool "Force extended block device numbers and spread them"
 	depends on DEBUG_KERNEL
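
Closing note on the check_flush_dependency() hunk in kernel/workqueue.c: the
reworked condition warns only when WQ_MEM_RECLAIM is set and __WQ_LEGACY is
not, so workqueues made through the legacy create*_workqueue() wrappers, which
always carry WQ_MEM_RECLAIM whether or not they actually sit on the reclaim
path, no longer trip the flush-dependency warning.  A tiny sketch of that flag
test, with WQ_MEM_RECLAIM's value (1 << 3) again assumed from the header:

	#include <stdbool.h>
	#include <stdio.h>

	#define WQ_MEM_RECLAIM	(1 << 3)	/* assumed value from the header */
	#define __WQ_LEGACY	(1 << 18)	/* new internal flag from this patch */

	/* Mirrors the masked comparison added to check_flush_dependency(). */
	static bool should_warn(unsigned int flags)
	{
		return (flags & (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM;
	}

	int main(void)
	{
		printf("explicit WQ_MEM_RECLAIM wq: %d\n",
		       should_warn(WQ_MEM_RECLAIM));			/* 1 */
		printf("legacy create_workqueue() wq: %d\n",
		       should_warn(__WQ_LEGACY | WQ_MEM_RECLAIM));	/* 0 */
		printf("plain wq: %d\n", should_warn(0));		/* 0 */
		return 0;
	}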
