aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Michal Hocko <mhocko@suse.com> 2017-04-07 19:05:05 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2017-04-08 03:47:49 -0400
commit ce612879ddc78ea7e4de4be80cba4ebf9caa07ee (patch)
tree ee47f2673091affe136dab58ae89f42ffd5eb6df
parent cdcf4330d5660998d06fcd899b443693ab3d652f (diff)
mm: move pcp and lru-pcp draining into single wq
We currently have 2 specific WQ_MEM_RECLAIM workqueues in the mm code: vmstat_wq for updating pcp stats and lru_add_drain_wq dedicated to draining per-cpu lru caches. This seems more than necessary because both can run on a single WQ. Neither blocks on locks requiring a memory allocation nor performs any allocations itself. We will save one rescuer thread this way. On the other hand drain_all_pages() queues work on the system wq which doesn't have a rescuer and so this depends on memory allocation (when all workers are stuck allocating and new ones cannot be created). Initially we thought this would be more of a theoretical problem but Hugh Dickins has reported: : 4.11-rc has been giving me hangs after hours of swapping load. At : first they looked like memory leaks ("fork: Cannot allocate memory"); : but for no good reason I happened to do "cat /proc/sys/vm/stat_refresh" : before looking at /proc/meminfo one time, and the stat_refresh stuck : in D state, waiting for completion of flush_work like many kworkers. : kthreadd waiting for completion of flush_work in drain_all_pages(). This worker should be using WQ_MEM_RECLAIM as well in order to guarantee forward progress. We can reuse the same one as for lru draining and vmstat. Link: http://lkml.kernel.org/r/20170307131751.24936-1-mhocko@kernel.org Signed-off-by: Michal Hocko <mhocko@suse.com> Suggested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Acked-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Mel Gorman <mgorman@suse.de> Tested-by: Yang Li <pku.leo@gmail.com> Tested-by: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- mm/internal.h 7
-rw-r--r-- mm/page_alloc.c 9
-rw-r--r-- mm/swap.c 27
-rw-r--r-- mm/vmstat.c 15
4 files changed, 32 insertions, 26 deletions
diff --git a/mm/internal.h b/mm/internal.h
index ccfc2a2969f4..266efaeaa370 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -481,6 +481,13 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
481enum ttu_flags; 481enum ttu_flags;
482struct tlbflush_unmap_batch; 482struct tlbflush_unmap_batch;
483 483
484
485/*
486 * only for MM internal work items which do not depend on
487 * any allocations or locks which might depend on allocations
488 */
489extern struct workqueue_struct *mm_percpu_wq;
490
484#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH 491#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
485void try_to_unmap_flush(void); 492void try_to_unmap_flush(void);
486void try_to_unmap_flush_dirty(void); 493void try_to_unmap_flush_dirty(void);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d6a665057d61..f3d603cef2c0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2373,6 +2373,13 @@ void drain_all_pages(struct zone *zone)
2373 */ 2373 */
2374 static cpumask_t cpus_with_pcps; 2374 static cpumask_t cpus_with_pcps;
2375 2375
2376 /*
2377 * Make sure nobody triggers this path before mm_percpu_wq is fully
2378 * initialized.
2379 */
2380 if (WARN_ON_ONCE(!mm_percpu_wq))
2381 return;
2382
2376 /* Workqueues cannot recurse */ 2383 /* Workqueues cannot recurse */
2377 if (current->flags & PF_WQ_WORKER) 2384 if (current->flags & PF_WQ_WORKER)
2378 return; 2385 return;
@@ -2422,7 +2429,7 @@ void drain_all_pages(struct zone *zone)
2422 for_each_cpu(cpu, &cpus_with_pcps) { 2429 for_each_cpu(cpu, &cpus_with_pcps) {
2423 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); 2430 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
2424 INIT_WORK(work, drain_local_pages_wq); 2431 INIT_WORK(work, drain_local_pages_wq);
2425 schedule_work_on(cpu, work); 2432 queue_work_on(cpu, mm_percpu_wq, work);
2426 } 2433 }
2427 for_each_cpu(cpu, &cpus_with_pcps) 2434 for_each_cpu(cpu, &cpus_with_pcps)
2428 flush_work(per_cpu_ptr(&pcpu_drain, cpu)); 2435 flush_work(per_cpu_ptr(&pcpu_drain, cpu));
diff --git a/mm/swap.c b/mm/swap.c
index c4910f14f957..5dabf444d724 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -670,30 +670,19 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
670 670
671static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); 671static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
672 672
673/*
674 * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
675 * workqueue, aiding in getting memory freed.
676 */
677static struct workqueue_struct *lru_add_drain_wq;
678
679static int __init lru_init(void)
680{
681 lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0);
682
683 if (WARN(!lru_add_drain_wq,
684 "Failed to create workqueue lru_add_drain_wq"))
685 return -ENOMEM;
686
687 return 0;
688}
689early_initcall(lru_init);
690
691void lru_add_drain_all(void) 673void lru_add_drain_all(void)
692{ 674{
693 static DEFINE_MUTEX(lock); 675 static DEFINE_MUTEX(lock);
694 static struct cpumask has_work; 676 static struct cpumask has_work;
695 int cpu; 677 int cpu;
696 678
679 /*
680 * Make sure nobody triggers this path before mm_percpu_wq is fully
681 * initialized.
682 */
683 if (WARN_ON(!mm_percpu_wq))
684 return;
685
697 mutex_lock(&lock); 686 mutex_lock(&lock);
698 get_online_cpus(); 687 get_online_cpus();
699 cpumask_clear(&has_work); 688 cpumask_clear(&has_work);
@@ -707,7 +696,7 @@ void lru_add_drain_all(void)
707 pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || 696 pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
708 need_activate_page_drain(cpu)) { 697 need_activate_page_drain(cpu)) {
709 INIT_WORK(work, lru_add_drain_per_cpu); 698 INIT_WORK(work, lru_add_drain_per_cpu);
710 queue_work_on(cpu, lru_add_drain_wq, work); 699 queue_work_on(cpu, mm_percpu_wq, work);
711 cpumask_set_cpu(cpu, &has_work); 700 cpumask_set_cpu(cpu, &has_work);
712 } 701 }
713 } 702 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 89f95396ec46..809025ed97ea 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1552,7 +1552,6 @@ static const struct file_operations proc_vmstat_file_operations = {
1552#endif /* CONFIG_PROC_FS */ 1552#endif /* CONFIG_PROC_FS */
1553 1553
1554#ifdef CONFIG_SMP 1554#ifdef CONFIG_SMP
1555static struct workqueue_struct *vmstat_wq;
1556static DEFINE_PER_CPU(struct delayed_work, vmstat_work); 1555static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1557int sysctl_stat_interval __read_mostly = HZ; 1556int sysctl_stat_interval __read_mostly = HZ;
1558 1557
@@ -1623,7 +1622,7 @@ static void vmstat_update(struct work_struct *w)
1623 * to occur in the future. Keep on running the 1622 * to occur in the future. Keep on running the
1624 * update worker thread. 1623 * update worker thread.
1625 */ 1624 */
1626 queue_delayed_work_on(smp_processor_id(), vmstat_wq, 1625 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
1627 this_cpu_ptr(&vmstat_work), 1626 this_cpu_ptr(&vmstat_work),
1628 round_jiffies_relative(sysctl_stat_interval)); 1627 round_jiffies_relative(sysctl_stat_interval));
1629 } 1628 }
@@ -1702,7 +1701,7 @@ static void vmstat_shepherd(struct work_struct *w)
1702 struct delayed_work *dw = &per_cpu(vmstat_work, cpu); 1701 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
1703 1702
1704 if (!delayed_work_pending(dw) && need_update(cpu)) 1703 if (!delayed_work_pending(dw) && need_update(cpu))
1705 queue_delayed_work_on(cpu, vmstat_wq, dw, 0); 1704 queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
1706 } 1705 }
1707 put_online_cpus(); 1706 put_online_cpus();
1708 1707
@@ -1718,7 +1717,6 @@ static void __init start_shepherd_timer(void)
1718 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), 1717 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
1719 vmstat_update); 1718 vmstat_update);
1720 1719
1721 vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
1722 schedule_delayed_work(&shepherd, 1720 schedule_delayed_work(&shepherd,
1723 round_jiffies_relative(sysctl_stat_interval)); 1721 round_jiffies_relative(sysctl_stat_interval));
1724} 1722}
@@ -1764,11 +1762,16 @@ static int vmstat_cpu_dead(unsigned int cpu)
1764 1762
1765#endif 1763#endif
1766 1764
1765struct workqueue_struct *mm_percpu_wq;
1766
1767void __init init_mm_internals(void) 1767void __init init_mm_internals(void)
1768{ 1768{
1769#ifdef CONFIG_SMP 1769 int ret __maybe_unused;
1770 int ret;
1771 1770
1771 mm_percpu_wq = alloc_workqueue("mm_percpu_wq",
1772 WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
1773
1774#ifdef CONFIG_SMP
1772 ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead", 1775 ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
1773 NULL, vmstat_cpu_dead); 1776 NULL, vmstat_cpu_dead);
1774 if (ret < 0) 1777 if (ret < 0)