Diffstat (limited to 'mm')
 -rw-r--r--  mm/backing-dev.c    |  30
 -rw-r--r--  mm/filemap.c        |   6
 -rw-r--r--  mm/memcontrol.c     | 198
 -rw-r--r--  mm/mempolicy.c      |   9
 -rw-r--r--  mm/page-writeback.c |  15
 -rw-r--r--  mm/slub.c           |   2
 -rw-r--r--  mm/vmalloc.c        |   8
 -rw-r--r--  mm/vmscan.c         |  69
 -rw-r--r--  mm/vmstat.c         |   4
 9 files changed, 76 insertions(+), 265 deletions(-)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d6edf8d14f9c..a87da524a4a0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -359,6 +359,17 @@ static unsigned long bdi_longest_inactive(void)
 	return max(5UL * 60 * HZ, interval);
 }
 
+/*
+ * Clear pending bit and wakeup anybody waiting for flusher thread creation or
+ * shutdown
+ */
+static void bdi_clear_pending(struct backing_dev_info *bdi)
+{
+	clear_bit(BDI_pending, &bdi->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&bdi->state, BDI_pending);
+}
+
 static int bdi_forker_thread(void *ptr)
 {
 	struct bdi_writeback *me = ptr;
@@ -390,6 +401,13 @@ static int bdi_forker_thread(void *ptr)
 		}
 
 		spin_lock_bh(&bdi_lock);
+		/*
+		 * In the following loop we are going to check whether we have
+		 * some work to do without any synchronization with tasks
+		 * waking us up to do work for them. So we have to set task
+		 * state already here so that we don't miss wakeups coming
+		 * after we verify some condition.
+		 */
 		set_current_state(TASK_INTERRUPTIBLE);
 
 		list_for_each_entry(bdi, &bdi_list, bdi_list) {
@@ -469,11 +487,13 @@ static int bdi_forker_thread(void *ptr)
 				spin_unlock_bh(&bdi->wb_lock);
 				wake_up_process(task);
 			}
+			bdi_clear_pending(bdi);
 			break;
 
 		case KILL_THREAD:
 			__set_current_state(TASK_RUNNING);
 			kthread_stop(task);
+			bdi_clear_pending(bdi);
 			break;
 
 		case NO_ACTION:
@@ -489,16 +509,8 @@ static int bdi_forker_thread(void *ptr)
 			else
 				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
 			try_to_freeze();
-			/* Back to the main loop */
-			continue;
+			break;
 		}
-
-		/*
-		 * Clear pending bit and wakeup anybody waiting to tear us down.
-		 */
-		clear_bit(BDI_pending, &bdi->state);
-		smp_mb__after_clear_bit();
-		wake_up_bit(&bdi->state, BDI_pending);
 	}
 
 	return 0;
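Note on the backing-dev.c hunks above: the repeated clear_bit()/smp_mb__after_clear_bit()/wake_up_bit() sequence is factored into bdi_clear_pending(), and each switch arm now clears BDI_pending itself instead of falling through to shared cleanup. A hedged sketch of the waiter side this pairs with (the demo_* names are illustrative; mainline uses bdi_sched_wait() as the action callback):

#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/backing-dev.h>

/* Illustrative action callback: give up the CPU until woken. */
static int demo_pending_wait(void *word)
{
        schedule();
        return 0;
}

/*
 * Sleep until BDI_pending is cleared. This only works because the waker
 * orders clear_bit() before wake_up_bit() with smp_mb__after_clear_bit();
 * without the barrier the waiter could sample the bit, miss the clear,
 * and sleep through the wakeup.
 */
static void demo_wait_pending_clear(struct backing_dev_info *bdi)
{
        wait_on_bit(&bdi->state, BDI_pending, demo_pending_wait,
                    TASK_UNINTERRUPTIBLE);
}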
diff --git a/mm/filemap.c b/mm/filemap.c
index 645a080ba4df..7771871fa353 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -827,13 +827,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 {
 	unsigned int i;
 	unsigned int ret;
-	unsigned int nr_found;
+	unsigned int nr_found, nr_skip;
 
 	rcu_read_lock();
 restart:
 	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
 				(void ***)pages, NULL, start, nr_pages);
 	ret = 0;
+	nr_skip = 0;
 	for (i = 0; i < nr_found; i++) {
 		struct page *page;
 repeat:
@@ -856,6 +857,7 @@ repeat:
 			 * here as an exceptional entry: so skip over it -
 			 * we only reach this from invalidate_mapping_pages().
 			 */
+			nr_skip++;
 			continue;
 		}
 
@@ -876,7 +878,7 @@ repeat:
 	 * If all entries were removed before we could secure them,
 	 * try again, because callers stop trying once 0 is returned.
 	 */
-	if (unlikely(!ret && nr_found))
+	if (unlikely(!ret && nr_found > nr_skip))
 		goto restart;
 	rcu_read_unlock();
 	return ret;
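Note on the filemap.c hunks: the radix tree gang lookup can return slots holding shmem swap entries rather than struct pages, so ret can legitimately stay 0 while nr_found is nonzero; counting those slots in nr_skip keeps the restart path from looping forever. Callers depend on a 0 return to terminate, roughly as in this hedged sketch of a consumer loop (demo_walk_mapping is illustrative; pagevec_lookup() is a thin wrapper around find_get_pages()):

#include <linux/pagemap.h>
#include <linux/pagevec.h>

static void demo_walk_mapping(struct address_space *mapping)
{
        struct pagevec pvec;
        pgoff_t index = 0;
        unsigned int i, nr;

        pagevec_init(&pvec, 0);
        /* A return of 0 must mean "no more pages", or this never ends. */
        while ((nr = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE))) {
                for (i = 0; i < nr; i++)
                        index = pvec.pages[i]->index + 1;
                pagevec_release(&pvec);
        }
}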
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 930de9437271..3508777837c7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -204,50 +204,6 @@ struct mem_cgroup_eventfd_list {
 static void mem_cgroup_threshold(struct mem_cgroup *mem);
 static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
 
-enum {
-	SCAN_BY_LIMIT,
-	SCAN_BY_SYSTEM,
-	NR_SCAN_CONTEXT,
-	SCAN_BY_SHRINK,	/* not recorded now */
-};
-
-enum {
-	SCAN,
-	SCAN_ANON,
-	SCAN_FILE,
-	ROTATE,
-	ROTATE_ANON,
-	ROTATE_FILE,
-	FREED,
-	FREED_ANON,
-	FREED_FILE,
-	ELAPSED,
-	NR_SCANSTATS,
-};
-
-struct scanstat {
-	spinlock_t	lock;
-	unsigned long	stats[NR_SCAN_CONTEXT][NR_SCANSTATS];
-	unsigned long	rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS];
-};
-
-const char *scanstat_string[NR_SCANSTATS] = {
-	"scanned_pages",
-	"scanned_anon_pages",
-	"scanned_file_pages",
-	"rotated_pages",
-	"rotated_anon_pages",
-	"rotated_file_pages",
-	"freed_pages",
-	"freed_anon_pages",
-	"freed_file_pages",
-	"elapsed_ns",
-};
-#define SCANSTAT_WORD_LIMIT	"_by_limit"
-#define SCANSTAT_WORD_SYSTEM	"_by_system"
-#define SCANSTAT_WORD_HIERARCHY	"_under_hierarchy"
-
-
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -313,8 +269,7 @@ struct mem_cgroup {
 
 	/* For oom notifier event fd */
 	struct list_head oom_notify;
-	/* For recording LRU-scan statistics */
-	struct scanstat scanstat;
+
 	/*
 	 * Should we move charges of a task when a task is moved into this
 	 * mem_cgroup ? And what type of charges should we move ?
@@ -1678,44 +1633,6 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
 }
 #endif
 
-static void __mem_cgroup_record_scanstat(unsigned long *stats,
-					 struct memcg_scanrecord *rec)
-{
-
-	stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1];
-	stats[SCAN_ANON] += rec->nr_scanned[0];
-	stats[SCAN_FILE] += rec->nr_scanned[1];
-
-	stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1];
-	stats[ROTATE_ANON] += rec->nr_rotated[0];
-	stats[ROTATE_FILE] += rec->nr_rotated[1];
-
-	stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1];
-	stats[FREED_ANON] += rec->nr_freed[0];
-	stats[FREED_FILE] += rec->nr_freed[1];
-
-	stats[ELAPSED] += rec->elapsed;
-}
-
-static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec)
-{
-	struct mem_cgroup *mem;
-	int context = rec->context;
-
-	if (context >= NR_SCAN_CONTEXT)
-		return;
-
-	mem = rec->mem;
-	spin_lock(&mem->scanstat.lock);
-	__mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec);
-	spin_unlock(&mem->scanstat.lock);
-
-	mem = rec->root;
-	spin_lock(&mem->scanstat.lock);
-	__mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec);
-	spin_unlock(&mem->scanstat.lock);
-}
-
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1740,9 +1657,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
 	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
 	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
-	struct memcg_scanrecord rec;
 	unsigned long excess;
-	unsigned long scanned;
+	unsigned long nr_scanned;
 
 	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
@@ -1750,15 +1666,6 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	if (!check_soft && !shrink && root_mem->memsw_is_minimum)
 		noswap = true;
 
-	if (shrink)
-		rec.context = SCAN_BY_SHRINK;
-	else if (check_soft)
-		rec.context = SCAN_BY_SYSTEM;
-	else
-		rec.context = SCAN_BY_LIMIT;
-
-	rec.root = root_mem;
-
 	while (1) {
 		victim = mem_cgroup_select_victim(root_mem);
 		if (victim == root_mem) {
@@ -1799,23 +1706,14 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 			css_put(&victim->css);
 			continue;
 		}
-		rec.mem = victim;
-		rec.nr_scanned[0] = 0;
-		rec.nr_scanned[1] = 0;
-		rec.nr_rotated[0] = 0;
-		rec.nr_rotated[1] = 0;
-		rec.nr_freed[0] = 0;
-		rec.nr_freed[1] = 0;
-		rec.elapsed = 0;
 		/* we use swappiness of local cgroup */
 		if (check_soft) {
 			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
-				noswap, zone, &rec, &scanned);
-			*total_scanned += scanned;
+				noswap, zone, &nr_scanned);
+			*total_scanned += nr_scanned;
 		} else
 			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
-						noswap, &rec);
-		mem_cgroup_record_scanstat(&rec);
+						noswap);
 		css_put(&victim->css);
 		/*
 		 * At shrinking usage, we can't check we should stop here or
@@ -1841,29 +1739,23 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
  */
 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
 {
-	int lock_count = -1;
 	struct mem_cgroup *iter, *failed = NULL;
 	bool cond = true;
 
 	for_each_mem_cgroup_tree_cond(iter, mem, cond) {
-		bool locked = iter->oom_lock;
-
-		iter->oom_lock = true;
-		if (lock_count == -1)
-			lock_count = iter->oom_lock;
-		else if (lock_count != locked) {
+		if (iter->oom_lock) {
 			/*
 			 * this subtree of our hierarchy is already locked
 			 * so we cannot give a lock.
 			 */
-			lock_count = 0;
 			failed = iter;
 			cond = false;
-		}
+		} else
+			iter->oom_lock = true;
 	}
 
 	if (!failed)
-		goto done;
+		return true;
 
 	/*
 	 * OK, we failed to lock the whole subtree so we have to clean up
@@ -1877,8 +1769,7 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
 		}
 		iter->oom_lock = false;
 	}
-done:
-	return lock_count;
+	return false;
 }
 
 /*
@@ -2169,13 +2060,7 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
 
 	/* Notify other cpus that system-wide "drain" is running */
 	get_online_cpus();
-	/*
-	 * Get a hint for avoiding draining charges on the current cpu,
-	 * which must be exhausted by our charging. It is not required that
-	 * this be a precise check, so we use raw_smp_processor_id() instead of
-	 * getcpu()/putcpu().
-	 */
-	curcpu = raw_smp_processor_id();
+	curcpu = get_cpu();
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
 		struct mem_cgroup *mem;
@@ -2192,6 +2077,7 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
 			schedule_work_on(cpu, &stock->work);
 		}
 	}
+	put_cpu();
 
 	if (!sync)
 		goto out;
@@ -3866,18 +3752,14 @@ try_to_free:
 	/* try to free all pages in this cgroup */
 	shrink = 1;
 	while (nr_retries && mem->res.usage > 0) {
-		struct memcg_scanrecord rec;
 		int progress;
 
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			goto out;
 		}
-		rec.context = SCAN_BY_SHRINK;
-		rec.mem = mem;
-		rec.root = mem;
 		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
-						false, &rec);
+						false);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
@@ -4721,54 +4603,6 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
 }
 #endif /* CONFIG_NUMA */
 
-static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp,
-				       struct cftype *cft,
-				       struct cgroup_map_cb *cb)
-{
-	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
-	char string[64];
-	int i;
-
-	for (i = 0; i < NR_SCANSTATS; i++) {
-		strcpy(string, scanstat_string[i]);
-		strcat(string, SCANSTAT_WORD_LIMIT);
-		cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]);
-	}
-
-	for (i = 0; i < NR_SCANSTATS; i++) {
-		strcpy(string, scanstat_string[i]);
-		strcat(string, SCANSTAT_WORD_SYSTEM);
-		cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]);
-	}
-
-	for (i = 0; i < NR_SCANSTATS; i++) {
-		strcpy(string, scanstat_string[i]);
-		strcat(string, SCANSTAT_WORD_LIMIT);
-		strcat(string, SCANSTAT_WORD_HIERARCHY);
-		cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]);
-	}
-	for (i = 0; i < NR_SCANSTATS; i++) {
-		strcpy(string, scanstat_string[i]);
-		strcat(string, SCANSTAT_WORD_SYSTEM);
-		strcat(string, SCANSTAT_WORD_HIERARCHY);
-		cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]);
-	}
-	return 0;
-}
-
-static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp,
-					unsigned int event)
-{
-	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
-
-	spin_lock(&mem->scanstat.lock);
-	memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats));
-	memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats));
-	spin_unlock(&mem->scanstat.lock);
-	return 0;
-}
-
-
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -4839,11 +4673,6 @@ static struct cftype mem_cgroup_files[] = {
 		.mode = S_IRUGO,
 	},
 #endif
-	{
-		.name = "vmscan_stat",
-		.read_map = mem_cgroup_vmscan_stat_read,
-		.trigger = mem_cgroup_reset_vmscan_stat,
-	},
 };
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -5107,7 +4936,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	atomic_set(&mem->refcnt, 1);
 	mem->move_charge_at_immigrate = 0;
 	mutex_init(&mem->thresholds_lock);
-	spin_lock_init(&mem->scanstat.lock);
 	return &mem->css;
 free_out:
 	__mem_cgroup_free(mem);
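Note on the drain_all_stock() hunk: raw_smp_processor_id() gave only a hint because the task could migrate mid-loop; get_cpu()/put_cpu() disables preemption so the comparison against curcpu stays valid for the whole walk. A minimal sketch of the pattern, with an illustrative per-CPU variable:

#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(int, demo_counter);       /* illustrative */

static void demo_touch_local_counter(void)
{
        int cpu = get_cpu();    /* disables preemption, returns this CPU */

        /*
         * No migration can happen between get_cpu() and put_cpu(), so
         * per_cpu(demo_counter, cpu) really is the local instance.
         */
        per_cpu(demo_counter, cpu)++;
        put_cpu();              /* re-enables preemption */
}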
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8b57173c1dd5..9c51f9f58cac 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -636,7 +636,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 	struct vm_area_struct *prev;
 	struct vm_area_struct *vma;
 	int err = 0;
-	pgoff_t pgoff;
 	unsigned long vmstart;
 	unsigned long vmend;
 
@@ -649,9 +648,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 		vmstart = max(start, vma->vm_start);
 		vmend   = min(end, vma->vm_end);
 
-		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
-				  vma->anon_vma, vma->vm_file, pgoff, new_pol);
+				  vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+				  new_pol);
 		if (prev) {
 			vma = prev;
 			next = vma->vm_next;
@@ -1412,7 +1411,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 		err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 
 		if (!err && nmask) {
-			err = copy_from_user(bm, nm, alloc_size);
+			unsigned long copy_size;
+			copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
+			err = copy_from_user(bm, nm, copy_size);
 			/* ensure entire bitmap is zeroed */
 			err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 			err |= compat_put_bitmap(nmask, bm, nr_bits);
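Note on the compat_sys_get_mempolicy() hunk: alloc_size derives from the user-supplied maxnode while bm is a fixed-size kernel buffer, so clamping the copy length to the destination size makes the copy provably in-bounds regardless of what userspace passes. A hedged sketch of the clamp pattern with illustrative names:

#include <linux/kernel.h>
#include <linux/uaccess.h>

static int demo_copy_bitmap(const void __user *src, unsigned long user_len)
{
        unsigned long bm[4];    /* fixed-size destination */
        unsigned long copy_size = min_t(unsigned long, sizeof(bm), user_len);

        if (copy_from_user(bm, src, copy_size))
                return -EFAULT;
        return 0;
}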
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d1960744f881..0e309cd1b5b9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -754,21 +754,10 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * 200ms is typically more than enough to curb heavy dirtiers;
 		 * (b) the pause time limit makes the dirtiers more responsive.
 		 */
-		if (nr_dirty < dirty_thresh +
-			       dirty_thresh / DIRTY_MAXPAUSE_AREA &&
+		if (nr_dirty < dirty_thresh &&
+		    bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 &&
 		    time_after(jiffies, start_time + MAX_PAUSE))
 			break;
-		/*
-		 * pass-good area. When some bdi gets blocked (eg. NFS server
-		 * not responding), or write bandwidth dropped dramatically due
-		 * to concurrent reads, or dirty threshold suddenly dropped and
-		 * the dirty pages cannot be brought down anytime soon (eg. on
-		 * slow USB stick), at least let go of the good bdi's.
-		 */
-		if (nr_dirty < dirty_thresh +
-			       dirty_thresh / DIRTY_PASSGOOD_AREA &&
-		    bdi_dirty < bdi_thresh)
-			break;
 
 		/*
 		 * Increase the delay for each loop, up to our previous
diff --git a/mm/slub.c b/mm/slub.c
index 9f662d70eb47..7c54fe83a90c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2377,7 +2377,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		 */
 		if (unlikely(!prior)) {
 			remove_full(s, page);
-			add_partial(n, page, 0);
+			add_partial(n, page, 1);
 			stat(s, FREE_ADD_PARTIAL);
 		}
 	}
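Note on the slub.c one-liner: a slab that was full and just had a single object freed would fill up again immediately if the allocator kept picking it first, so it now goes to the tail of the node's partial list, letting slabs with more free objects be preferred. Simplified sketch of what the tail flag selects (locking and SLUB bookkeeping omitted; demo_* names are illustrative):

#include <linux/list.h>

struct demo_node {                      /* stand-in for kmem_cache_node */
        unsigned long nr_partial;
        struct list_head partial;
};

static void demo_add_partial(struct demo_node *n, struct list_head *entry,
                             int tail)
{
        n->nr_partial++;
        if (tail)
                list_add_tail(entry, &n->partial);      /* drained last */
        else
                list_add(entry, &n->partial);           /* drained first */
}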
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7ef0903058ee..5016f19e1661 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2140,6 +2140,14 @@ struct vm_struct *alloc_vm_area(size_t size)
 		return NULL;
 	}
 
+	/*
+	 * If the allocated address space is passed to a hypercall
+	 * before being used then we cannot rely on a page fault to
+	 * trigger an update of the page tables. So sync all the page
+	 * tables here.
+	 */
+	vmalloc_sync_all();
+
 	return area;
 }
 EXPORT_SYMBOL_GPL(alloc_vm_area);
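Note on the alloc_vm_area() hunk: the new comment targets paravirtualized guests (Xen being the motivating user), where the hypervisor writes through the returned range before the guest ever faults on it, so the page tables must be pre-synced. A hedged caller-side sketch; demo_hypercall_map() is a stand-in for a real hypercall wrapper:

#include <linux/mm.h>
#include <linux/vmalloc.h>

extern int demo_hypercall_map(void *guest_va);  /* illustrative */

static void *demo_map_shared_area(void)
{
        struct vm_struct *area = alloc_vm_area(PAGE_SIZE);

        if (!area)
                return NULL;
        /*
         * vmalloc_sync_all() has already propagated the new page tables,
         * so a hypervisor-installed mapping at area->addr is visible
         * without relying on a page fault.
         */
        if (demo_hypercall_map(area->addr)) {
                free_vm_area(area);
                return NULL;
        }
        return area->addr;
}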
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7ef69124fa3e..b55699cd9067 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -105,7 +105,6 @@ struct scan_control {
 
 	/* Which cgroup do we reclaim from */
 	struct mem_cgroup *mem_cgroup;
-	struct memcg_scanrecord *memcg_record;
 
 	/*
 	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -1349,8 +1348,6 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
 			int file = is_file_lru(lru);
 			int numpages = hpage_nr_pages(page);
 			reclaim_stat->recent_rotated[file] += numpages;
-			if (!scanning_global_lru(sc))
-				sc->memcg_record->nr_rotated[file] += numpages;
 		}
 		if (!pagevec_add(&pvec, page)) {
 			spin_unlock_irq(&zone->lru_lock);
@@ -1394,10 +1391,6 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
 
 	reclaim_stat->recent_scanned[0] += *nr_anon;
 	reclaim_stat->recent_scanned[1] += *nr_file;
-	if (!scanning_global_lru(sc)) {
-		sc->memcg_record->nr_scanned[0] += *nr_anon;
-		sc->memcg_record->nr_scanned[1] += *nr_file;
-	}
 }
 
 /*
@@ -1511,9 +1504,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 		nr_reclaimed += shrink_page_list(&page_list, zone, sc);
 	}
 
-	if (!scanning_global_lru(sc))
-		sc->memcg_record->nr_freed[file] += nr_reclaimed;
-
 	local_irq_disable();
 	if (current_is_kswapd())
 		__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
@@ -1613,8 +1603,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	}
 
 	reclaim_stat->recent_scanned[file] += nr_taken;
-	if (!scanning_global_lru(sc))
-		sc->memcg_record->nr_scanned[file] += nr_taken;
 
 	__count_zone_vm_events(PGREFILL, zone, pgscanned);
 	if (file)
@@ -1666,8 +1654,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	 * get_scan_ratio.
 	 */
 	reclaim_stat->recent_rotated[file] += nr_rotated;
-	if (!scanning_global_lru(sc))
-		sc->memcg_record->nr_rotated[file] += nr_rotated;
 
 	move_active_pages_to_lru(zone, &l_active,
 						LRU_ACTIVE + file * LRU_FILE);
@@ -1808,23 +1794,15 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 	u64 fraction[2], denominator;
 	enum lru_list l;
 	int noswap = 0;
-	int force_scan = 0;
+	bool force_scan = false;
 	unsigned long nr_force_scan[2];
 
-
-	anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
-		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
-	file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
-		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
-
-	if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
-		/* kswapd does zone balancing and need to scan this zone */
-		if (scanning_global_lru(sc) && current_is_kswapd())
-			force_scan = 1;
-		/* memcg may have small limit and need to avoid priority drop */
-		if (!scanning_global_lru(sc))
-			force_scan = 1;
-	}
+	/* kswapd does zone balancing and needs to scan this zone */
+	if (scanning_global_lru(sc) && current_is_kswapd())
+		force_scan = true;
+	/* memcg may have small limit and need to avoid priority drop */
+	if (!scanning_global_lru(sc))
+		force_scan = true;
 
 	/* If we have no swap space, do not bother scanning anon pages. */
 	if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1837,6 +1815,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 		goto out;
 	}
 
+	anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
+		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
+	file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
+		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+
 	if (scanning_global_lru(sc)) {
 		free  = zone_page_state(zone, NR_FREE_PAGES);
 		/* If we have very few page cache pages,
@@ -2268,10 +2251,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						struct zone *zone,
-						struct memcg_scanrecord *rec,
-						unsigned long *scanned)
+						unsigned long *nr_scanned)
 {
 	struct scan_control sc = {
 		.nr_scanned = 0,
@@ -2281,9 +2263,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 		.may_swap = !noswap,
 		.order = 0,
 		.mem_cgroup = mem,
-		.memcg_record = rec,
 	};
-	unsigned long start, end;
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2292,7 +2272,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						      sc.may_writepage,
 						      sc.gfp_mask);
 
-	start = sched_clock();
 	/*
 	 * NOTE: Although we can get the priority field, using it
 	 * here is not a good idea, since it limits the pages we can scan.
@@ -2301,25 +2280,19 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 	 * the priority and make it zero.
 	 */
 	shrink_zone(0, zone, &sc);
-	end = sched_clock();
-
-	if (rec)
-		rec->elapsed += end - start;
-	*scanned = sc.nr_scanned;
 
 	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
+	*nr_scanned = sc.nr_scanned;
 	return sc.nr_reclaimed;
 }
 
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 					   gfp_t gfp_mask,
-					   bool noswap,
-					   struct memcg_scanrecord *rec)
+					   bool noswap)
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
-	unsigned long start, end;
 	int nid;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
@@ -2328,7 +2301,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.order = 0,
 		.mem_cgroup = mem_cont,
-		.memcg_record = rec,
 		.nodemask = NULL, /* we don't care the placement */
 		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2337,7 +2309,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.gfp_mask = sc.gfp_mask,
 	};
 
-	start = sched_clock();
 	/*
 	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
 	 * take care of from where we get pages. So the node where we start the
@@ -2352,9 +2323,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 					    sc.gfp_mask);
 
 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
-	end = sched_clock();
-	if (rec)
-		rec->elapsed += end - start;
 
 	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
 
@@ -2529,6 +2497,9 @@ loop_again:
 					    high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
 				break;
+			} else {
+				/* If balanced, clear the congested flag */
+				zone_clear_flag(zone, ZONE_CONGESTED);
 			}
 		}
 		if (i < 0)
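Note on the vmscan.c get_scan_count() hunks: force_scan is now set unconditionally for kswapd and for memcg reclaim rather than only when the zone's LRU shrank below SWAP_CLUSTER_MAX at the current priority. The effect, in a hedged simplification of the real arithmetic: when the proportional target rounds down to zero, still scan SWAP_CLUSTER_MAX pages so a small memcg makes progress without the priority dropping to zero.

#include <linux/math64.h>
#include <linux/swap.h>

/* Simplified per-LRU scan target; names are illustrative. */
static unsigned long demo_scan_target(unsigned long lru_pages,
                                      u64 fraction, u64 denominator,
                                      int priority, bool force_scan)
{
        unsigned long scan = lru_pages >> priority;

        scan = div64_u64(scan * fraction, denominator);
        if (!scan && force_scan)
                scan = SWAP_CLUSTER_MAX;
        return scan;
}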
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c18b7694b2..d52b13d28e8f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -659,7 +659,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
 }
 #endif
 
-#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS)
+#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
 #ifdef CONFIG_ZONE_DMA
 #define TEXT_FOR_DMA(xx) xx "_dma",
 #else
@@ -788,7 +788,7 @@ const char * const vmstat_text[] = {
 
 #endif /* CONFIG_VM_EVENTS_COUNTERS */
 };
-#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */
+#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
 
 
 #ifdef CONFIG_PROC_FS