Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c         13
-rw-r--r--  mm/huge_memory.c     16
-rw-r--r--  mm/hugetlb.c          1
-rw-r--r--  mm/memcontrol.c       3
-rw-r--r--  mm/migrate.c          2
-rw-r--r--  mm/oom_kill.c         2
-rw-r--r--  mm/page-writeback.c  32
-rw-r--r--  mm/page_alloc.c      10
-rw-r--r--  mm/percpu.c           6
-rw-r--r--  mm/slab.c             5
-rw-r--r--  mm/vmalloc.c          4
-rw-r--r--  mm/vmscan.c          26
12 files changed, 74 insertions, 46 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index c0018f2d50e0..5f0a3c91fdac 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1828,7 +1828,7 @@ repeat:
                 page = __page_cache_alloc(gfp | __GFP_COLD);
                 if (!page)
                         return ERR_PTR(-ENOMEM);
-                err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+                err = add_to_page_cache_lru(page, mapping, index, gfp);
                 if (unlikely(err)) {
                         page_cache_release(page);
                         if (err == -EEXIST)
@@ -1925,10 +1925,7 @@ static struct page *wait_on_page_read(struct page *page)
  * @gfp:	the page allocator flags to use if allocating
  *
  * This is the same as "read_mapping_page(mapping, index, NULL)", but with
- * any new page allocations done using the specified allocation flags. Note
- * that the Radix tree operations will still use GFP_KERNEL, so you can't
- * expect to do this atomically or anything like that - but you can pass in
- * other page requirements.
+ * any new page allocations done using the specified allocation flags.
  *
  * If the page does not get brought uptodate, return -EIO.
  */
@@ -2407,7 +2404,6 @@ static ssize_t generic_perform_write(struct file *file,
                                                 iov_iter_count(i));
 
 again:
-
                 /*
                  * Bring in the user page that we will copy from _first_.
                  * Otherwise there's a nasty deadlock on copying from the
@@ -2463,7 +2459,10 @@ again:
                 written += copied;
 
                 balance_dirty_pages_ratelimited(mapping);
-
+                if (fatal_signal_pending(current)) {
+                        status = -EINTR;
+                        break;
+                }
         } while (iov_iter_count(i));
 
         return written ? written : status;
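
A note on the first hunk: before this change, read_cache_page_gfp() allocated the page with the caller's gfp mask but then inserted it into the page cache with GFP_KERNEL. A minimal sketch of a hypothetical caller that relies on the mask being honoured (the GFP_NOFS-style restriction below is illustrative and not taken from this patch):

        /* Hypothetical caller: read a page while holding locks that make
         * filesystem re-entry unsafe, so clear __GFP_FS from the mask.
         * With this fix the same mask is also used for the page-cache
         * insertion done by add_to_page_cache_lru(). */
        struct page *page = read_cache_page_gfp(mapping, index,
                                mapping_gfp_mask(mapping) & ~__GFP_FS);
        if (IS_ERR(page))
                return PTR_ERR(page);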
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4298abaae153..36b3d988b4ef 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2259,12 +2259,8 @@ static void khugepaged_do_scan(struct page **hpage)
 
 static void khugepaged_alloc_sleep(void)
 {
-        DEFINE_WAIT(wait);
-        add_wait_queue(&khugepaged_wait, &wait);
-        schedule_timeout_interruptible(
-                msecs_to_jiffies(
-                        khugepaged_alloc_sleep_millisecs));
-        remove_wait_queue(&khugepaged_wait, &wait);
+        wait_event_freezable_timeout(khugepaged_wait, false,
+                        msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
 }
 
 #ifndef CONFIG_NUMA
@@ -2313,14 +2309,10 @@ static void khugepaged_loop(void)
                 if (unlikely(kthread_should_stop()))
                         break;
                 if (khugepaged_has_work()) {
-                        DEFINE_WAIT(wait);
                         if (!khugepaged_scan_sleep_millisecs)
                                 continue;
-                        add_wait_queue(&khugepaged_wait, &wait);
-                        schedule_timeout_interruptible(
-                                msecs_to_jiffies(
-                                        khugepaged_scan_sleep_millisecs));
-                        remove_wait_queue(&khugepaged_wait, &wait);
+                        wait_event_freezable_timeout(khugepaged_wait, false,
+                            msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
                 } else if (khugepaged_enabled())
                         wait_event_freezable(khugepaged_wait,
                                              khugepaged_wait_event());
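
Both khugepaged hunks replace an open-coded interruptible sleep with wait_event_freezable_timeout(), whose practical difference is that the sleeping thread cooperates with the freezer during suspend/hibernate. A rough, simplified sketch of what the helper amounts to for the alloc-sleep case (not the actual macro expansion):

        /* Simplified sketch: sleep interruptibly on khugepaged_wait for up
         * to the configured timeout, then enter the freezer if a freeze is
         * pending. The removed open-coded version never called
         * try_to_freeze(), so khugepaged could hold up suspend. */
        long timeout = msecs_to_jiffies(khugepaged_alloc_sleep_millisecs);
        DEFINE_WAIT(wait);

        prepare_to_wait(&khugepaged_wait, &wait, TASK_INTERRUPTIBLE);
        schedule_timeout(timeout);
        finish_wait(&khugepaged_wait, &wait);
        try_to_freeze();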
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bb28a5f9db8d..73f17c0293c0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -576,6 +576,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
         __SetPageHead(page);
         for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
                 __SetPageTail(p);
+                set_page_count(p, 0);
                 p->first_page = page;
         }
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6aff93c98aca..b63f5f7dfa07 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4907,9 +4907,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
                 int cpu;
                 enable_swap_cgroup();
                 parent = NULL;
-                root_mem_cgroup = memcg;
                 if (mem_cgroup_soft_limit_tree_init())
                         goto free_out;
+                root_mem_cgroup = memcg;
                 for_each_possible_cpu(cpu) {
                         struct memcg_stock_pcp *stock =
                                                 &per_cpu(memcg_stock, cpu);
@@ -4948,7 +4948,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
         return &memcg->css;
 free_out:
         __mem_cgroup_free(memcg);
-        root_mem_cgroup = NULL;
         return ERR_PTR(error);
 }
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 578e29174fa6..177aca424a06 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -871,9 +871,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 
         if (anon_vma)
                 put_anon_vma(anon_vma);
-out:
         unlock_page(hpage);
 
+out:
         if (rc != -EAGAIN) {
                 list_del(&hpage->lru);
                 put_page(hpage);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 76f2c5ae908e..069b64e521fc 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -176,7 +176,7 @@ static bool oom_unkillable_task(struct task_struct *p,
 unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
                          const nodemask_t *nodemask, unsigned long totalpages)
 {
-        int points;
+        long points;
 
         if (oom_unkillable_task(p, mem, nodemask))
                 return 0;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 71252486bc6f..50f08241f981 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -411,8 +411,13 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
  *
  * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
  * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
- * And the "limit" in the name is not seriously taken as hard limit in
- * balance_dirty_pages().
+ *
+ * Note that balance_dirty_pages() will only seriously take it as a hard limit
+ * when sleeping max_pause per page is not enough to keep the dirty pages under
+ * control. For example, when the device is completely stalled due to some error
+ * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
+ * In the other normal situations, it acts more gently by throttling the tasks
+ * more (rather than completely block them) when the bdi dirty pages go high.
  *
  * It allocates high/low dirty limits to fast/slow devices, in order to prevent
  * - starving fast devices
@@ -594,6 +599,13 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
          */
         if (unlikely(bdi_thresh > thresh))
                 bdi_thresh = thresh;
+        /*
+         * It's very possible that bdi_thresh is close to 0 not because the
+         * device is slow, but that it has remained inactive for long time.
+         * Honour such devices a reasonable good (hopefully IO efficient)
+         * threshold, so that the occasional writes won't be blocked and active
+         * writes can rampup the threshold quickly.
+         */
         bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
         /*
          * scale global setpoint to bdi's:
@@ -977,8 +989,7 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
          *
          * 8 serves as the safety ratio.
          */
-        if (bdi_dirty)
-                t = min(t, bdi_dirty * HZ / (8 * bw + 1));
+        t = min(t, bdi_dirty * HZ / (8 * bw + 1));
 
         /*
          * The pause time will be settled within range (max_pause/4, max_pause).
@@ -1136,6 +1147,19 @@ pause:
                 if (task_ratelimit)
                         break;
 
+                /*
+                 * In the case of an unresponding NFS server and the NFS dirty
+                 * pages exceeds dirty_thresh, give the other good bdi's a pipe
+                 * to go through, so that tasks on them still remain responsive.
+                 *
+                 * In theory 1 page is enough to keep the comsumer-producer
+                 * pipe going: the flusher cleans 1 page => the task dirties 1
+                 * more page. However bdi_dirty has accounting errors. So use
+                 * the larger and more IO friendly bdi_stat_error.
+                 */
+                if (bdi_dirty <= bdi_stat_error(bdi))
+                        break;
+
                 if (fatal_signal_pending(current))
                         break;
         }
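
The new comment above the bdi_thresh floor in bdi_position_ratio() is easiest to see with numbers. A worked example with illustrative values (not taken from the patch):

        /* Illustrative values only: global limit 51200 pages (200 MiB with
         * 4 KiB pages), 10240 pages currently dirty, and an idle bdi whose
         * own threshold has decayed to ~0. The floor grants it
         * (51200 - 10240) / 8 = 5120 pages (20 MiB), so occasional writes
         * are not stalled while the bdi's threshold ramps back up. */
        unsigned long limit = 51200, dirty = 10240, bdi_thresh = 0;

        bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);	/* -> 5120 */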
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9dd443d89d8b..2b8ba3aebf6e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -356,8 +356,8 @@ void prep_compound_page(struct page *page, unsigned long order)
         __SetPageHead(page);
         for (i = 1; i < nr_pages; i++) {
                 struct page *p = page + i;
-
                 __SetPageTail(p);
+                set_page_count(p, 0);
                 p->first_page = page;
         }
 }
@@ -3377,9 +3377,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
         unsigned long block_migratetype;
         int reserve;
 
-        /* Get the start pfn, end pfn and the number of blocks to reserve */
+        /*
+         * Get the start pfn, end pfn and the number of blocks to reserve
+         * We have to be careful to be aligned to pageblock_nr_pages to
+         * make sure that we always check pfn_valid for the first page in
+         * the block.
+         */
         start_pfn = zone->zone_start_pfn;
         end_pfn = start_pfn + zone->spanned_pages;
+        start_pfn = roundup(start_pfn, pageblock_nr_pages);
         reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
                                                         pageblock_order;
 
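
The start_pfn round-up matters because the loop below it walks the zone in pageblock_nr_pages steps and treats every visited pfn as a block's first page. An illustrative example (the numbers are hypothetical):

        /* Hypothetical zone: pageblock_nr_pages == 512 and the zone starts
         * at pfn 1000. Stepping by 512 from 1000 would visit 1000, 1512,
         * 2024, ... which are not block-aligned, so the pfn_valid() and
         * pfn_to_page() checks would not be made on each block's first
         * page. Rounding the start up keeps every step aligned. */
        unsigned long start_pfn = 1000;

        start_pfn = roundup(start_pfn, 512);	/* -> 1024, then 1536, 2048, ... */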
diff --git a/mm/percpu.c b/mm/percpu.c
index 3bb810a72006..716eb4acf2fc 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1023,9 +1023,11 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
                 if (!is_vmalloc_addr(addr))
                         return __pa(addr);
                 else
-                        return page_to_phys(vmalloc_to_page(addr));
+                        return page_to_phys(vmalloc_to_page(addr)) +
+                               offset_in_page(addr);
         } else
-                return page_to_phys(pcpu_addr_to_page(addr));
+                return page_to_phys(pcpu_addr_to_page(addr)) +
+                       offset_in_page(addr);
 }
 
 /**
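
This per_cpu_ptr_to_phys() change restores the sub-page offset that page_to_phys() alone discards. A small illustration with made-up addresses:

        /* Made-up addresses: addr sits 0xa40 bytes into its page, and the
         * page's physical address is 0x1c0d000. page_to_phys() alone would
         * return 0x1c0d000 (the start of the page); adding
         * offset_in_page(addr) == (addr & ~PAGE_MASK) == 0xa40 yields the
         * object's real physical address, 0x1c0da40. */
        phys_addr_t phys = page_to_phys(vmalloc_to_page(addr)) +
                           offset_in_page(addr);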
diff --git a/mm/slab.c b/mm/slab.c
index 708efe886154..83311c9aaf9d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -595,6 +595,7 @@ static enum {
         PARTIAL_AC,
         PARTIAL_L3,
         EARLY,
+        LATE,
         FULL
 } g_cpucache_up;
 
@@ -671,7 +672,7 @@ static void init_node_lock_keys(int q)
 {
         struct cache_sizes *s = malloc_sizes;
 
-        if (g_cpucache_up != FULL)
+        if (g_cpucache_up < LATE)
                 return;
 
         for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
@@ -1666,6 +1667,8 @@ void __init kmem_cache_init_late(void)
 {
         struct kmem_cache *cachep;
 
+        g_cpucache_up = LATE;
+
         /* Annotate slab for lockdep -- annotate the malloc caches */
         init_lock_keys();
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e583f770dfee..21fdf46ad5aa 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1315,7 +1315,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
                 unsigned long align, unsigned long flags, unsigned long start,
                 unsigned long end, int node, gfp_t gfp_mask, void *caller)
 {
-        static struct vmap_area *va;
+        struct vmap_area *va;
         struct vm_struct *area;
 
         BUG_ON(in_interrupt());
@@ -1658,6 +1658,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
                 goto fail;
 
         addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
+        if (!addr)
+                return NULL;
 
         /*
          * In this function, newly allocated vm_struct is not added
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a1893c050795..f54a05b7a61d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -183,7 +183,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
  */
 void register_shrinker(struct shrinker *shrinker)
 {
-        shrinker->nr = 0;
+        atomic_long_set(&shrinker->nr_in_batch, 0);
         down_write(&shrinker_rwsem);
         list_add_tail(&shrinker->list, &shrinker_list);
         up_write(&shrinker_rwsem);
@@ -247,25 +247,26 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 
         list_for_each_entry(shrinker, &shrinker_list, list) {
                 unsigned long long delta;
-                unsigned long total_scan;
-                unsigned long max_pass;
+                long total_scan;
+                long max_pass;
                 int shrink_ret = 0;
                 long nr;
                 long new_nr;
                 long batch_size = shrinker->batch ? shrinker->batch
                                                   : SHRINK_BATCH;
 
+                max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+                if (max_pass <= 0)
+                        continue;
+
                 /*
                  * copy the current shrinker scan count into a local variable
                  * and zero it so that other concurrent shrinker invocations
                  * don't also do this scanning work.
                  */
-                do {
-                        nr = shrinker->nr;
-                } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+                nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
 
                 total_scan = nr;
-                max_pass = do_shrinker_shrink(shrinker, shrink, 0);
                 delta = (4 * nr_pages_scanned) / shrinker->seeks;
                 delta *= max_pass;
                 do_div(delta, lru_pages + 1);
@@ -325,12 +326,11 @@ unsigned long shrink_slab(struct shrink_control *shrink,
                  * manner that handles concurrent updates. If we exhausted the
                  * scan, there is no need to do an update.
                  */
-                do {
-                        nr = shrinker->nr;
-                        new_nr = total_scan + nr;
-                        if (total_scan <= 0)
-                                break;
-                } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+                if (total_scan > 0)
+                        new_nr = atomic_long_add_return(total_scan,
+                                        &shrinker->nr_in_batch);
+                else
+                        new_nr = atomic_long_read(&shrinker->nr_in_batch);
 
                 trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
         }
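
The vmscan hunks convert the deferred shrinker count from an open-coded cmpxchg loop on shrinker->nr to an atomic_long_t (nr_in_batch), which makes the take/return pattern explicit. A simplified sketch of that pattern (not the shrink_slab() code itself; the real code uses atomic_long_add_return() so it can also hand the new value to the tracepoint):

        /* Take the entire deferred batch atomically; a concurrent shrinker
         * racing here sees zero and skips the work. */
        long nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
        long total_scan = nr;	/* plus the newly computed delta in the real code */

        /* ... call do_shrinker_shrink() in batch_size chunks, decrementing
         * total_scan by the amount actually scanned ... */

        /* Put back whatever was not scanned, again without a cmpxchg loop. */
        if (total_scan > 0)
                atomic_long_add(total_scan, &shrinker->nr_in_batch);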