-rw-r--r--  Documentation/cgroups/cpusets.txt  |    2
-rw-r--r--  Documentation/memory-hotplug.txt   |    5
-rw-r--r--  Documentation/vm/transhuge.txt     |   19
-rw-r--r--  arch/mips/include/asm/pgtable.h    |   11
-rw-r--r--  arch/powerpc/mm/fault.c            |   27
-rw-r--r--  arch/s390/include/asm/pgtable.h    |   11
-rw-r--r--  arch/sh/mm/fault.c                 |   19
-rw-r--r--  arch/x86/kernel/vm86_32.c          |    2
-rw-r--r--  arch/x86/mm/fault.c                |   23
-rw-r--r--  arch/x86/mm/init_64.c              |    4
-rw-r--r--  drivers/base/node.c                |    8
-rw-r--r--  fs/buffer.c                        |    6
-rw-r--r--  fs/fs-writeback.c                  |    2
-rw-r--r--  fs/proc/kcore.c                    |    2
-rw-r--r--  fs/proc/task_mmu.c                 |    6
-rw-r--r--  include/asm-generic/pgtable.h      |   26
-rw-r--r--  include/linux/bootmem.h            |    3
-rw-r--r--  include/linux/cpuset.h             |    2
-rw-r--r--  include/linux/gfp.h                |    1
-rw-r--r--  include/linux/huge_mm.h            |   18
-rw-r--r--  include/linux/memcontrol.h         |    9
-rw-r--r--  include/linux/memory.h             |    1
-rw-r--r--  include/linux/mmzone.h             |   41
-rw-r--r--  include/linux/nodemask.h           |    5
-rw-r--r--  include/linux/res_counter.h        |    5
-rw-r--r--  include/linux/vm_event_item.h      |    2
-rw-r--r--  init/main.c                        |    2
-rw-r--r--  kernel/cpuset.c                    |   32
-rw-r--r--  kernel/kthread.c                   |    2
-rw-r--r--  kernel/res_counter.c               |   22
-rw-r--r--  mm/Kconfig                         |    8
-rw-r--r--  mm/bootmem.c                       |   59
-rw-r--r--  mm/compaction.c                    |  108
-rw-r--r--  mm/huge_memory.c                   |  359
-rw-r--r--  mm/hugetlb.c                       |   36
-rw-r--r--  mm/memcontrol.c                    |   27
-rw-r--r--  mm/memory.c                        |   25
-rw-r--r--  mm/memory_hotplug.c                |  113
-rw-r--r--  mm/mempolicy.c                     |   14
-rw-r--r--  mm/migrate.c                       |    2
-rw-r--r--  mm/mmap.c                          |   32
-rw-r--r--  mm/mprotect.c                      |    2
-rw-r--r--  mm/mremap.c                        |    2
-rw-r--r--  mm/nobootmem.c                     |   22
-rw-r--r--  mm/oom_kill.c                      |   52
-rw-r--r--  mm/page_alloc.c                    |  115
-rw-r--r--  mm/page_cgroup.c                   |    2
-rw-r--r--  mm/pagewalk.c                      |    2
-rw-r--r--  mm/rmap.c                          |   12
-rw-r--r--  mm/shmem.c                         |   92
-rw-r--r--  mm/vmscan.c                        |    4
-rw-r--r--  mm/vmstat.c                        |   12
52 files changed, 996 insertions(+), 422 deletions(-)
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt
index cefd3d8bbd11..12e01d432bfe 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -218,7 +218,7 @@ and name space for cpusets, with a minimum of additional kernel code.
 The cpus and mems files in the root (top_cpuset) cpuset are
 read-only. The cpus file automatically tracks the value of
 cpu_online_mask using a CPU hotplug notifier, and the mems file
-automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e.,
+automatically tracks the value of node_states[N_MEMORY]--i.e.,
 nodes with memory--using the cpuset_track_online_nodes() hook.
 
 
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index c6f993d491b5..8e5eacbdcfa3 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -390,6 +390,7 @@ struct memory_notify {
 	unsigned long start_pfn;
 	unsigned long nr_pages;
 	int status_change_nid_normal;
+	int status_change_nid_high;
 	int status_change_nid;
 }
 
@@ -397,7 +398,9 @@ start_pfn is start_pfn of online/offline memory.
 nr_pages is # of pages of online/offline memory.
 status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
 is (will be) set/clear, if this is -1, then nodemask status is not changed.
-status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be)
+status_change_nid_high is set node id when N_HIGH_MEMORY of nodemask
+is (will be) set/clear, if this is -1, then nodemask status is not changed.
+status_change_nid is set node id when N_MEMORY of nodemask is (will be)
 set/clear. It means a new(memoryless) node gets new memory by online and a
 node loses all memory. If this is -1, then nodemask status is not changed.
 If status_changed_nid* >= 0, callback should create/discard structures for the
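
To make the notifier contract above concrete, here is a rough sketch (not
part of this commit; the callback and its names are invented) of a memory
hotplug callback consuming the new field alongside the existing ones:

#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/printk.h>

/* Hypothetical callback: reacts to the per-node state hints described above. */
static int example_mem_notify(struct notifier_block *nb,
			      unsigned long action, void *data)
{
	struct memory_notify *arg = data;

	if (action == MEM_ONLINE || action == MEM_OFFLINE) {
		/* -1 means the corresponding node state is not changing */
		if (arg->status_change_nid >= 0)
			pr_info("node %d: N_MEMORY will be %s\n",
				arg->status_change_nid,
				action == MEM_ONLINE ? "set" : "cleared");
		if (arg->status_change_nid_high >= 0)
			pr_info("node %d: N_HIGH_MEMORY will be %s\n",
				arg->status_change_nid_high,
				action == MEM_ONLINE ? "set" : "cleared");
	}
	return NOTIFY_OK;
}

static struct notifier_block example_mem_nb = {
	.notifier_call = example_mem_notify,
};
/* registered elsewhere with register_memory_notifier(&example_mem_nb) */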
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index f734bb2a78dc..8785fb87d9c7 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -116,6 +116,13 @@ echo always >/sys/kernel/mm/transparent_hugepage/defrag
 echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
 echo never >/sys/kernel/mm/transparent_hugepage/defrag
 
+By default kernel tries to use huge zero page on read page fault.
+It's possible to disable huge zero page by writing 0 or enable it
+back by writing 1:
+
+echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page
+echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page
+
 khugepaged will be automatically started when
 transparent_hugepage/enabled is set to "always" or "madvise, and it'll
 be automatically shutdown if it's set to "never".
@@ -197,6 +204,14 @@ thp_split is incremented every time a huge page is split into base
 	pages. This can happen for a variety of reasons but a common
 	reason is that a huge page is old and is being reclaimed.
 
+thp_zero_page_alloc is incremented every time a huge zero page is
+	successfully allocated. It includes allocations which where
+	dropped due race with other allocation. Note, it doesn't count
+	every map of the huge zero page, only its allocation.
+
+thp_zero_page_alloc_failed is incremented if kernel fails to allocate
+	huge zero page and falls back to using small pages.
+
 As the system ages, allocating huge pages may be expensive as the
 system uses memory compaction to copy data around memory to free a
 huge page for use. There are some counters in /proc/vmstat to help
@@ -276,7 +291,7 @@ unaffected. libhugetlbfs will also work fine as usual.
 == Graceful fallback ==
 
 Code walking pagetables but unware about huge pmds can simply call
-split_huge_page_pmd(mm, pmd) where the pmd is the one returned by
+split_huge_page_pmd(vma, addr, pmd) where the pmd is the one returned by
 pmd_offset. It's trivial to make the code transparent hugepage aware
 by just grepping for "pmd_offset" and adding split_huge_page_pmd where
 missing after pmd_offset returns the pmd. Thanks to the graceful
@@ -299,7 +314,7 @@ diff --git a/mm/mremap.c b/mm/mremap.c
 		return NULL;
 
 	pmd = pmd_offset(pud, addr);
-+	split_huge_page_pmd(mm, pmd);
++	split_huge_page_pmd(vma, addr, pmd);
 	if (pmd_none_or_clear_bad(pmd))
 		return NULL;
 
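
For completeness, a userspace sketch of poking the new knob (the sysfs path
is taken verbatim from the documentation hunk above; nothing here is part of
the kernel patch itself):

#include <stdio.h>

/* Enable (1) or disable (0) use of the huge zero page on read faults. */
static int set_use_zero_page(int enable)
{
	const char *path =
		"/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page";
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;	/* kernel too old, or THP not enabled */
	fprintf(f, "%d\n", enable ? 1 : 0);
	return fclose(f);
}

The effect can then be observed through the thp_zero_page_alloc and
thp_zero_page_alloc_failed counters in /proc/vmstat described above.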
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index c02158be836c..14490e9443af 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -76,16 +76,7 @@ extern unsigned long zero_page_mask;
76 76
77#define ZERO_PAGE(vaddr) \ 77#define ZERO_PAGE(vaddr) \
78 (virt_to_page((void *)(empty_zero_page + (((unsigned long)(vaddr)) & zero_page_mask)))) 78 (virt_to_page((void *)(empty_zero_page + (((unsigned long)(vaddr)) & zero_page_mask))))
79 79#define __HAVE_COLOR_ZERO_PAGE
80#define is_zero_pfn is_zero_pfn
81static inline int is_zero_pfn(unsigned long pfn)
82{
83 extern unsigned long zero_pfn;
84 unsigned long offset_from_zero_pfn = pfn - zero_pfn;
85 return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
86}
87
88#define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr))
89 80
90extern void paging_init(void); 81extern void paging_init(void);
91 82
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 0a6b28336eb0..3a8489a354e9 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -113,19 +113,6 @@ static int store_updates_sp(struct pt_regs *regs)
113#define MM_FAULT_CONTINUE -1 113#define MM_FAULT_CONTINUE -1
114#define MM_FAULT_ERR(sig) (sig) 114#define MM_FAULT_ERR(sig) (sig)
115 115
116static int out_of_memory(struct pt_regs *regs)
117{
118 /*
119 * We ran out of memory, or some other thing happened to us that made
120 * us unable to handle the page fault gracefully.
121 */
122 up_read(&current->mm->mmap_sem);
123 if (!user_mode(regs))
124 return MM_FAULT_ERR(SIGKILL);
125 pagefault_out_of_memory();
126 return MM_FAULT_RETURN;
127}
128
129static int do_sigbus(struct pt_regs *regs, unsigned long address) 116static int do_sigbus(struct pt_regs *regs, unsigned long address)
130{ 117{
131 siginfo_t info; 118 siginfo_t info;
@@ -169,8 +156,18 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
169 return MM_FAULT_CONTINUE; 156 return MM_FAULT_CONTINUE;
170 157
171 /* Out of memory */ 158 /* Out of memory */
172 if (fault & VM_FAULT_OOM) 159 if (fault & VM_FAULT_OOM) {
173 return out_of_memory(regs); 160 up_read(&current->mm->mmap_sem);
161
162 /*
163 * We ran out of memory, or some other thing happened to us that
164 * made us unable to handle the page fault gracefully.
165 */
166 if (!user_mode(regs))
167 return MM_FAULT_ERR(SIGKILL);
168 pagefault_out_of_memory();
169 return MM_FAULT_RETURN;
170 }
174 171
175 /* Bus error. x86 handles HWPOISON here, we'll add this if/when 172 /* Bus error. x86 handles HWPOISON here, we'll add this if/when
176 * we support the feature in HW 173 * we support the feature in HW
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 2d3b7cb26005..c814e6f5b57d 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -55,16 +55,7 @@ extern unsigned long zero_page_mask;
55#define ZERO_PAGE(vaddr) \ 55#define ZERO_PAGE(vaddr) \
56 (virt_to_page((void *)(empty_zero_page + \ 56 (virt_to_page((void *)(empty_zero_page + \
57 (((unsigned long)(vaddr)) &zero_page_mask)))) 57 (((unsigned long)(vaddr)) &zero_page_mask))))
58 58#define __HAVE_COLOR_ZERO_PAGE
59#define is_zero_pfn is_zero_pfn
60static inline int is_zero_pfn(unsigned long pfn)
61{
62 extern unsigned long zero_pfn;
63 unsigned long offset_from_zero_pfn = pfn - zero_pfn;
64 return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
65}
66
67#define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr))
68 59
69#endif /* !__ASSEMBLY__ */ 60#endif /* !__ASSEMBLY__ */
70 61
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index cbbdcad8fcb3..1f49c28affa9 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -301,17 +301,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
301 __bad_area(regs, error_code, address, SEGV_ACCERR); 301 __bad_area(regs, error_code, address, SEGV_ACCERR);
302} 302}
303 303
304static void out_of_memory(void)
305{
306 /*
307 * We ran out of memory, call the OOM killer, and return the userspace
308 * (which will retry the fault, or kill us if we got oom-killed):
309 */
310 up_read(&current->mm->mmap_sem);
311
312 pagefault_out_of_memory();
313}
314
315static void 304static void
316do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) 305do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
317{ 306{
@@ -353,8 +342,14 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
353 no_context(regs, error_code, address); 342 no_context(regs, error_code, address);
354 return 1; 343 return 1;
355 } 344 }
345 up_read(&current->mm->mmap_sem);
356 346
357 out_of_memory(); 347 /*
348 * We ran out of memory, call the OOM killer, and return the
349 * userspace (which will retry the fault, or kill us if we got
350 * oom-killed):
351 */
352 pagefault_out_of_memory();
358 } else { 353 } else {
359 if (fault & VM_FAULT_SIGBUS) 354 if (fault & VM_FAULT_SIGBUS)
360 do_sigbus(regs, error_code, address); 355 do_sigbus(regs, error_code, address);
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 5c9687b1bde6..1dfe69cc78a8 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -182,7 +182,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
182 if (pud_none_or_clear_bad(pud)) 182 if (pud_none_or_clear_bad(pud))
183 goto out; 183 goto out;
184 pmd = pmd_offset(pud, 0xA0000); 184 pmd = pmd_offset(pud, 0xA0000);
185 split_huge_page_pmd(mm, pmd); 185 split_huge_page_pmd_mm(mm, 0xA0000, pmd);
186 if (pmd_none_or_clear_bad(pmd)) 186 if (pmd_none_or_clear_bad(pmd))
187 goto out; 187 goto out;
188 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); 188 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7a529cbab7ad..027088f2f7dd 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -803,20 +803,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
803 __bad_area(regs, error_code, address, SEGV_ACCERR); 803 __bad_area(regs, error_code, address, SEGV_ACCERR);
804} 804}
805 805
806/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
807static void
808out_of_memory(struct pt_regs *regs, unsigned long error_code,
809 unsigned long address)
810{
811 /*
812 * We ran out of memory, call the OOM killer, and return the userspace
813 * (which will retry the fault, or kill us if we got oom-killed):
814 */
815 up_read(&current->mm->mmap_sem);
816
817 pagefault_out_of_memory();
818}
819
820static void 806static void
821do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, 807do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
822 unsigned int fault) 808 unsigned int fault)
@@ -879,7 +865,14 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
879 return 1; 865 return 1;
880 } 866 }
881 867
882 out_of_memory(regs, error_code, address); 868 up_read(&current->mm->mmap_sem);
869
870 /*
871 * We ran out of memory, call the OOM killer, and return the
872 * userspace (which will retry the fault, or kill us if we got
873 * oom-killed):
874 */
875 pagefault_out_of_memory();
883 } else { 876 } else {
884 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| 877 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
885 VM_FAULT_HWPOISON_LARGE)) 878 VM_FAULT_HWPOISON_LARGE))
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3baff255adac..2ead3c8a4c84 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -630,7 +630,9 @@ void __init paging_init(void)
630 * numa support is not compiled in, and later node_set_state 630 * numa support is not compiled in, and later node_set_state
631 * will not set it back. 631 * will not set it back.
632 */ 632 */
633 node_clear_state(0, N_NORMAL_MEMORY); 633 node_clear_state(0, N_MEMORY);
634 if (N_MEMORY != N_NORMAL_MEMORY)
635 node_clear_state(0, N_NORMAL_MEMORY);
634 636
635 zone_sizes_init(); 637 zone_sizes_init();
636} 638}
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 294e31626210..fac124a7e1c5 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -227,7 +227,7 @@ static node_registration_func_t __hugetlb_unregister_node;
227static inline bool hugetlb_register_node(struct node *node) 227static inline bool hugetlb_register_node(struct node *node)
228{ 228{
229 if (__hugetlb_register_node && 229 if (__hugetlb_register_node &&
230 node_state(node->dev.id, N_HIGH_MEMORY)) { 230 node_state(node->dev.id, N_MEMORY)) {
231 __hugetlb_register_node(node); 231 __hugetlb_register_node(node);
232 return true; 232 return true;
233 } 233 }
@@ -644,6 +644,9 @@ static struct node_attr node_state_attr[] = {
644#ifdef CONFIG_HIGHMEM 644#ifdef CONFIG_HIGHMEM
645 [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY), 645 [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
646#endif 646#endif
647#ifdef CONFIG_MOVABLE_NODE
648 [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
649#endif
647 [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), 650 [N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
648}; 651};
649 652
@@ -654,6 +657,9 @@ static struct attribute *node_state_attrs[] = {
654#ifdef CONFIG_HIGHMEM 657#ifdef CONFIG_HIGHMEM
655 &node_state_attr[N_HIGH_MEMORY].attr.attr, 658 &node_state_attr[N_HIGH_MEMORY].attr.attr,
656#endif 659#endif
660#ifdef CONFIG_MOVABLE_NODE
661 &node_state_attr[N_MEMORY].attr.attr,
662#endif
657 &node_state_attr[N_CPU].attr.attr, 663 &node_state_attr[N_CPU].attr.attr,
658 NULL 664 NULL
659}; 665};
diff --git a/fs/buffer.c b/fs/buffer.c
index 6e9ed48064fc..c017a2dfb909 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -46,8 +46,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
46 46
47#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) 47#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48 48
49inline void 49void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
50init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51{ 50{
52 bh->b_end_io = handler; 51 bh->b_end_io = handler;
53 bh->b_private = private; 52 bh->b_private = private;
@@ -850,13 +849,10 @@ try_again:
850 if (!bh) 849 if (!bh)
851 goto no_grow; 850 goto no_grow;
852 851
853 bh->b_bdev = NULL;
854 bh->b_this_page = head; 852 bh->b_this_page = head;
855 bh->b_blocknr = -1; 853 bh->b_blocknr = -1;
856 head = bh; 854 head = bh;
857 855
858 bh->b_state = 0;
859 atomic_set(&bh->b_count, 0);
860 bh->b_size = size; 856 bh->b_size = size;
861 857
862 /* Link the buffer to its page */ 858 /* Link the buffer to its page */
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3e3422f7f0a4..310972b72a66 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1034,7 +1034,7 @@ int bdi_writeback_thread(void *data)
1034 while (!kthread_freezable_should_stop(NULL)) { 1034 while (!kthread_freezable_should_stop(NULL)) {
1035 /* 1035 /*
1036 * Remove own delayed wake-up timer, since we are already awake 1036 * Remove own delayed wake-up timer, since we are already awake
1037 * and we'll take care of the preriodic write-back. 1037 * and we'll take care of the periodic write-back.
1038 */ 1038 */
1039 del_timer(&wb->wakeup_timer); 1039 del_timer(&wb->wakeup_timer);
1040 1040
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 86c67eee439f..e96d4f18ca3a 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -249,7 +249,7 @@ static int kcore_update_ram(void)
249 /* Not inialized....update now */ 249 /* Not inialized....update now */
250 /* find out "max pfn" */ 250 /* find out "max pfn" */
251 end_pfn = 0; 251 end_pfn = 0;
252 for_each_node_state(nid, N_HIGH_MEMORY) { 252 for_each_node_state(nid, N_MEMORY) {
253 unsigned long node_end; 253 unsigned long node_end;
254 node_end = NODE_DATA(nid)->node_start_pfn + 254 node_end = NODE_DATA(nid)->node_start_pfn +
255 NODE_DATA(nid)->node_spanned_pages; 255 NODE_DATA(nid)->node_spanned_pages;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 90c63f9392a5..48775628abbf 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -643,7 +643,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
643 spinlock_t *ptl; 643 spinlock_t *ptl;
644 struct page *page; 644 struct page *page;
645 645
646 split_huge_page_pmd(walk->mm, pmd); 646 split_huge_page_pmd(vma, addr, pmd);
647 if (pmd_trans_unstable(pmd)) 647 if (pmd_trans_unstable(pmd))
648 return 0; 648 return 0;
649 649
@@ -1126,7 +1126,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1126 return NULL; 1126 return NULL;
1127 1127
1128 nid = page_to_nid(page); 1128 nid = page_to_nid(page);
1129 if (!node_isset(nid, node_states[N_HIGH_MEMORY])) 1129 if (!node_isset(nid, node_states[N_MEMORY]))
1130 return NULL; 1130 return NULL;
1131 1131
1132 return page; 1132 return page;
@@ -1279,7 +1279,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1279 if (md->writeback) 1279 if (md->writeback)
1280 seq_printf(m, " writeback=%lu", md->writeback); 1280 seq_printf(m, " writeback=%lu", md->writeback);
1281 1281
1282 for_each_node_state(n, N_HIGH_MEMORY) 1282 for_each_node_state(n, N_MEMORY)
1283 if (md->node[n]) 1283 if (md->node[n])
1284 seq_printf(m, " N%d=%lu", n, md->node[n]); 1284 seq_printf(m, " N%d=%lu", n, md->node[n]);
1285out: 1285out:
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index b36ce40bd1c6..284e80831d2c 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -449,6 +449,32 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 			unsigned long size);
 #endif
 
+#ifdef __HAVE_COLOR_ZERO_PAGE
+static inline int is_zero_pfn(unsigned long pfn)
+{
+	extern unsigned long zero_pfn;
+	unsigned long offset_from_zero_pfn = pfn - zero_pfn;
+	return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
+}
+
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+	return page_to_pfn(ZERO_PAGE(addr));
+}
+#else
+static inline int is_zero_pfn(unsigned long pfn)
+{
+	extern unsigned long zero_pfn;
+	return pfn == zero_pfn;
+}
+
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+	extern unsigned long zero_pfn;
+	return zero_pfn;
+}
+#endif
+
 #ifdef CONFIG_MMU
 
 #ifndef CONFIG_TRANSPARENT_HUGEPAGE
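
The generic helpers above let core mm code stay agnostic about whether an
architecture has a single empty_zero_page or a colored set of them (MIPS,
s390). A minimal sketch of the intended call pattern, much as
do_anonymous_page() in mm/memory.c uses it; the example_* names are invented
and not part of the patch:

#include <linux/mm.h>
#include <asm/pgtable.h>

/* Test whether a pte maps (one of) the zero page(s). */
static inline bool example_pte_is_zero_page(pte_t pte)
{
	return is_zero_pfn(pte_pfn(pte));
}

/* Install a read-only zero-page mapping for a read fault. */
static inline void example_install_zero_pte(struct mm_struct *mm,
					    struct vm_area_struct *vma,
					    unsigned long addr, pte_t *ptep)
{
	pte_t entry = pte_mkspecial(pfn_pte(my_zero_pfn(addr),
					    vma->vm_page_prot));

	set_pte_at(mm, addr, ptep, entry);
}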
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 7b74452c5317..3f778c27f825 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -137,9 +137,6 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
137#define alloc_bootmem_low_pages_node(pgdat, x) \ 137#define alloc_bootmem_low_pages_node(pgdat, x) \
138 __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) 138 __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
139 139
140extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
141 int flags);
142
143#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP 140#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
144extern void *alloc_remap(int nid, unsigned long size); 141extern void *alloc_remap(int nid, unsigned long size);
145#else 142#else
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 838320fc3d1d..8c8a60d29407 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -144,7 +144,7 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
144 return node_possible_map; 144 return node_possible_map;
145} 145}
146 146
147#define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY]) 147#define cpuset_current_mems_allowed (node_states[N_MEMORY])
148static inline void cpuset_init_current_mems_allowed(void) {} 148static inline void cpuset_init_current_mems_allowed(void) {}
149 149
150static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) 150static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 31e8041274f6..f74856e17e48 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -34,6 +34,7 @@ struct vm_area_struct;
 #define ___GFP_NO_KSWAPD	0x400000u
 #define ___GFP_OTHER_NODE	0x800000u
 #define ___GFP_WRITE		0x1000000u
+/* If the above are modified, __GFP_BITS_SHIFT may need updating */
 
 /*
  * GFP bitmasks..
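
The new one-line comment is there because ___GFP_WRITE (0x1000000u) is bit 24,
so the shift/mask derived further down must stay ahead of the highest flag
bit. Purely as an illustration of that invariant (the EXAMPLE_* names are not
kernel symbols; values mirror the header at the time of this patch):

/* Bit 24 is the highest GFP flag bit after this hunk ... */
#define EXAMPLE___GFP_WRITE	0x1000000u	/* == 1u << 24 */
/* ... so the derived shift and mask must cover 25 bits. */
#define EXAMPLE_GFP_BITS_SHIFT	25
#define EXAMPLE_GFP_BITS_MASK	((1u << EXAMPLE_GFP_BITS_SHIFT) - 1)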
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1af477552459..092dc5305a32 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -39,6 +39,7 @@ enum transparent_hugepage_flag {
39 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, 39 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
40 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, 40 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
41 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, 41 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
42 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
42#ifdef CONFIG_DEBUG_VM 43#ifdef CONFIG_DEBUG_VM
43 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, 44 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
44#endif 45#endif
@@ -78,6 +79,9 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
78 (transparent_hugepage_flags & \ 79 (transparent_hugepage_flags & \
79 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) && \ 80 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) && \
80 (__vma)->vm_flags & VM_HUGEPAGE)) 81 (__vma)->vm_flags & VM_HUGEPAGE))
82#define transparent_hugepage_use_zero_page() \
83 (transparent_hugepage_flags & \
84 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
81#ifdef CONFIG_DEBUG_VM 85#ifdef CONFIG_DEBUG_VM
82#define transparent_hugepage_debug_cow() \ 86#define transparent_hugepage_debug_cow() \
83 (transparent_hugepage_flags & \ 87 (transparent_hugepage_flags & \
@@ -95,12 +99,14 @@ extern int handle_pte_fault(struct mm_struct *mm,
95 struct vm_area_struct *vma, unsigned long address, 99 struct vm_area_struct *vma, unsigned long address,
96 pte_t *pte, pmd_t *pmd, unsigned int flags); 100 pte_t *pte, pmd_t *pmd, unsigned int flags);
97extern int split_huge_page(struct page *page); 101extern int split_huge_page(struct page *page);
98extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); 102extern void __split_huge_page_pmd(struct vm_area_struct *vma,
99#define split_huge_page_pmd(__mm, __pmd) \ 103 unsigned long address, pmd_t *pmd);
104#define split_huge_page_pmd(__vma, __address, __pmd) \
100 do { \ 105 do { \
101 pmd_t *____pmd = (__pmd); \ 106 pmd_t *____pmd = (__pmd); \
102 if (unlikely(pmd_trans_huge(*____pmd))) \ 107 if (unlikely(pmd_trans_huge(*____pmd))) \
103 __split_huge_page_pmd(__mm, ____pmd); \ 108 __split_huge_page_pmd(__vma, __address, \
109 ____pmd); \
104 } while (0) 110 } while (0)
105#define wait_split_huge_page(__anon_vma, __pmd) \ 111#define wait_split_huge_page(__anon_vma, __pmd) \
106 do { \ 112 do { \
@@ -110,6 +116,8 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
110 BUG_ON(pmd_trans_splitting(*____pmd) || \ 116 BUG_ON(pmd_trans_splitting(*____pmd) || \
111 pmd_trans_huge(*____pmd)); \ 117 pmd_trans_huge(*____pmd)); \
112 } while (0) 118 } while (0)
119extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
120 pmd_t *pmd);
113#if HPAGE_PMD_ORDER > MAX_ORDER 121#if HPAGE_PMD_ORDER > MAX_ORDER
114#error "hugepages can't be allocated by the buddy allocator" 122#error "hugepages can't be allocated by the buddy allocator"
115#endif 123#endif
@@ -177,10 +185,12 @@ static inline int split_huge_page(struct page *page)
 {
 	return 0;
 }
-#define split_huge_page_pmd(__mm, __pmd)	\
+#define split_huge_page_pmd(__vma, __address, __pmd)	\
 	do { } while (0)
 #define wait_split_huge_page(__anon_vma, __pmd)	\
 	do { } while (0)
+#define split_huge_page_pmd_mm(__mm, __address, __pmd)	\
+	do { } while (0)
 #define compound_trans_head(page) compound_head(page)
 static inline int hugepage_madvise(struct vm_area_struct *vma,
 				   unsigned long *vm_flags, int advice)
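
Tying the interface change together with the transhuge.txt hunk earlier: a
pagetable walker written against the new signature would look roughly like
the sketch below. The walker itself is invented; only the
split_huge_page_pmd() call reflects this patch:

#include <linux/mm.h>
#include <linux/huge_mm.h>

static void example_walk_one_pmd(struct vm_area_struct *vma, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_none_or_clear_bad(pgd))
		return;
	pud = pud_offset(pgd, addr);
	if (pud_none_or_clear_bad(pud))
		return;
	pmd = pmd_offset(pud, addr);
	/*
	 * New convention: pass the vma and faulting address so the split
	 * code has the context it needs (e.g. for the huge zero page
	 * introduced later in this commit).  Callers that only have an
	 * mm use split_huge_page_pmd_mm() instead, as vm86_32.c now does.
	 */
	split_huge_page_pmd(vma, addr, pmd);
	if (pmd_none_or_clear_bad(pmd))
		return;
	/* pte_offset_map_lock() and per-pte work would follow here */
}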
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 11ddc7ffeba8..e98a74c0c9c0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -181,7 +181,14 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
181 gfp_t gfp_mask, 181 gfp_t gfp_mask,
182 unsigned long *total_scanned); 182 unsigned long *total_scanned);
183 183
184void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); 184void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
185static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
186 enum vm_event_item idx)
187{
188 if (mem_cgroup_disabled())
189 return;
190 __mem_cgroup_count_vm_event(mm, idx);
191}
185#ifdef CONFIG_TRANSPARENT_HUGEPAGE 192#ifdef CONFIG_TRANSPARENT_HUGEPAGE
186void mem_cgroup_split_huge_fixup(struct page *head); 193void mem_cgroup_split_huge_fixup(struct page *head);
187#endif 194#endif
diff --git a/include/linux/memory.h b/include/linux/memory.h
index a09216d0dcc7..45e93b468878 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -54,6 +54,7 @@ struct memory_notify {
 	unsigned long start_pfn;
 	unsigned long nr_pages;
 	int status_change_nid_normal;
+	int status_change_nid_high;
 	int status_change_nid;
 };
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0c0b1d608a69..cd55dad56aac 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -460,17 +460,44 @@ struct zone {
 	unsigned long		zone_start_pfn;
 
 	/*
-	 * zone_start_pfn, spanned_pages and present_pages are all
-	 * protected by span_seqlock. It is a seqlock because it has
-	 * to be read outside of zone->lock, and it is done in the main
-	 * allocator path. But, it is written quite infrequently.
+	 * spanned_pages is the total pages spanned by the zone, including
+	 * holes, which is calculated as:
+	 *	spanned_pages = zone_end_pfn - zone_start_pfn;
 	 *
-	 * The lock is declared along with zone->lock because it is
+	 * present_pages is physical pages existing within the zone, which
+	 * is calculated as:
+	 *	present_pages = spanned_pages - absent_pages(pags in holes);
+	 *
+	 * managed_pages is present pages managed by the buddy system, which
+	 * is calculated as (reserved_pages includes pages allocated by the
+	 * bootmem allocator):
+	 *	managed_pages = present_pages - reserved_pages;
+	 *
+	 * So present_pages may be used by memory hotplug or memory power
+	 * management logic to figure out unmanaged pages by checking
+	 * (present_pages - managed_pages). And managed_pages should be used
+	 * by page allocator and vm scanner to calculate all kinds of watermarks
+	 * and thresholds.
+	 *
+	 * Locking rules:
+	 *
+	 * zone_start_pfn and spanned_pages are protected by span_seqlock.
+	 * It is a seqlock because it has to be read outside of zone->lock,
+	 * and it is done in the main allocator path. But, it is written
+	 * quite infrequently.
+	 *
+	 * The span_seq lock is declared along with zone->lock because it is
 	 * frequently read in proximity to zone->lock. It's good to
 	 * give them a chance of being in the same cacheline.
+	 *
+	 * Write access to present_pages and managed_pages at runtime should
+	 * be protected by lock_memory_hotplug()/unlock_memory_hotplug().
+	 * Any reader who can't tolerant drift of present_pages and
+	 * managed_pages should hold memory hotplug lock to get a stable value.
 	 */
-	unsigned long		spanned_pages;	/* total size, including holes */
-	unsigned long		present_pages;	/* amount of memory (excluding holes) */
+	unsigned long		spanned_pages;
+	unsigned long		present_pages;
+	unsigned long		managed_pages;
 
 	/*
 	 * rarely used fields:
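
A small illustration of the accounting rules spelled out in the new comment;
these helpers are not part of the patch and the names are invented:

#include <linux/mmzone.h>

/* Pages the zone spans physically but the buddy allocator never manages. */
static unsigned long example_zone_reserved_pages(struct zone *zone)
{
	return zone->present_pages - zone->managed_pages;
}

static unsigned long example_node_reserved_pages(pg_data_t *pgdat)
{
	unsigned long reserved = 0;
	int i;

	/*
	 * Readers who cannot tolerate drift should bracket this with
	 * lock_memory_hotplug()/unlock_memory_hotplug(), per the comment.
	 */
	for (i = 0; i < MAX_NR_ZONES; i++)
		reserved += example_zone_reserved_pages(pgdat->node_zones + i);
	return reserved;
}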
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 7afc36334d52..4e2cbfa640b7 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -380,6 +380,11 @@ enum node_states {
 #else
 	N_HIGH_MEMORY = N_NORMAL_MEMORY,
 #endif
+#ifdef CONFIG_MOVABLE_NODE
+	N_MEMORY,		/* The node has memory(regular, high, movable) */
+#else
+	N_MEMORY = N_HIGH_MEMORY,
+#endif
 	N_CPU,		/* The node has one or more cpus */
 	NR_NODE_STATES
 };
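
The later hunks in this commit (fs/proc/kcore.c, kernel/cpuset.c, init/main.c,
...) switch users from N_HIGH_MEMORY to the new N_MEMORY state. A minimal
sketch of the resulting idiom, modelled on the kcore.c change (the function
name is invented):

#include <linux/mmzone.h>
#include <linux/nodemask.h>

/* Walk every node that has any memory at all: regular, high or movable. */
static unsigned long example_total_spanned_pages(void)
{
	unsigned long pages = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		pages += NODE_DATA(nid)->node_spanned_pages;
	return pages;
}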
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 7d7fbe2ef782..6f54e40fa218 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -74,14 +74,9 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
74 const char __user *buf, size_t nbytes, loff_t *pos, 74 const char __user *buf, size_t nbytes, loff_t *pos,
75 int (*read_strategy)(unsigned long long val, char *s)); 75 int (*read_strategy)(unsigned long long val, char *s));
76 76
77typedef int (*write_strategy_fn)(const char *buf, unsigned long long *val);
78
79int res_counter_memparse_write_strategy(const char *buf, 77int res_counter_memparse_write_strategy(const char *buf,
80 unsigned long long *res); 78 unsigned long long *res);
81 79
82int res_counter_write(struct res_counter *counter, int member,
83 const char *buffer, write_strategy_fn write_strategy);
84
85/* 80/*
86 * the field descriptors. one for each member of res_counter 81 * the field descriptors. one for each member of res_counter
87 */ 82 */
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 3d3114594370..fe786f07d2bd 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -58,6 +58,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		THP_COLLAPSE_ALLOC,
 		THP_COLLAPSE_ALLOC_FAILED,
 		THP_SPLIT,
+		THP_ZERO_PAGE_ALLOC,
+		THP_ZERO_PAGE_ALLOC_FAILED,
 #endif
 		NR_VM_EVENT_ITEMS
 };
diff --git a/init/main.c b/init/main.c
index e33e09df3cbc..63ae904a99a8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -857,7 +857,7 @@ static void __init kernel_init_freeable(void)
857 /* 857 /*
858 * init can allocate pages on any node 858 * init can allocate pages on any node
859 */ 859 */
860 set_mems_allowed(node_states[N_HIGH_MEMORY]); 860 set_mems_allowed(node_states[N_MEMORY]);
861 /* 861 /*
862 * init can run on any cpu. 862 * init can run on any cpu.
863 */ 863 */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b017887d632f..7bb63eea6eb8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -302,10 +302,10 @@ static void guarantee_online_cpus(const struct cpuset *cs,
302 * are online, with memory. If none are online with memory, walk 302 * are online, with memory. If none are online with memory, walk
303 * up the cpuset hierarchy until we find one that does have some 303 * up the cpuset hierarchy until we find one that does have some
304 * online mems. If we get all the way to the top and still haven't 304 * online mems. If we get all the way to the top and still haven't
305 * found any online mems, return node_states[N_HIGH_MEMORY]. 305 * found any online mems, return node_states[N_MEMORY].
306 * 306 *
307 * One way or another, we guarantee to return some non-empty subset 307 * One way or another, we guarantee to return some non-empty subset
308 * of node_states[N_HIGH_MEMORY]. 308 * of node_states[N_MEMORY].
309 * 309 *
310 * Call with callback_mutex held. 310 * Call with callback_mutex held.
311 */ 311 */
@@ -313,14 +313,14 @@ static void guarantee_online_cpus(const struct cpuset *cs,
313static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 313static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
314{ 314{
315 while (cs && !nodes_intersects(cs->mems_allowed, 315 while (cs && !nodes_intersects(cs->mems_allowed,
316 node_states[N_HIGH_MEMORY])) 316 node_states[N_MEMORY]))
317 cs = cs->parent; 317 cs = cs->parent;
318 if (cs) 318 if (cs)
319 nodes_and(*pmask, cs->mems_allowed, 319 nodes_and(*pmask, cs->mems_allowed,
320 node_states[N_HIGH_MEMORY]); 320 node_states[N_MEMORY]);
321 else 321 else
322 *pmask = node_states[N_HIGH_MEMORY]; 322 *pmask = node_states[N_MEMORY];
323 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); 323 BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
324} 324}
325 325
326/* 326/*
@@ -1100,7 +1100,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1100 return -ENOMEM; 1100 return -ENOMEM;
1101 1101
1102 /* 1102 /*
1103 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1103 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
1104 * it's read-only 1104 * it's read-only
1105 */ 1105 */
1106 if (cs == &top_cpuset) { 1106 if (cs == &top_cpuset) {
@@ -1122,7 +1122,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1122 goto done; 1122 goto done;
1123 1123
1124 if (!nodes_subset(trialcs->mems_allowed, 1124 if (!nodes_subset(trialcs->mems_allowed,
1125 node_states[N_HIGH_MEMORY])) { 1125 node_states[N_MEMORY])) {
1126 retval = -EINVAL; 1126 retval = -EINVAL;
1127 goto done; 1127 goto done;
1128 } 1128 }
@@ -2026,7 +2026,7 @@ static struct cpuset *cpuset_next(struct list_head *queue)
2026 * before dropping down to the next. It always processes a node before 2026 * before dropping down to the next. It always processes a node before
2027 * any of its children. 2027 * any of its children.
2028 * 2028 *
2029 * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY 2029 * In the case of memory hot-unplug, it will remove nodes from N_MEMORY
2030 * if all present pages from a node are offlined. 2030 * if all present pages from a node are offlined.
2031 */ 2031 */
2032static void 2032static void
@@ -2065,7 +2065,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2065 2065
2066 /* Continue past cpusets with all mems online */ 2066 /* Continue past cpusets with all mems online */
2067 if (nodes_subset(cp->mems_allowed, 2067 if (nodes_subset(cp->mems_allowed,
2068 node_states[N_HIGH_MEMORY])) 2068 node_states[N_MEMORY]))
2069 continue; 2069 continue;
2070 2070
2071 oldmems = cp->mems_allowed; 2071 oldmems = cp->mems_allowed;
@@ -2073,7 +2073,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2073 /* Remove offline mems from this cpuset. */ 2073 /* Remove offline mems from this cpuset. */
2074 mutex_lock(&callback_mutex); 2074 mutex_lock(&callback_mutex);
2075 nodes_and(cp->mems_allowed, cp->mems_allowed, 2075 nodes_and(cp->mems_allowed, cp->mems_allowed,
2076 node_states[N_HIGH_MEMORY]); 2076 node_states[N_MEMORY]);
2077 mutex_unlock(&callback_mutex); 2077 mutex_unlock(&callback_mutex);
2078 2078
2079 /* Move tasks from the empty cpuset to a parent */ 2079 /* Move tasks from the empty cpuset to a parent */
@@ -2126,8 +2126,8 @@ void cpuset_update_active_cpus(bool cpu_online)
2126 2126
2127#ifdef CONFIG_MEMORY_HOTPLUG 2127#ifdef CONFIG_MEMORY_HOTPLUG
2128/* 2128/*
2129 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2129 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
2130 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. 2130 * Call this routine anytime after node_states[N_MEMORY] changes.
2131 * See cpuset_update_active_cpus() for CPU hotplug handling. 2131 * See cpuset_update_active_cpus() for CPU hotplug handling.
2132 */ 2132 */
2133static int cpuset_track_online_nodes(struct notifier_block *self, 2133static int cpuset_track_online_nodes(struct notifier_block *self,
@@ -2140,7 +2140,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2140 case MEM_ONLINE: 2140 case MEM_ONLINE:
2141 oldmems = top_cpuset.mems_allowed; 2141 oldmems = top_cpuset.mems_allowed;
2142 mutex_lock(&callback_mutex); 2142 mutex_lock(&callback_mutex);
2143 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2143 top_cpuset.mems_allowed = node_states[N_MEMORY];
2144 mutex_unlock(&callback_mutex); 2144 mutex_unlock(&callback_mutex);
2145 update_tasks_nodemask(&top_cpuset, &oldmems, NULL); 2145 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2146 break; 2146 break;
@@ -2169,7 +2169,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2169void __init cpuset_init_smp(void) 2169void __init cpuset_init_smp(void)
2170{ 2170{
2171 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2171 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2172 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2172 top_cpuset.mems_allowed = node_states[N_MEMORY];
2173 2173
2174 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2174 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2175 2175
@@ -2237,7 +2237,7 @@ void cpuset_init_current_mems_allowed(void)
2237 * 2237 *
2238 * Description: Returns the nodemask_t mems_allowed of the cpuset 2238 * Description: Returns the nodemask_t mems_allowed of the cpuset
2239 * attached to the specified @tsk. Guaranteed to return some non-empty 2239 * attached to the specified @tsk. Guaranteed to return some non-empty
2240 * subset of node_states[N_HIGH_MEMORY], even if this means going outside the 2240 * subset of node_states[N_MEMORY], even if this means going outside the
2241 * tasks cpuset. 2241 * tasks cpuset.
2242 **/ 2242 **/
2243 2243
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 29fb60caecb5..691dc2ef9baf 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -428,7 +428,7 @@ int kthreadd(void *unused)
428 set_task_comm(tsk, "kthreadd"); 428 set_task_comm(tsk, "kthreadd");
429 ignore_signals(tsk); 429 ignore_signals(tsk);
430 set_cpus_allowed_ptr(tsk, cpu_all_mask); 430 set_cpus_allowed_ptr(tsk, cpu_all_mask);
431 set_mems_allowed(node_states[N_HIGH_MEMORY]); 431 set_mems_allowed(node_states[N_MEMORY]);
432 432
433 current->flags |= PF_NOFREEZE; 433 current->flags |= PF_NOFREEZE;
434 434
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ad581aa2369a..3920d593e63c 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -192,25 +192,3 @@ int res_counter_memparse_write_strategy(const char *buf,
192 *res = PAGE_ALIGN(*res); 192 *res = PAGE_ALIGN(*res);
193 return 0; 193 return 0;
194} 194}
195
196int res_counter_write(struct res_counter *counter, int member,
197 const char *buf, write_strategy_fn write_strategy)
198{
199 char *end;
200 unsigned long flags;
201 unsigned long long tmp, *val;
202
203 if (write_strategy) {
204 if (write_strategy(buf, &tmp))
205 return -EINVAL;
206 } else {
207 tmp = simple_strtoull(buf, &end, 10);
208 if (*end != '\0')
209 return -EINVAL;
210 }
211 spin_lock_irqsave(&counter->lock, flags);
212 val = res_counter_member(counter, member);
213 *val = tmp;
214 spin_unlock_irqrestore(&counter->lock, flags);
215 return 0;
216}
diff --git a/mm/Kconfig b/mm/Kconfig
index e6651c5de14f..71259e052ce8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -143,6 +143,14 @@ config NO_BOOTMEM
143config MEMORY_ISOLATION 143config MEMORY_ISOLATION
144 boolean 144 boolean
145 145
146config MOVABLE_NODE
147 boolean "Enable to assign a node which has only movable memory"
148 depends on HAVE_MEMBLOCK
149 depends on NO_BOOTMEM
150 depends on X86_64
151 depends on NUMA
152 depends on BROKEN
153
146# eventually, we can have this option just 'select SPARSEMEM' 154# eventually, we can have this option just 'select SPARSEMEM'
147config MEMORY_HOTPLUG 155config MEMORY_HOTPLUG
148 bool "Allow for memory hot-add" 156 bool "Allow for memory hot-add"
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ecc45958ac0c..1324cd74faec 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -229,6 +229,22 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
229 return count; 229 return count;
230} 230}
231 231
232static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
233{
234 struct zone *z;
235
236 /*
237 * In free_area_init_core(), highmem zone's managed_pages is set to
238 * present_pages, and bootmem allocator doesn't allocate from highmem
239 * zones. So there's no need to recalculate managed_pages because all
240 * highmem pages will be managed by the buddy system. Here highmem
241 * zone also includes highmem movable zone.
242 */
243 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
244 if (!is_highmem(z))
245 z->managed_pages = 0;
246}
247
232/** 248/**
233 * free_all_bootmem_node - release a node's free pages to the buddy allocator 249 * free_all_bootmem_node - release a node's free pages to the buddy allocator
234 * @pgdat: node to be released 250 * @pgdat: node to be released
@@ -238,6 +254,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
238unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 254unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
239{ 255{
240 register_page_bootmem_info_node(pgdat); 256 register_page_bootmem_info_node(pgdat);
257 reset_node_lowmem_managed_pages(pgdat);
241 return free_all_bootmem_core(pgdat->bdata); 258 return free_all_bootmem_core(pgdat->bdata);
242} 259}
243 260
@@ -250,6 +267,10 @@ unsigned long __init free_all_bootmem(void)
250{ 267{
251 unsigned long total_pages = 0; 268 unsigned long total_pages = 0;
252 bootmem_data_t *bdata; 269 bootmem_data_t *bdata;
270 struct pglist_data *pgdat;
271
272 for_each_online_pgdat(pgdat)
273 reset_node_lowmem_managed_pages(pgdat);
253 274
254 list_for_each_entry(bdata, &bdata_list, list) 275 list_for_each_entry(bdata, &bdata_list, list)
255 total_pages += free_all_bootmem_core(bdata); 276 total_pages += free_all_bootmem_core(bdata);
@@ -439,12 +460,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
439 return mark_bootmem(start, end, 1, flags); 460 return mark_bootmem(start, end, 1, flags);
440} 461}
441 462
442int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
443 int flags)
444{
445 return reserve_bootmem(phys, len, flags);
446}
447
448static unsigned long __init align_idx(struct bootmem_data *bdata, 463static unsigned long __init align_idx(struct bootmem_data *bdata,
449 unsigned long idx, unsigned long step) 464 unsigned long idx, unsigned long step)
450{ 465{
@@ -575,27 +590,6 @@ find_block:
575 return NULL; 590 return NULL;
576} 591}
577 592
578static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
579 unsigned long size, unsigned long align,
580 unsigned long goal, unsigned long limit)
581{
582 if (WARN_ON_ONCE(slab_is_available()))
583 return kzalloc(size, GFP_NOWAIT);
584
585#ifdef CONFIG_HAVE_ARCH_BOOTMEM
586 {
587 bootmem_data_t *p_bdata;
588
589 p_bdata = bootmem_arch_preferred_node(bdata, size, align,
590 goal, limit);
591 if (p_bdata)
592 return alloc_bootmem_bdata(p_bdata, size, align,
593 goal, limit);
594 }
595#endif
596 return NULL;
597}
598
599static void * __init alloc_bootmem_core(unsigned long size, 593static void * __init alloc_bootmem_core(unsigned long size,
600 unsigned long align, 594 unsigned long align,
601 unsigned long goal, 595 unsigned long goal,
@@ -604,9 +598,8 @@ static void * __init alloc_bootmem_core(unsigned long size,
604 bootmem_data_t *bdata; 598 bootmem_data_t *bdata;
605 void *region; 599 void *region;
606 600
607 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); 601 if (WARN_ON_ONCE(slab_is_available()))
608 if (region) 602 return kzalloc(size, GFP_NOWAIT);
609 return region;
610 603
611 list_for_each_entry(bdata, &bdata_list, list) { 604 list_for_each_entry(bdata, &bdata_list, list) {
612 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) 605 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
@@ -704,11 +697,9 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
704{ 697{
705 void *ptr; 698 void *ptr;
706 699
700 if (WARN_ON_ONCE(slab_is_available()))
701 return kzalloc(size, GFP_NOWAIT);
707again: 702again:
708 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
709 align, goal, limit);
710 if (ptr)
711 return ptr;
712 703
713 /* do not panic in alloc_bootmem_bdata() */ 704 /* do not panic in alloc_bootmem_bdata() */
714 if (limit && goal + size > limit) 705 if (limit && goal + size > limit)
diff --git a/mm/compaction.c b/mm/compaction.c
index d24dd2d7bad4..129791218226 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -215,60 +215,6 @@ static bool suitable_migration_target(struct page *page)
215 return false; 215 return false;
216} 216}
217 217
218static void compact_capture_page(struct compact_control *cc)
219{
220 unsigned long flags;
221 int mtype, mtype_low, mtype_high;
222
223 if (!cc->page || *cc->page)
224 return;
225
226 /*
227 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
228 * regardless of the migratetype of the freelist is is captured from.
229 * This is fine because the order for a high-order MIGRATE_MOVABLE
230 * allocation is typically at least a pageblock size and overall
231 * fragmentation is not impaired. Other allocation types must
232 * capture pages from their own migratelist because otherwise they
233 * could pollute other pageblocks like MIGRATE_MOVABLE with
234 * difficult to move pages and making fragmentation worse overall.
235 */
236 if (cc->migratetype == MIGRATE_MOVABLE) {
237 mtype_low = 0;
238 mtype_high = MIGRATE_PCPTYPES;
239 } else {
240 mtype_low = cc->migratetype;
241 mtype_high = cc->migratetype + 1;
242 }
243
244 /* Speculatively examine the free lists without zone lock */
245 for (mtype = mtype_low; mtype < mtype_high; mtype++) {
246 int order;
247 for (order = cc->order; order < MAX_ORDER; order++) {
248 struct page *page;
249 struct free_area *area;
250 area = &(cc->zone->free_area[order]);
251 if (list_empty(&area->free_list[mtype]))
252 continue;
253
254 /* Take the lock and attempt capture of the page */
255 if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
256 return;
257 if (!list_empty(&area->free_list[mtype])) {
258 page = list_entry(area->free_list[mtype].next,
259 struct page, lru);
260 if (capture_free_page(page, cc->order, mtype)) {
261 spin_unlock_irqrestore(&cc->zone->lock,
262 flags);
263 *cc->page = page;
264 return;
265 }
266 }
267 spin_unlock_irqrestore(&cc->zone->lock, flags);
268 }
269 }
270}
271
272/* 218/*
273 * Isolate free pages onto a private freelist. Caller must hold zone->lock. 219 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
274 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free 220 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
@@ -953,6 +899,60 @@ unsigned long compaction_suitable(struct zone *zone, int order)
953 return COMPACT_CONTINUE; 899 return COMPACT_CONTINUE;
954} 900}
955 901
902static void compact_capture_page(struct compact_control *cc)
903{
904 unsigned long flags;
905 int mtype, mtype_low, mtype_high;
906
907 if (!cc->page || *cc->page)
908 return;
909
910 /*
911 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
912 * regardless of the migratetype of the freelist is is captured from.
913 * This is fine because the order for a high-order MIGRATE_MOVABLE
914 * allocation is typically at least a pageblock size and overall
915 * fragmentation is not impaired. Other allocation types must
916 * capture pages from their own migratelist because otherwise they
917 * could pollute other pageblocks like MIGRATE_MOVABLE with
918 * difficult to move pages and making fragmentation worse overall.
919 */
920 if (cc->migratetype == MIGRATE_MOVABLE) {
921 mtype_low = 0;
922 mtype_high = MIGRATE_PCPTYPES;
923 } else {
924 mtype_low = cc->migratetype;
925 mtype_high = cc->migratetype + 1;
926 }
927
928 /* Speculatively examine the free lists without zone lock */
929 for (mtype = mtype_low; mtype < mtype_high; mtype++) {
930 int order;
931 for (order = cc->order; order < MAX_ORDER; order++) {
932 struct page *page;
933 struct free_area *area;
934 area = &(cc->zone->free_area[order]);
935 if (list_empty(&area->free_list[mtype]))
936 continue;
937
938 /* Take the lock and attempt capture of the page */
939 if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
940 return;
941 if (!list_empty(&area->free_list[mtype])) {
942 page = list_entry(area->free_list[mtype].next,
943 struct page, lru);
944 if (capture_free_page(page, cc->order, mtype)) {
945 spin_unlock_irqrestore(&cc->zone->lock,
946 flags);
947 *cc->page = page;
948 return;
949 }
950 }
951 spin_unlock_irqrestore(&cc->zone->lock, flags);
952 }
953 }
954}
955
956static int compact_zone(struct zone *zone, struct compact_control *cc) 956static int compact_zone(struct zone *zone, struct compact_control *cc)
957{ 957{
958 int ret; 958 int ret;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5f902e20e8c0..827d9c813051 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,12 +12,14 @@
12#include <linux/mmu_notifier.h> 12#include <linux/mmu_notifier.h>
13#include <linux/rmap.h> 13#include <linux/rmap.h>
14#include <linux/swap.h> 14#include <linux/swap.h>
15#include <linux/shrinker.h>
15#include <linux/mm_inline.h> 16#include <linux/mm_inline.h>
16#include <linux/kthread.h> 17#include <linux/kthread.h>
17#include <linux/khugepaged.h> 18#include <linux/khugepaged.h>
18#include <linux/freezer.h> 19#include <linux/freezer.h>
19#include <linux/mman.h> 20#include <linux/mman.h>
20#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22
21#include <asm/tlb.h> 23#include <asm/tlb.h>
22#include <asm/pgalloc.h> 24#include <asm/pgalloc.h>
23#include "internal.h" 25#include "internal.h"
@@ -37,7 +39,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
37 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| 39 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
38#endif 40#endif
39 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| 41 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
40 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 42 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
43 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
41 44
42/* default scan 8*512 pte (or vmas) every 30 second */ 45/* default scan 8*512 pte (or vmas) every 30 second */
43static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; 46static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
@@ -159,6 +162,77 @@ static int start_khugepaged(void)
159 return err; 162 return err;
160} 163}
161 164
165static atomic_t huge_zero_refcount;
166static unsigned long huge_zero_pfn __read_mostly;
167
168static inline bool is_huge_zero_pfn(unsigned long pfn)
169{
170 unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
171 return zero_pfn && pfn == zero_pfn;
172}
173
174static inline bool is_huge_zero_pmd(pmd_t pmd)
175{
176 return is_huge_zero_pfn(pmd_pfn(pmd));
177}
178
179static unsigned long get_huge_zero_page(void)
180{
181 struct page *zero_page;
182retry:
183 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
184 return ACCESS_ONCE(huge_zero_pfn);
185
186 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
187 HPAGE_PMD_ORDER);
188 if (!zero_page) {
189 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
190 return 0;
191 }
192 count_vm_event(THP_ZERO_PAGE_ALLOC);
193 preempt_disable();
194 if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
195 preempt_enable();
196 __free_page(zero_page);
197 goto retry;
198 }
199
200 /* We take additional reference here. It will be put back by shrinker */
201 atomic_set(&huge_zero_refcount, 2);
202 preempt_enable();
203 return ACCESS_ONCE(huge_zero_pfn);
204}
205
206static void put_huge_zero_page(void)
207{
208 /*
209 * Counter should never go to zero here. Only shrinker can put
210 * last reference.
211 */
212 BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
213}
214
215static int shrink_huge_zero_page(struct shrinker *shrink,
216 struct shrink_control *sc)
217{
218 if (!sc->nr_to_scan)
219 /* we can free zero page only if last reference remains */
220 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
221
222 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
223 unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
224 BUG_ON(zero_pfn == 0);
225 __free_page(__pfn_to_page(zero_pfn));
226 }
227
228 return 0;
229}
230
231static struct shrinker huge_zero_page_shrinker = {
232 .shrink = shrink_huge_zero_page,
233 .seeks = DEFAULT_SEEKS,
234};
235
162#ifdef CONFIG_SYSFS 236#ifdef CONFIG_SYSFS
163 237
164static ssize_t double_flag_show(struct kobject *kobj, 238static ssize_t double_flag_show(struct kobject *kobj,
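
The block added above gives the kernel a single huge zero page that is allocated lazily, published with cmpxchg, kept alive by a reference count, and handed back to the allocator only by the registered shrinker once the shrinker-held reference is the last one left. A rough userspace analogue of that lifecycle, using C11 atomics and a calloc'ed buffer instead of a compound page, is sketched below; every identifier in it is made up for illustration and none of it is kernel code.

    /* Illustrative analogue of get/put_huge_zero_page() and its shrinker. */
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define ZERO_BUF_SIZE (2UL * 1024 * 1024)    /* stand-in for HPAGE_PMD_SIZE */

    static atomic_uintptr_t zero_buf;            /* 0 means "not allocated yet" */
    static atomic_int zero_refcount;

    static void *get_zero_buf(void)
    {
        for (;;) {
            /* Fast path: take a reference if the buffer already exists
             * (the atomic_inc_not_zero() step in the patch). */
            int old = atomic_load(&zero_refcount);
            while (old > 0) {
                if (atomic_compare_exchange_weak(&zero_refcount, &old, old + 1))
                    return (void *)atomic_load(&zero_buf);
            }

            void *buf = calloc(1, ZERO_BUF_SIZE);    /* zeroed, like __GFP_ZERO */
            if (!buf)
                return NULL;                         /* allocation failed */

            uintptr_t expected = 0;
            if (atomic_compare_exchange_strong(&zero_buf, &expected,
                                               (uintptr_t)buf)) {
                /* One reference for the caller, one kept for the reclaimer. */
                atomic_store(&zero_refcount, 2);
                return buf;
            }
            free(buf);        /* lost the publication race; retry fast path */
        }
    }

    static void put_zero_buf(void)
    {
        /* Ordinary callers never drop the last reference. */
        atomic_fetch_sub(&zero_refcount, 1);
    }

    /* Memory-pressure callback, playing the role of shrink_huge_zero_page(). */
    static void reclaim_zero_buf(void)
    {
        int expected = 1;
        if (atomic_compare_exchange_strong(&zero_refcount, &expected, 0)) {
            void *buf = (void *)atomic_exchange(&zero_buf, (uintptr_t)0);
            free(buf);
        }
    }

    int main(void)
    {
        void *z = get_zero_buf();    /* refcount becomes 2 */
        if (z)
            put_zero_buf();          /* caller's reference gone, reclaimer's left */
        reclaim_zero_buf();          /* last reference dropped, buffer freed */
        return 0;
    }
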
@@ -284,6 +358,20 @@ static ssize_t defrag_store(struct kobject *kobj,
284static struct kobj_attribute defrag_attr = 358static struct kobj_attribute defrag_attr =
285 __ATTR(defrag, 0644, defrag_show, defrag_store); 359 __ATTR(defrag, 0644, defrag_show, defrag_store);
286 360
361static ssize_t use_zero_page_show(struct kobject *kobj,
362 struct kobj_attribute *attr, char *buf)
363{
364 return single_flag_show(kobj, attr, buf,
365 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
366}
367static ssize_t use_zero_page_store(struct kobject *kobj,
368 struct kobj_attribute *attr, const char *buf, size_t count)
369{
370 return single_flag_store(kobj, attr, buf, count,
371 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
372}
373static struct kobj_attribute use_zero_page_attr =
374 __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
287#ifdef CONFIG_DEBUG_VM 375#ifdef CONFIG_DEBUG_VM
288static ssize_t debug_cow_show(struct kobject *kobj, 376static ssize_t debug_cow_show(struct kobject *kobj,
289 struct kobj_attribute *attr, char *buf) 377 struct kobj_attribute *attr, char *buf)
@@ -305,6 +393,7 @@ static struct kobj_attribute debug_cow_attr =
305static struct attribute *hugepage_attr[] = { 393static struct attribute *hugepage_attr[] = {
306 &enabled_attr.attr, 394 &enabled_attr.attr,
307 &defrag_attr.attr, 395 &defrag_attr.attr,
396 &use_zero_page_attr.attr,
308#ifdef CONFIG_DEBUG_VM 397#ifdef CONFIG_DEBUG_VM
309 &debug_cow_attr.attr, 398 &debug_cow_attr.attr,
310#endif 399#endif
@@ -550,6 +639,8 @@ static int __init hugepage_init(void)
550 goto out; 639 goto out;
551 } 640 }
552 641
642 register_shrinker(&huge_zero_page_shrinker);
643
553 /* 644 /*
554 * By default disable transparent hugepages on smaller systems, 645 * By default disable transparent hugepages on smaller systems,
555 * where the extra memory used could hurt more than TLB overhead 646 * where the extra memory used could hurt more than TLB overhead
@@ -678,6 +769,22 @@ static inline struct page *alloc_hugepage(int defrag)
678} 769}
679#endif 770#endif
680 771
772static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
773 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
774 unsigned long zero_pfn)
775{
776 pmd_t entry;
777 if (!pmd_none(*pmd))
778 return false;
779 entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
780 entry = pmd_wrprotect(entry);
781 entry = pmd_mkhuge(entry);
782 set_pmd_at(mm, haddr, pmd, entry);
783 pgtable_trans_huge_deposit(mm, pgtable);
784 mm->nr_ptes++;
785 return true;
786}
787
681int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 788int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
682 unsigned long address, pmd_t *pmd, 789 unsigned long address, pmd_t *pmd,
683 unsigned int flags) 790 unsigned int flags)
@@ -691,6 +798,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
691 return VM_FAULT_OOM; 798 return VM_FAULT_OOM;
692 if (unlikely(khugepaged_enter(vma))) 799 if (unlikely(khugepaged_enter(vma)))
693 return VM_FAULT_OOM; 800 return VM_FAULT_OOM;
801 if (!(flags & FAULT_FLAG_WRITE) &&
802 transparent_hugepage_use_zero_page()) {
803 pgtable_t pgtable;
804 unsigned long zero_pfn;
805 bool set;
806 pgtable = pte_alloc_one(mm, haddr);
807 if (unlikely(!pgtable))
808 return VM_FAULT_OOM;
809 zero_pfn = get_huge_zero_page();
810 if (unlikely(!zero_pfn)) {
811 pte_free(mm, pgtable);
812 count_vm_event(THP_FAULT_FALLBACK);
813 goto out;
814 }
815 spin_lock(&mm->page_table_lock);
816 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
817 zero_pfn);
818 spin_unlock(&mm->page_table_lock);
819 if (!set) {
820 pte_free(mm, pgtable);
821 put_huge_zero_page();
822 }
823 return 0;
824 }
694 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 825 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
695 vma, haddr, numa_node_id(), 0); 826 vma, haddr, numa_node_id(), 0);
696 if (unlikely(!page)) { 827 if (unlikely(!page)) {
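
With the hunk above, a first read fault on an anonymous PMD-sized range can map the shared huge zero page write-protected instead of allocating a PMD-sized page (2MB on x86), provided the fault is not a write and the use_zero_page flag (toggled through the sysfs attribute added earlier in this patch, conventionally found under /sys/kernel/mm/transparent_hugepage/) is set. The small program below only generates that read-before-write access pattern from userspace; whether the huge zero page actually backs the mapping still depends on the kernel configuration, alignment and the knob, so treat it as a way to exercise the path, not as proof of it.

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    #define LEN (2UL * 1024 * 1024)    /* one PMD-sized region (not aligned here) */

    int main(void)
    {
        unsigned char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        /* Read faults first: this is the case the zero-page fast path serves. */
        unsigned long sum = 0;
        for (size_t i = 0; i < LEN; i += 4096)
            sum += p[i];
        printf("sum over untouched region: %lu\n", sum);    /* always 0 */

        /* First write: now the kernel must hand out private memory, via
         * do_huge_pmd_wp_page() or one of its fallbacks. */
        memset(p, 0xff, 4096);

        munmap(p, LEN);
        return 0;
    }
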
@@ -755,6 +886,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
755 pte_free(dst_mm, pgtable); 886 pte_free(dst_mm, pgtable);
756 goto out_unlock; 887 goto out_unlock;
757 } 888 }
889 /*
890 * mm->page_table_lock is enough to be sure that huge zero pmd is not
891 * under splitting since we don't split the page itself, only pmd to
892 * a page table.
893 */
894 if (is_huge_zero_pmd(pmd)) {
895 unsigned long zero_pfn;
896 bool set;
897 /*
898 * get_huge_zero_page() will never allocate a new page here,
899 * since we already have a zero page to copy. It just takes a
900 * reference.
901 */
902 zero_pfn = get_huge_zero_page();
903 set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
904 zero_pfn);
905 BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
906 ret = 0;
907 goto out_unlock;
908 }
758 if (unlikely(pmd_trans_splitting(pmd))) { 909 if (unlikely(pmd_trans_splitting(pmd))) {
759 /* split huge page running from under us */ 910 /* split huge page running from under us */
760 spin_unlock(&src_mm->page_table_lock); 911 spin_unlock(&src_mm->page_table_lock);
@@ -806,6 +957,80 @@ unlock:
806 spin_unlock(&mm->page_table_lock); 957 spin_unlock(&mm->page_table_lock);
807} 958}
808 959
960static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
961 struct vm_area_struct *vma, unsigned long address,
962 pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
963{
964 pgtable_t pgtable;
965 pmd_t _pmd;
966 struct page *page;
967 int i, ret = 0;
968 unsigned long mmun_start; /* For mmu_notifiers */
969 unsigned long mmun_end; /* For mmu_notifiers */
970
971 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
972 if (!page) {
973 ret |= VM_FAULT_OOM;
974 goto out;
975 }
976
977 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
978 put_page(page);
979 ret |= VM_FAULT_OOM;
980 goto out;
981 }
982
983 clear_user_highpage(page, address);
984 __SetPageUptodate(page);
985
986 mmun_start = haddr;
987 mmun_end = haddr + HPAGE_PMD_SIZE;
988 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
989
990 spin_lock(&mm->page_table_lock);
991 if (unlikely(!pmd_same(*pmd, orig_pmd)))
992 goto out_free_page;
993
994 pmdp_clear_flush(vma, haddr, pmd);
995 /* leave pmd empty until pte is filled */
996
997 pgtable = pgtable_trans_huge_withdraw(mm);
998 pmd_populate(mm, &_pmd, pgtable);
999
1000 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1001 pte_t *pte, entry;
1002 if (haddr == (address & PAGE_MASK)) {
1003 entry = mk_pte(page, vma->vm_page_prot);
1004 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1005 page_add_new_anon_rmap(page, vma, haddr);
1006 } else {
1007 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
1008 entry = pte_mkspecial(entry);
1009 }
1010 pte = pte_offset_map(&_pmd, haddr);
1011 VM_BUG_ON(!pte_none(*pte));
1012 set_pte_at(mm, haddr, pte, entry);
1013 pte_unmap(pte);
1014 }
1015 smp_wmb(); /* make pte visible before pmd */
1016 pmd_populate(mm, pmd, pgtable);
1017 spin_unlock(&mm->page_table_lock);
1018 put_huge_zero_page();
1019 inc_mm_counter(mm, MM_ANONPAGES);
1020
1021 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1022
1023 ret |= VM_FAULT_WRITE;
1024out:
1025 return ret;
1026out_free_page:
1027 spin_unlock(&mm->page_table_lock);
1028 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1029 mem_cgroup_uncharge_page(page);
1030 put_page(page);
1031 goto out;
1032}
1033
809static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 1034static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
810 struct vm_area_struct *vma, 1035 struct vm_area_struct *vma,
811 unsigned long address, 1036 unsigned long address,
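
do_huge_pmd_wp_zero_page_fallback() above covers a write fault on the huge zero page when no huge page can be allocated: the PMD is replaced by a page table in which only the faulting 4KB slot receives a freshly zeroed private page, while every other slot gets a special PTE that still points at the small zero page. The toy model below reproduces only that shape with a plain pointer array; the struct and function names are invented for the sketch and the error value stands in for VM_FAULT_OOM.

    #include <stdio.h>
    #include <stdlib.h>

    #define PTRS_PER_TABLE 512          /* HPAGE_PMD_NR analogue */
    #define SMALL_PAGE     4096

    static const unsigned char shared_zero_page[SMALL_PAGE];    /* always zero */

    struct fake_pmd {
        /* NULL means "still backed by the shared zero page, read-only". */
        unsigned char *pte[PTRS_PER_TABLE];
    };

    /* Write fault on the zero mapping with no huge page available:
     * populate one private page, leave the rest as zero-page entries. */
    static int wp_zero_fallback(struct fake_pmd *pmd, int fault_index)
    {
        unsigned char *page = calloc(1, SMALL_PAGE);
        if (!page)
            return -1;                  /* VM_FAULT_OOM analogue */
        pmd->pte[fault_index] = page;
        return 0;
    }

    static const unsigned char *page_for_read(const struct fake_pmd *pmd, int i)
    {
        return pmd->pte[i] ? pmd->pte[i] : shared_zero_page;
    }

    int main(void)
    {
        struct fake_pmd pmd = { { NULL } };

        if (wp_zero_fallback(&pmd, 7))
            return 1;
        pmd.pte[7][0] = 0xff;           /* the one writable slot */

        printf("slot 7: %d, slot 8: %d\n",
               page_for_read(&pmd, 7)[0], page_for_read(&pmd, 8)[0]);
        free(pmd.pte[7]);
        return 0;
    }
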
@@ -912,19 +1137,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
912 unsigned long address, pmd_t *pmd, pmd_t orig_pmd) 1137 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
913{ 1138{
914 int ret = 0; 1139 int ret = 0;
915 struct page *page, *new_page; 1140 struct page *page = NULL, *new_page;
916 unsigned long haddr; 1141 unsigned long haddr;
917 unsigned long mmun_start; /* For mmu_notifiers */ 1142 unsigned long mmun_start; /* For mmu_notifiers */
918 unsigned long mmun_end; /* For mmu_notifiers */ 1143 unsigned long mmun_end; /* For mmu_notifiers */
919 1144
920 VM_BUG_ON(!vma->anon_vma); 1145 VM_BUG_ON(!vma->anon_vma);
1146 haddr = address & HPAGE_PMD_MASK;
1147 if (is_huge_zero_pmd(orig_pmd))
1148 goto alloc;
921 spin_lock(&mm->page_table_lock); 1149 spin_lock(&mm->page_table_lock);
922 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1150 if (unlikely(!pmd_same(*pmd, orig_pmd)))
923 goto out_unlock; 1151 goto out_unlock;
924 1152
925 page = pmd_page(orig_pmd); 1153 page = pmd_page(orig_pmd);
926 VM_BUG_ON(!PageCompound(page) || !PageHead(page)); 1154 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
927 haddr = address & HPAGE_PMD_MASK;
928 if (page_mapcount(page) == 1) { 1155 if (page_mapcount(page) == 1) {
929 pmd_t entry; 1156 pmd_t entry;
930 entry = pmd_mkyoung(orig_pmd); 1157 entry = pmd_mkyoung(orig_pmd);
@@ -936,7 +1163,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
936 } 1163 }
937 get_page(page); 1164 get_page(page);
938 spin_unlock(&mm->page_table_lock); 1165 spin_unlock(&mm->page_table_lock);
939 1166alloc:
940 if (transparent_hugepage_enabled(vma) && 1167 if (transparent_hugepage_enabled(vma) &&
941 !transparent_hugepage_debug_cow()) 1168 !transparent_hugepage_debug_cow())
942 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 1169 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -946,24 +1173,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
946 1173
947 if (unlikely(!new_page)) { 1174 if (unlikely(!new_page)) {
948 count_vm_event(THP_FAULT_FALLBACK); 1175 count_vm_event(THP_FAULT_FALLBACK);
949 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 1176 if (is_huge_zero_pmd(orig_pmd)) {
950 pmd, orig_pmd, page, haddr); 1177 ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
951 if (ret & VM_FAULT_OOM) 1178 address, pmd, orig_pmd, haddr);
952 split_huge_page(page); 1179 } else {
953 put_page(page); 1180 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
1181 pmd, orig_pmd, page, haddr);
1182 if (ret & VM_FAULT_OOM)
1183 split_huge_page(page);
1184 put_page(page);
1185 }
954 goto out; 1186 goto out;
955 } 1187 }
956 count_vm_event(THP_FAULT_ALLOC); 1188 count_vm_event(THP_FAULT_ALLOC);
957 1189
958 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1190 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
959 put_page(new_page); 1191 put_page(new_page);
960 split_huge_page(page); 1192 if (page) {
961 put_page(page); 1193 split_huge_page(page);
1194 put_page(page);
1195 }
962 ret |= VM_FAULT_OOM; 1196 ret |= VM_FAULT_OOM;
963 goto out; 1197 goto out;
964 } 1198 }
965 1199
966 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 1200 if (is_huge_zero_pmd(orig_pmd))
1201 clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
1202 else
1203 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
967 __SetPageUptodate(new_page); 1204 __SetPageUptodate(new_page);
968 1205
969 mmun_start = haddr; 1206 mmun_start = haddr;
@@ -971,7 +1208,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
971 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1208 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
972 1209
973 spin_lock(&mm->page_table_lock); 1210 spin_lock(&mm->page_table_lock);
974 put_page(page); 1211 if (page)
1212 put_page(page);
975 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 1213 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
976 spin_unlock(&mm->page_table_lock); 1214 spin_unlock(&mm->page_table_lock);
977 mem_cgroup_uncharge_page(new_page); 1215 mem_cgroup_uncharge_page(new_page);
@@ -979,14 +1217,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
979 goto out_mn; 1217 goto out_mn;
980 } else { 1218 } else {
981 pmd_t entry; 1219 pmd_t entry;
982 VM_BUG_ON(!PageHead(page));
983 entry = mk_huge_pmd(new_page, vma); 1220 entry = mk_huge_pmd(new_page, vma);
984 pmdp_clear_flush(vma, haddr, pmd); 1221 pmdp_clear_flush(vma, haddr, pmd);
985 page_add_new_anon_rmap(new_page, vma, haddr); 1222 page_add_new_anon_rmap(new_page, vma, haddr);
986 set_pmd_at(mm, haddr, pmd, entry); 1223 set_pmd_at(mm, haddr, pmd, entry);
987 update_mmu_cache_pmd(vma, address, pmd); 1224 update_mmu_cache_pmd(vma, address, pmd);
988 page_remove_rmap(page); 1225 if (is_huge_zero_pmd(orig_pmd)) {
989 put_page(page); 1226 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
1227 put_huge_zero_page();
1228 } else {
1229 VM_BUG_ON(!PageHead(page));
1230 page_remove_rmap(page);
1231 put_page(page);
1232 }
990 ret |= VM_FAULT_WRITE; 1233 ret |= VM_FAULT_WRITE;
991 } 1234 }
992 spin_unlock(&mm->page_table_lock); 1235 spin_unlock(&mm->page_table_lock);
@@ -1055,15 +1298,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1055 pmd_t orig_pmd; 1298 pmd_t orig_pmd;
1056 pgtable = pgtable_trans_huge_withdraw(tlb->mm); 1299 pgtable = pgtable_trans_huge_withdraw(tlb->mm);
1057 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); 1300 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
1058 page = pmd_page(orig_pmd);
1059 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1301 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1060 page_remove_rmap(page); 1302 if (is_huge_zero_pmd(orig_pmd)) {
1061 VM_BUG_ON(page_mapcount(page) < 0); 1303 tlb->mm->nr_ptes--;
1062 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1304 spin_unlock(&tlb->mm->page_table_lock);
1063 VM_BUG_ON(!PageHead(page)); 1305 put_huge_zero_page();
1064 tlb->mm->nr_ptes--; 1306 } else {
1065 spin_unlock(&tlb->mm->page_table_lock); 1307 page = pmd_page(orig_pmd);
1066 tlb_remove_page(tlb, page); 1308 page_remove_rmap(page);
1309 VM_BUG_ON(page_mapcount(page) < 0);
1310 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1311 VM_BUG_ON(!PageHead(page));
1312 tlb->mm->nr_ptes--;
1313 spin_unlock(&tlb->mm->page_table_lock);
1314 tlb_remove_page(tlb, page);
1315 }
1067 pte_free(tlb->mm, pgtable); 1316 pte_free(tlb->mm, pgtable);
1068 ret = 1; 1317 ret = 1;
1069 } 1318 }
@@ -1135,6 +1384,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1135 pmd_t entry; 1384 pmd_t entry;
1136 entry = pmdp_get_and_clear(mm, addr, pmd); 1385 entry = pmdp_get_and_clear(mm, addr, pmd);
1137 entry = pmd_modify(entry, newprot); 1386 entry = pmd_modify(entry, newprot);
1387 BUG_ON(pmd_write(entry));
1138 set_pmd_at(mm, addr, pmd, entry); 1388 set_pmd_at(mm, addr, pmd, entry);
1139 spin_unlock(&vma->vm_mm->page_table_lock); 1389 spin_unlock(&vma->vm_mm->page_table_lock);
1140 ret = 1; 1390 ret = 1;
@@ -1477,6 +1727,7 @@ int split_huge_page(struct page *page)
1477 struct anon_vma *anon_vma; 1727 struct anon_vma *anon_vma;
1478 int ret = 1; 1728 int ret = 1;
1479 1729
1730 BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
1480 BUG_ON(!PageAnon(page)); 1731 BUG_ON(!PageAnon(page));
1481 anon_vma = page_lock_anon_vma(page); 1732 anon_vma = page_lock_anon_vma(page);
1482 if (!anon_vma) 1733 if (!anon_vma)
@@ -2336,19 +2587,65 @@ static int khugepaged(void *none)
2336 return 0; 2587 return 0;
2337} 2588}
2338 2589
2339void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) 2590static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2591 unsigned long haddr, pmd_t *pmd)
2592{
2593 struct mm_struct *mm = vma->vm_mm;
2594 pgtable_t pgtable;
2595 pmd_t _pmd;
2596 int i;
2597
2598 pmdp_clear_flush(vma, haddr, pmd);
2599 /* leave pmd empty until pte is filled */
2600
2601 pgtable = pgtable_trans_huge_withdraw(mm);
2602 pmd_populate(mm, &_pmd, pgtable);
2603
2604 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2605 pte_t *pte, entry;
2606 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
2607 entry = pte_mkspecial(entry);
2608 pte = pte_offset_map(&_pmd, haddr);
2609 VM_BUG_ON(!pte_none(*pte));
2610 set_pte_at(mm, haddr, pte, entry);
2611 pte_unmap(pte);
2612 }
2613 smp_wmb(); /* make pte visible before pmd */
2614 pmd_populate(mm, pmd, pgtable);
2615 put_huge_zero_page();
2616}
2617
2618void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
2619 pmd_t *pmd)
2340{ 2620{
2341 struct page *page; 2621 struct page *page;
2622 struct mm_struct *mm = vma->vm_mm;
2623 unsigned long haddr = address & HPAGE_PMD_MASK;
2624 unsigned long mmun_start; /* For mmu_notifiers */
2625 unsigned long mmun_end; /* For mmu_notifiers */
2626
2627 BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
2342 2628
2629 mmun_start = haddr;
2630 mmun_end = haddr + HPAGE_PMD_SIZE;
2631 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2343 spin_lock(&mm->page_table_lock); 2632 spin_lock(&mm->page_table_lock);
2344 if (unlikely(!pmd_trans_huge(*pmd))) { 2633 if (unlikely(!pmd_trans_huge(*pmd))) {
2345 spin_unlock(&mm->page_table_lock); 2634 spin_unlock(&mm->page_table_lock);
2635 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2636 return;
2637 }
2638 if (is_huge_zero_pmd(*pmd)) {
2639 __split_huge_zero_page_pmd(vma, haddr, pmd);
2640 spin_unlock(&mm->page_table_lock);
2641 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2346 return; 2642 return;
2347 } 2643 }
2348 page = pmd_page(*pmd); 2644 page = pmd_page(*pmd);
2349 VM_BUG_ON(!page_count(page)); 2645 VM_BUG_ON(!page_count(page));
2350 get_page(page); 2646 get_page(page);
2351 spin_unlock(&mm->page_table_lock); 2647 spin_unlock(&mm->page_table_lock);
2648 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2352 2649
2353 split_huge_page(page); 2650 split_huge_page(page);
2354 2651
@@ -2356,6 +2653,16 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2356 BUG_ON(pmd_trans_huge(*pmd)); 2653 BUG_ON(pmd_trans_huge(*pmd));
2357} 2654}
2358 2655
2656void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
2657 pmd_t *pmd)
2658{
2659 struct vm_area_struct *vma;
2660
2661 vma = find_vma(mm, address);
2662 BUG_ON(vma == NULL);
2663 split_huge_page_pmd(vma, address, pmd);
2664}
2665
2359static void split_huge_page_address(struct mm_struct *mm, 2666static void split_huge_page_address(struct mm_struct *mm,
2360 unsigned long address) 2667 unsigned long address)
2361{ 2668{
@@ -2370,7 +2677,7 @@ static void split_huge_page_address(struct mm_struct *mm,
2370 * Caller holds the mmap_sem write mode, so a huge pmd cannot 2677 * Caller holds the mmap_sem write mode, so a huge pmd cannot
2371 * materialize from under us. 2678 * materialize from under us.
2372 */ 2679 */
2373 split_huge_page_pmd(mm, pmd); 2680 split_huge_page_pmd_mm(mm, address, pmd);
2374} 2681}
2375 2682
2376void __vma_adjust_trans_huge(struct vm_area_struct *vma, 2683void __vma_adjust_trans_huge(struct vm_area_struct *vma,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 34f372ad89d0..88e7293b96bd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1057,7 +1057,7 @@ static void return_unused_surplus_pages(struct hstate *h,
1057 * on-line nodes with memory and will handle the hstate accounting. 1057 * on-line nodes with memory and will handle the hstate accounting.
1058 */ 1058 */
1059 while (nr_pages--) { 1059 while (nr_pages--) {
1060 if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) 1060 if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
1061 break; 1061 break;
1062 } 1062 }
1063} 1063}
@@ -1180,14 +1180,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1180int __weak alloc_bootmem_huge_page(struct hstate *h) 1180int __weak alloc_bootmem_huge_page(struct hstate *h)
1181{ 1181{
1182 struct huge_bootmem_page *m; 1182 struct huge_bootmem_page *m;
1183 int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 1183 int nr_nodes = nodes_weight(node_states[N_MEMORY]);
1184 1184
1185 while (nr_nodes) { 1185 while (nr_nodes) {
1186 void *addr; 1186 void *addr;
1187 1187
1188 addr = __alloc_bootmem_node_nopanic( 1188 addr = __alloc_bootmem_node_nopanic(
1189 NODE_DATA(hstate_next_node_to_alloc(h, 1189 NODE_DATA(hstate_next_node_to_alloc(h,
1190 &node_states[N_HIGH_MEMORY])), 1190 &node_states[N_MEMORY])),
1191 huge_page_size(h), huge_page_size(h), 0); 1191 huge_page_size(h), huge_page_size(h), 0);
1192 1192
1193 if (addr) { 1193 if (addr) {
@@ -1259,7 +1259,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1259 if (!alloc_bootmem_huge_page(h)) 1259 if (!alloc_bootmem_huge_page(h))
1260 break; 1260 break;
1261 } else if (!alloc_fresh_huge_page(h, 1261 } else if (!alloc_fresh_huge_page(h,
1262 &node_states[N_HIGH_MEMORY])) 1262 &node_states[N_MEMORY]))
1263 break; 1263 break;
1264 } 1264 }
1265 h->max_huge_pages = i; 1265 h->max_huge_pages = i;
@@ -1527,7 +1527,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1527 if (!(obey_mempolicy && 1527 if (!(obey_mempolicy &&
1528 init_nodemask_of_mempolicy(nodes_allowed))) { 1528 init_nodemask_of_mempolicy(nodes_allowed))) {
1529 NODEMASK_FREE(nodes_allowed); 1529 NODEMASK_FREE(nodes_allowed);
1530 nodes_allowed = &node_states[N_HIGH_MEMORY]; 1530 nodes_allowed = &node_states[N_MEMORY];
1531 } 1531 }
1532 } else if (nodes_allowed) { 1532 } else if (nodes_allowed) {
1533 /* 1533 /*
@@ -1537,11 +1537,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1537 count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 1537 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
1538 init_nodemask_of_node(nodes_allowed, nid); 1538 init_nodemask_of_node(nodes_allowed, nid);
1539 } else 1539 } else
1540 nodes_allowed = &node_states[N_HIGH_MEMORY]; 1540 nodes_allowed = &node_states[N_MEMORY];
1541 1541
1542 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); 1542 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
1543 1543
1544 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 1544 if (nodes_allowed != &node_states[N_MEMORY])
1545 NODEMASK_FREE(nodes_allowed); 1545 NODEMASK_FREE(nodes_allowed);
1546 1546
1547 return len; 1547 return len;
@@ -1844,7 +1844,7 @@ static void hugetlb_register_all_nodes(void)
1844{ 1844{
1845 int nid; 1845 int nid;
1846 1846
1847 for_each_node_state(nid, N_HIGH_MEMORY) { 1847 for_each_node_state(nid, N_MEMORY) {
1848 struct node *node = node_devices[nid]; 1848 struct node *node = node_devices[nid];
1849 if (node->dev.id == nid) 1849 if (node->dev.id == nid)
1850 hugetlb_register_node(node); 1850 hugetlb_register_node(node);
@@ -1939,8 +1939,8 @@ void __init hugetlb_add_hstate(unsigned order)
1939 for (i = 0; i < MAX_NUMNODES; ++i) 1939 for (i = 0; i < MAX_NUMNODES; ++i)
1940 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1940 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1941 INIT_LIST_HEAD(&h->hugepage_activelist); 1941 INIT_LIST_HEAD(&h->hugepage_activelist);
1942 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); 1942 h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
1943 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); 1943 h->next_nid_to_free = first_node(node_states[N_MEMORY]);
1944 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1944 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1945 huge_page_size(h)/1024); 1945 huge_page_size(h)/1024);
1946 /* 1946 /*
@@ -2035,11 +2035,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
2035 if (!(obey_mempolicy && 2035 if (!(obey_mempolicy &&
2036 init_nodemask_of_mempolicy(nodes_allowed))) { 2036 init_nodemask_of_mempolicy(nodes_allowed))) {
2037 NODEMASK_FREE(nodes_allowed); 2037 NODEMASK_FREE(nodes_allowed);
2038 nodes_allowed = &node_states[N_HIGH_MEMORY]; 2038 nodes_allowed = &node_states[N_MEMORY];
2039 } 2039 }
2040 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); 2040 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
2041 2041
2042 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 2042 if (nodes_allowed != &node_states[N_MEMORY])
2043 NODEMASK_FREE(nodes_allowed); 2043 NODEMASK_FREE(nodes_allowed);
2044 } 2044 }
2045out: 2045out:
@@ -2386,8 +2386,10 @@ again:
2386 /* 2386 /*
2387 * HWPoisoned hugepage is already unmapped and dropped reference 2387 * HWPoisoned hugepage is already unmapped and dropped reference
2388 */ 2388 */
2389 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) 2389 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
2390 pte_clear(mm, address, ptep);
2390 continue; 2391 continue;
2392 }
2391 2393
2392 page = pte_page(pte); 2394 page = pte_page(pte);
2393 /* 2395 /*
@@ -3170,7 +3172,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3170 3172
3171 spin_lock(&hugetlb_lock); 3173 spin_lock(&hugetlb_lock);
3172 if (is_hugepage_on_freelist(hpage)) { 3174 if (is_hugepage_on_freelist(hpage)) {
3173 list_del(&hpage->lru); 3175 /*
3176 * Hwpoisoned hugepage isn't linked to activelist or freelist,
3177 * but dangling hpage->lru can trigger list-debug warnings
3178 * (this happens when we call unpoison_memory() on it),
3179 * so let it point to itself with list_del_init().
3180 */
3181 list_del_init(&hpage->lru);
3174 set_page_refcounted(hpage); 3182 set_page_refcounted(hpage);
3175 h->free_huge_pages--; 3183 h->free_huge_pages--;
3176 h->free_huge_pages_node[nid]--; 3184 h->free_huge_pages_node[nid]--;
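
The hugetlb hunk just above swaps list_del() for list_del_init() when pulling a hwpoisoned huge page off the free list, because the page will sit outside both the freelist and the activelist and a later unpoison_memory() would otherwise find dangling lru pointers and trigger list-debug warnings. The standalone sketch below re-implements just enough of the list helpers to show the difference; it mirrors the kernel semantics except that the real list_del() also poisons the stale pointers.

    #include <stdbool.h>
    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

    static void list_add(struct list_head *new, struct list_head *head)
    {
        new->next = head->next;
        new->prev = head;
        head->next->prev = new;
        head->next = new;
    }

    static void __list_del(struct list_head *entry)
    {
        entry->prev->next = entry->next;
        entry->next->prev = entry->prev;
    }

    /* list_del(): unlink, leaving the entry's own pointers aimed at the
     * old neighbours (the kernel additionally poisons them). */
    static void list_del(struct list_head *entry) { __list_del(entry); }

    /* list_del_init(): unlink and turn the entry into an empty list. */
    static void list_del_init(struct list_head *entry)
    {
        __list_del(entry);
        INIT_LIST_HEAD(entry);
    }

    static bool list_empty(const struct list_head *h) { return h->next == h; }

    int main(void)
    {
        struct list_head freelist, hpage_lru;

        INIT_LIST_HEAD(&freelist);
        list_add(&hpage_lru, &freelist);
        list_del(&hpage_lru);               /* what hugetlb used to do */
        printf("after list_del:      empty=%d (still points at the old list)\n",
               list_empty(&hpage_lru));

        INIT_LIST_HEAD(&freelist);
        list_add(&hpage_lru, &freelist);
        list_del_init(&hpage_lru);          /* what the patch switches to */
        printf("after list_del_init: empty=%d (safe to link again later)\n",
               list_empty(&hpage_lru));
        return 0;
    }
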
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 12307b3838fb..6c055929c8cc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,6 +59,8 @@
59#include <trace/events/vmscan.h> 59#include <trace/events/vmscan.h>
60 60
61struct cgroup_subsys mem_cgroup_subsys __read_mostly; 61struct cgroup_subsys mem_cgroup_subsys __read_mostly;
62EXPORT_SYMBOL(mem_cgroup_subsys);
63
62#define MEM_CGROUP_RECLAIM_RETRIES 5 64#define MEM_CGROUP_RECLAIM_RETRIES 5
63static struct mem_cgroup *root_mem_cgroup __read_mostly; 65static struct mem_cgroup *root_mem_cgroup __read_mostly;
64 66
@@ -800,7 +802,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
800 int nid; 802 int nid;
801 u64 total = 0; 803 u64 total = 0;
802 804
803 for_each_node_state(nid, N_HIGH_MEMORY) 805 for_each_node_state(nid, N_MEMORY)
804 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 806 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
805 return total; 807 return total;
806} 808}
@@ -1015,13 +1017,10 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
1015 iter != NULL; \ 1017 iter != NULL; \
1016 iter = mem_cgroup_iter(NULL, iter, NULL)) 1018 iter = mem_cgroup_iter(NULL, iter, NULL))
1017 1019
1018void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1020void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1019{ 1021{
1020 struct mem_cgroup *memcg; 1022 struct mem_cgroup *memcg;
1021 1023
1022 if (!mm)
1023 return;
1024
1025 rcu_read_lock(); 1024 rcu_read_lock();
1026 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1025 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1027 if (unlikely(!memcg)) 1026 if (unlikely(!memcg))
@@ -1040,7 +1039,7 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1040out: 1039out:
1041 rcu_read_unlock(); 1040 rcu_read_unlock();
1042} 1041}
1043EXPORT_SYMBOL(mem_cgroup_count_vm_event); 1042EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1044 1043
1045/** 1044/**
1046 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1045 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
@@ -1644,9 +1643,9 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1644 return; 1643 return;
1645 1644
1646 /* make a nodemask where this memcg uses memory from */ 1645 /* make a nodemask where this memcg uses memory from */
1647 memcg->scan_nodes = node_states[N_HIGH_MEMORY]; 1646 memcg->scan_nodes = node_states[N_MEMORY];
1648 1647
1649 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { 1648 for_each_node_mask(nid, node_states[N_MEMORY]) {
1650 1649
1651 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1650 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1652 node_clear(nid, memcg->scan_nodes); 1651 node_clear(nid, memcg->scan_nodes);
@@ -1717,7 +1716,7 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1717 /* 1716 /*
1718 * Check rest of nodes. 1717 * Check rest of nodes.
1719 */ 1718 */
1720 for_each_node_state(nid, N_HIGH_MEMORY) { 1719 for_each_node_state(nid, N_MEMORY) {
1721 if (node_isset(nid, memcg->scan_nodes)) 1720 if (node_isset(nid, memcg->scan_nodes))
1722 continue; 1721 continue;
1723 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1722 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
@@ -3776,7 +3775,7 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3776 lru_add_drain_all(); 3775 lru_add_drain_all();
3777 drain_all_stock_sync(memcg); 3776 drain_all_stock_sync(memcg);
3778 mem_cgroup_start_move(memcg); 3777 mem_cgroup_start_move(memcg);
3779 for_each_node_state(node, N_HIGH_MEMORY) { 3778 for_each_node_state(node, N_MEMORY) {
3780 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3779 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3781 enum lru_list lru; 3780 enum lru_list lru;
3782 for_each_lru(lru) { 3781 for_each_lru(lru) {
@@ -4122,7 +4121,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4122 4121
4123 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 4122 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
4124 seq_printf(m, "total=%lu", total_nr); 4123 seq_printf(m, "total=%lu", total_nr);
4125 for_each_node_state(nid, N_HIGH_MEMORY) { 4124 for_each_node_state(nid, N_MEMORY) {
4126 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); 4125 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
4127 seq_printf(m, " N%d=%lu", nid, node_nr); 4126 seq_printf(m, " N%d=%lu", nid, node_nr);
4128 } 4127 }
@@ -4130,7 +4129,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4130 4129
4131 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); 4130 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
4132 seq_printf(m, "file=%lu", file_nr); 4131 seq_printf(m, "file=%lu", file_nr);
4133 for_each_node_state(nid, N_HIGH_MEMORY) { 4132 for_each_node_state(nid, N_MEMORY) {
4134 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4133 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4135 LRU_ALL_FILE); 4134 LRU_ALL_FILE);
4136 seq_printf(m, " N%d=%lu", nid, node_nr); 4135 seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4139,7 +4138,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4139 4138
4140 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); 4139 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
4141 seq_printf(m, "anon=%lu", anon_nr); 4140 seq_printf(m, "anon=%lu", anon_nr);
4142 for_each_node_state(nid, N_HIGH_MEMORY) { 4141 for_each_node_state(nid, N_MEMORY) {
4143 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4142 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4144 LRU_ALL_ANON); 4143 LRU_ALL_ANON);
4145 seq_printf(m, " N%d=%lu", nid, node_nr); 4144 seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4148,7 +4147,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4148 4147
4149 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); 4148 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4150 seq_printf(m, "unevictable=%lu", unevictable_nr); 4149 seq_printf(m, "unevictable=%lu", unevictable_nr);
4151 for_each_node_state(nid, N_HIGH_MEMORY) { 4150 for_each_node_state(nid, N_MEMORY) {
4152 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4151 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4153 BIT(LRU_UNEVICTABLE)); 4152 BIT(LRU_UNEVICTABLE));
4154 seq_printf(m, " N%d=%lu", nid, node_nr); 4153 seq_printf(m, " N%d=%lu", nid, node_nr);
diff --git a/mm/memory.c b/mm/memory.c
index 765377385632..db2e9e797a05 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -717,20 +717,6 @@ static inline bool is_cow_mapping(vm_flags_t flags)
717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
718} 718}
719 719
720#ifndef is_zero_pfn
721static inline int is_zero_pfn(unsigned long pfn)
722{
723 return pfn == zero_pfn;
724}
725#endif
726
727#ifndef my_zero_pfn
728static inline unsigned long my_zero_pfn(unsigned long addr)
729{
730 return zero_pfn;
731}
732#endif
733
734/* 720/*
735 * vm_normal_page -- This function gets the "struct page" associated with a pte. 721 * vm_normal_page -- This function gets the "struct page" associated with a pte.
736 * 722 *
@@ -1250,7 +1236,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1250 BUG(); 1236 BUG();
1251 } 1237 }
1252#endif 1238#endif
1253 split_huge_page_pmd(vma->vm_mm, pmd); 1239 split_huge_page_pmd(vma, addr, pmd);
1254 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1240 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1255 goto next; 1241 goto next;
1256 /* fall through */ 1242 /* fall through */
@@ -1519,7 +1505,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1519 } 1505 }
1520 if (pmd_trans_huge(*pmd)) { 1506 if (pmd_trans_huge(*pmd)) {
1521 if (flags & FOLL_SPLIT) { 1507 if (flags & FOLL_SPLIT) {
1522 split_huge_page_pmd(mm, pmd); 1508 split_huge_page_pmd(vma, address, pmd);
1523 goto split_fallthrough; 1509 goto split_fallthrough;
1524 } 1510 }
1525 spin_lock(&mm->page_table_lock); 1511 spin_lock(&mm->page_table_lock);
@@ -2794,13 +2780,8 @@ unlock:
2794oom_free_new: 2780oom_free_new:
2795 page_cache_release(new_page); 2781 page_cache_release(new_page);
2796oom: 2782oom:
2797 if (old_page) { 2783 if (old_page)
2798 if (page_mkwrite) {
2799 unlock_page(old_page);
2800 page_cache_release(old_page);
2801 }
2802 page_cache_release(old_page); 2784 page_cache_release(old_page);
2803 }
2804 return VM_FAULT_OOM; 2785 return VM_FAULT_OOM;
2805 2786
2806unwritable_page: 2787unwritable_page:
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c3e66ae411fd..518baa896e83 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
106void __ref put_page_bootmem(struct page *page) 106void __ref put_page_bootmem(struct page *page)
107{ 107{
108 unsigned long type; 108 unsigned long type;
109 static DEFINE_MUTEX(ppb_lock);
109 110
110 type = (unsigned long) page->lru.next; 111 type = (unsigned long) page->lru.next;
111 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 112 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -115,7 +116,14 @@ void __ref put_page_bootmem(struct page *page)
115 ClearPagePrivate(page); 116 ClearPagePrivate(page);
116 set_page_private(page, 0); 117 set_page_private(page, 0);
117 INIT_LIST_HEAD(&page->lru); 118 INIT_LIST_HEAD(&page->lru);
119
120 /*
121 * Please refer to comment for __free_pages_bootmem()
122 * for why we serialize here.
123 */
124 mutex_lock(&ppb_lock);
118 __free_pages_bootmem(page, 0); 125 __free_pages_bootmem(page, 0);
126 mutex_unlock(&ppb_lock);
119 } 127 }
120 128
121} 129}
@@ -581,11 +589,19 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
581 return 0; 589 return 0;
582} 590}
583 591
592#ifdef CONFIG_MOVABLE_NODE
593/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */
594static bool can_online_high_movable(struct zone *zone)
595{
596 return true;
597}
598#else /* #ifdef CONFIG_MOVABLE_NODE */
584/* ensure every online node has NORMAL memory */ 599/* ensure every online node has NORMAL memory */
585static bool can_online_high_movable(struct zone *zone) 600static bool can_online_high_movable(struct zone *zone)
586{ 601{
587 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); 602 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
588} 603}
604#endif /* #ifdef CONFIG_MOVABLE_NODE */
589 605
590/* check which state of node_states will be changed when online memory */ 606/* check which state of node_states will be changed when online memory */
591static void node_states_check_changes_online(unsigned long nr_pages, 607static void node_states_check_changes_online(unsigned long nr_pages,
@@ -595,13 +611,15 @@ static void node_states_check_changes_online(unsigned long nr_pages,
595 enum zone_type zone_last = ZONE_NORMAL; 611 enum zone_type zone_last = ZONE_NORMAL;
596 612
597 /* 613 /*
598 * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes 614 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
599 * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. 615 * contains nodes which have zones of 0...ZONE_NORMAL,
616 * set zone_last to ZONE_NORMAL.
600 * 617 *
601 * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes 618 * If we don't have HIGHMEM nor movable node,
602 * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 619 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
620 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
603 */ 621 */
604 if (N_HIGH_MEMORY == N_NORMAL_MEMORY) 622 if (N_MEMORY == N_NORMAL_MEMORY)
605 zone_last = ZONE_MOVABLE; 623 zone_last = ZONE_MOVABLE;
606 624
607 /* 625 /*
@@ -615,12 +633,34 @@ static void node_states_check_changes_online(unsigned long nr_pages,
615 else 633 else
616 arg->status_change_nid_normal = -1; 634 arg->status_change_nid_normal = -1;
617 635
636#ifdef CONFIG_HIGHMEM
637 /*
638 * If we have movable node, node_states[N_HIGH_MEMORY]
639 * contains nodes which have zones of 0...ZONE_HIGHMEM,
640 * set zone_last to ZONE_HIGHMEM.
641 *
642 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
643 * contains nodes which have zones of 0...ZONE_MOVABLE,
644 * set zone_last to ZONE_MOVABLE.
645 */
646 zone_last = ZONE_HIGHMEM;
647 if (N_MEMORY == N_HIGH_MEMORY)
648 zone_last = ZONE_MOVABLE;
649
650 if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
651 arg->status_change_nid_high = nid;
652 else
653 arg->status_change_nid_high = -1;
654#else
655 arg->status_change_nid_high = arg->status_change_nid_normal;
656#endif
657
618 /* 658 /*
619 * if the node don't have memory befor online, we will need to 659 * if the node don't have memory befor online, we will need to
620 * set the node to node_states[N_HIGH_MEMORY] after the memory 660 * set the node to node_states[N_MEMORY] after the memory
621 * is online. 661 * is online.
622 */ 662 */
623 if (!node_state(nid, N_HIGH_MEMORY)) 663 if (!node_state(nid, N_MEMORY))
624 arg->status_change_nid = nid; 664 arg->status_change_nid = nid;
625 else 665 else
626 arg->status_change_nid = -1; 666 arg->status_change_nid = -1;
@@ -631,7 +671,10 @@ static void node_states_set_node(int node, struct memory_notify *arg)
631 if (arg->status_change_nid_normal >= 0) 671 if (arg->status_change_nid_normal >= 0)
632 node_set_state(node, N_NORMAL_MEMORY); 672 node_set_state(node, N_NORMAL_MEMORY);
633 673
634 node_set_state(node, N_HIGH_MEMORY); 674 if (arg->status_change_nid_high >= 0)
675 node_set_state(node, N_HIGH_MEMORY);
676
677 node_set_state(node, N_MEMORY);
635} 678}
636 679
637 680
@@ -713,6 +756,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
713 return ret; 756 return ret;
714 } 757 }
715 758
759 zone->managed_pages += onlined_pages;
716 zone->present_pages += onlined_pages; 760 zone->present_pages += onlined_pages;
717 zone->zone_pgdat->node_present_pages += onlined_pages; 761 zone->zone_pgdat->node_present_pages += onlined_pages;
718 if (onlined_pages) { 762 if (onlined_pages) {
@@ -1066,6 +1110,13 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
1066 return offlined; 1110 return offlined;
1067} 1111}
1068 1112
1113#ifdef CONFIG_MOVABLE_NODE
1114/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */
1115static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1116{
1117 return true;
1118}
1119#else /* #ifdef CONFIG_MOVABLE_NODE */
1069/* ensure the node has NORMAL memory if it is still online */ 1120/* ensure the node has NORMAL memory if it is still online */
1070static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1121static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1071{ 1122{
@@ -1089,6 +1140,7 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1089 */ 1140 */
1090 return present_pages == 0; 1141 return present_pages == 0;
1091} 1142}
1143#endif /* #ifdef CONFIG_MOVABLE_NODE */
1092 1144
1093/* check which state of node_states will be changed when offline memory */ 1145/* check which state of node_states will be changed when offline memory */
1094static void node_states_check_changes_offline(unsigned long nr_pages, 1146static void node_states_check_changes_offline(unsigned long nr_pages,
@@ -1099,13 +1151,15 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
1099 enum zone_type zt, zone_last = ZONE_NORMAL; 1151 enum zone_type zt, zone_last = ZONE_NORMAL;
1100 1152
1101 /* 1153 /*
1102 * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes 1154 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
1103 * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. 1155 * contains nodes which have zones of 0...ZONE_NORMAL,
1156 * set zone_last to ZONE_NORMAL.
1104 * 1157 *
1105 * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes 1158 * If we don't have HIGHMEM nor movable node,
1106 * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 1159 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
1160 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
1107 */ 1161 */
1108 if (N_HIGH_MEMORY == N_NORMAL_MEMORY) 1162 if (N_MEMORY == N_NORMAL_MEMORY)
1109 zone_last = ZONE_MOVABLE; 1163 zone_last = ZONE_MOVABLE;
1110 1164
1111 /* 1165 /*
@@ -1122,6 +1176,30 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
1122 else 1176 else
1123 arg->status_change_nid_normal = -1; 1177 arg->status_change_nid_normal = -1;
1124 1178
1179#ifdef CONFIG_HIGHMEM
1180 /*
1181 * If we have movable node, node_states[N_HIGH_MEMORY]
1182 * contains nodes which have zones of 0...ZONE_HIGHMEM,
1183 * set zone_last to ZONE_HIGHMEM.
1184 *
1185 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
1186 * contains nodes which have zones of 0...ZONE_MOVABLE,
1187 * set zone_last to ZONE_MOVABLE.
1188 */
1189 zone_last = ZONE_HIGHMEM;
1190 if (N_MEMORY == N_HIGH_MEMORY)
1191 zone_last = ZONE_MOVABLE;
1192
1193 for (; zt <= zone_last; zt++)
1194 present_pages += pgdat->node_zones[zt].present_pages;
1195 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1196 arg->status_change_nid_high = zone_to_nid(zone);
1197 else
1198 arg->status_change_nid_high = -1;
1199#else
1200 arg->status_change_nid_high = arg->status_change_nid_normal;
1201#endif
1202
1125 /* 1203 /*
1126 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE 1204 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
1127 */ 1205 */
@@ -1146,9 +1224,13 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
1146 if (arg->status_change_nid_normal >= 0) 1224 if (arg->status_change_nid_normal >= 0)
1147 node_clear_state(node, N_NORMAL_MEMORY); 1225 node_clear_state(node, N_NORMAL_MEMORY);
1148 1226
1149 if ((N_HIGH_MEMORY != N_NORMAL_MEMORY) && 1227 if ((N_MEMORY != N_NORMAL_MEMORY) &&
1150 (arg->status_change_nid >= 0)) 1228 (arg->status_change_nid_high >= 0))
1151 node_clear_state(node, N_HIGH_MEMORY); 1229 node_clear_state(node, N_HIGH_MEMORY);
1230
1231 if ((N_MEMORY != N_HIGH_MEMORY) &&
1232 (arg->status_change_nid >= 0))
1233 node_clear_state(node, N_MEMORY);
1152} 1234}
1153 1235
1154static int __ref __offline_pages(unsigned long start_pfn, 1236static int __ref __offline_pages(unsigned long start_pfn,
@@ -1248,6 +1330,7 @@ repeat:
1248 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1330 /* reset pagetype flags and makes migrate type to be MOVABLE */
1249 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1331 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1250 /* removal success */ 1332 /* removal success */
1333 zone->managed_pages -= offlined_pages;
1251 zone->present_pages -= offlined_pages; 1334 zone->present_pages -= offlined_pages;
1252 zone->zone_pgdat->node_present_pages -= offlined_pages; 1335 zone->zone_pgdat->node_present_pages -= offlined_pages;
1253 totalram_pages -= offlined_pages; 1336 totalram_pages -= offlined_pages;
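
node_states_check_changes_online() now reports three possible transitions to the hotplug notifier (N_NORMAL_MEMORY, N_HIGH_MEMORY and N_MEMORY), with zone_last widening to ZONE_MOVABLE whenever the neighbouring states are aliases of each other. The fragment below models that decision table for the online case with simplified enums and booleans standing in for the config options and nodemask tests, so read it as a summary of the logic rather than the kernel code itself.

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified zone ordering; only the relative order matters here. */
    enum zone_type { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, ZONE_MOVABLE };

    struct online_result {
        int status_change_nid_normal;    /* node newly gains N_NORMAL_MEMORY? */
        int status_change_nid_high;      /* node newly gains N_HIGH_MEMORY?   */
        int status_change_nid;           /* node newly gains N_MEMORY?        */
    };

    /* has_highmem ~ CONFIG_HIGHMEM, has_movable_node ~ CONFIG_MOVABLE_NODE.
     * -1 follows the struct memory_notify convention: that state is unchanged. */
    static struct online_result check_online(int nid, enum zone_type zone,
                                             bool has_highmem,
                                             bool has_movable_node,
                                             bool node_has_normal,
                                             bool node_has_high,
                                             bool node_has_memory)
    {
        struct online_result r = { -1, -1, -1 };
        enum zone_type zone_last = ZONE_NORMAL;

        /* N_NORMAL_MEMORY reaches up to ZONE_MOVABLE only when it is the
         * same state as N_MEMORY (no highmem, no movable node). */
        if (!has_highmem && !has_movable_node)
            zone_last = ZONE_MOVABLE;
        if (zone <= zone_last && !node_has_normal)
            r.status_change_nid_normal = nid;

        if (has_highmem) {
            /* N_HIGH_MEMORY reaches ZONE_MOVABLE only without movable node. */
            zone_last = has_movable_node ? ZONE_HIGHMEM : ZONE_MOVABLE;
            if (zone <= zone_last && !node_has_high)
                r.status_change_nid_high = nid;
        } else {
            r.status_change_nid_high = r.status_change_nid_normal;
        }

        /* N_MEMORY: any onlined memory counts. */
        if (!node_has_memory)
            r.status_change_nid = nid;
        return r;
    }

    int main(void)
    {
        /* Onlining ZONE_MOVABLE memory on a memoryless node 1, on a
         * !CONFIG_HIGHMEM kernel with CONFIG_MOVABLE_NODE. */
        struct online_result r = check_online(1, ZONE_MOVABLE, false, true,
                                              false, false, false);
        printf("normal=%d high=%d memory=%d\n",
               r.status_change_nid_normal, r.status_change_nid_high,
               r.status_change_nid);    /* -1 -1 1: only N_MEMORY is set */
        return 0;
    }
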
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 05b28361a39b..aaf54566cb6b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -212,9 +212,9 @@ static int mpol_set_nodemask(struct mempolicy *pol,
212 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ 212 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
213 if (pol == NULL) 213 if (pol == NULL)
214 return 0; 214 return 0;
215 /* Check N_HIGH_MEMORY */ 215 /* Check N_MEMORY */
216 nodes_and(nsc->mask1, 216 nodes_and(nsc->mask1,
217 cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); 217 cpuset_current_mems_allowed, node_states[N_MEMORY]);
218 218
219 VM_BUG_ON(!nodes); 219 VM_BUG_ON(!nodes);
220 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) 220 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
@@ -511,7 +511,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
511 pmd = pmd_offset(pud, addr); 511 pmd = pmd_offset(pud, addr);
512 do { 512 do {
513 next = pmd_addr_end(addr, end); 513 next = pmd_addr_end(addr, end);
514 split_huge_page_pmd(vma->vm_mm, pmd); 514 split_huge_page_pmd(vma, addr, pmd);
515 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 515 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
516 continue; 516 continue;
517 if (check_pte_range(vma, pmd, addr, next, nodes, 517 if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -1388,7 +1388,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1388 goto out_put; 1388 goto out_put;
1389 } 1389 }
1390 1390
1391 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { 1391 if (!nodes_subset(*new, node_states[N_MEMORY])) {
1392 err = -EINVAL; 1392 err = -EINVAL;
1393 goto out_put; 1393 goto out_put;
1394 } 1394 }
@@ -2326,7 +2326,7 @@ void __init numa_policy_init(void)
2326 * fall back to the largest node if they're all smaller. 2326 * fall back to the largest node if they're all smaller.
2327 */ 2327 */
2328 nodes_clear(interleave_nodes); 2328 nodes_clear(interleave_nodes);
2329 for_each_node_state(nid, N_HIGH_MEMORY) { 2329 for_each_node_state(nid, N_MEMORY) {
2330 unsigned long total_pages = node_present_pages(nid); 2330 unsigned long total_pages = node_present_pages(nid);
2331 2331
2332 /* Preserve the largest node */ 2332 /* Preserve the largest node */
@@ -2407,7 +2407,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2407 *nodelist++ = '\0'; 2407 *nodelist++ = '\0';
2408 if (nodelist_parse(nodelist, nodes)) 2408 if (nodelist_parse(nodelist, nodes))
2409 goto out; 2409 goto out;
2410 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) 2410 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2411 goto out; 2411 goto out;
2412 } else 2412 } else
2413 nodes_clear(nodes); 2413 nodes_clear(nodes);
@@ -2441,7 +2441,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2441 * Default to online nodes with memory if no nodelist 2441 * Default to online nodes with memory if no nodelist
2442 */ 2442 */
2443 if (!nodelist) 2443 if (!nodelist)
2444 nodes = node_states[N_HIGH_MEMORY]; 2444 nodes = node_states[N_MEMORY];
2445 break; 2445 break;
2446 case MPOL_LOCAL: 2446 case MPOL_LOCAL:
2447 /* 2447 /*
diff --git a/mm/migrate.c b/mm/migrate.c
index 3f675ca08279..cae02711181d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1238,7 +1238,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1238 if (node < 0 || node >= MAX_NUMNODES) 1238 if (node < 0 || node >= MAX_NUMNODES)
1239 goto out_pm; 1239 goto out_pm;
1240 1240
1241 if (!node_state(node, N_HIGH_MEMORY)) 1241 if (!node_state(node, N_MEMORY))
1242 goto out_pm; 1242 goto out_pm;
1243 1243
1244 err = -EACCES; 1244 err = -EACCES;
diff --git a/mm/mmap.c b/mm/mmap.c
index f940062c8d4b..2b7d9e78a569 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1488,7 +1488,11 @@ munmap_back:
1488 * 1488 *
1489 * Answer: Yes, several device drivers can do it in their 1489 * Answer: Yes, several device drivers can do it in their
1490 * f_op->mmap method. -DaveM 1490 * f_op->mmap method. -DaveM
1491 * Bug: If addr is changed, prev, rb_link, rb_parent should
1492 * be updated for vma_link()
1491 */ 1493 */
1494 WARN_ON_ONCE(addr != vma->vm_start);
1495
1492 addr = vma->vm_start; 1496 addr = vma->vm_start;
1493 pgoff = vma->vm_pgoff; 1497 pgoff = vma->vm_pgoff;
1494 vm_flags = vma->vm_flags; 1498 vm_flags = vma->vm_flags;
@@ -2065,6 +2069,18 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2065 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 2069 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
2066 error = acct_stack_growth(vma, size, grow); 2070 error = acct_stack_growth(vma, size, grow);
2067 if (!error) { 2071 if (!error) {
2072 /*
2073 * vma_gap_update() doesn't support concurrent
2074 * updates, but we only hold a shared mmap_sem
2075 * lock here, so we need to protect against
2076 * concurrent vma expansions.
2077 * vma_lock_anon_vma() doesn't help here, as
2078 * we don't guarantee that all growable vmas
2079 * in a mm share the same root anon vma.
2080 * So, we reuse mm->page_table_lock to guard
2081 * against concurrent vma expansions.
2082 */
2083 spin_lock(&vma->vm_mm->page_table_lock);
2068 anon_vma_interval_tree_pre_update_vma(vma); 2084 anon_vma_interval_tree_pre_update_vma(vma);
2069 vma->vm_end = address; 2085 vma->vm_end = address;
2070 anon_vma_interval_tree_post_update_vma(vma); 2086 anon_vma_interval_tree_post_update_vma(vma);
@@ -2072,6 +2088,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2072 vma_gap_update(vma->vm_next); 2088 vma_gap_update(vma->vm_next);
2073 else 2089 else
2074 vma->vm_mm->highest_vm_end = address; 2090 vma->vm_mm->highest_vm_end = address;
2091 spin_unlock(&vma->vm_mm->page_table_lock);
2092
2075 perf_event_mmap(vma); 2093 perf_event_mmap(vma);
2076 } 2094 }
2077 } 2095 }
@@ -2122,11 +2140,25 @@ int expand_downwards(struct vm_area_struct *vma,
2122 if (grow <= vma->vm_pgoff) { 2140 if (grow <= vma->vm_pgoff) {
2123 error = acct_stack_growth(vma, size, grow); 2141 error = acct_stack_growth(vma, size, grow);
2124 if (!error) { 2142 if (!error) {
2143 /*
2144 * vma_gap_update() doesn't support concurrent
2145 * updates, but we only hold a shared mmap_sem
2146 * lock here, so we need to protect against
2147 * concurrent vma expansions.
2148 * vma_lock_anon_vma() doesn't help here, as
2149 * we don't guarantee that all growable vmas
2150 * in a mm share the same root anon vma.
2151 * So, we reuse mm->page_table_lock to guard
2152 * against concurrent vma expansions.
2153 */
2154 spin_lock(&vma->vm_mm->page_table_lock);
2125 anon_vma_interval_tree_pre_update_vma(vma); 2155 anon_vma_interval_tree_pre_update_vma(vma);
2126 vma->vm_start = address; 2156 vma->vm_start = address;
2127 vma->vm_pgoff -= grow; 2157 vma->vm_pgoff -= grow;
2128 anon_vma_interval_tree_post_update_vma(vma); 2158 anon_vma_interval_tree_post_update_vma(vma);
2129 vma_gap_update(vma); 2159 vma_gap_update(vma);
2160 spin_unlock(&vma->vm_mm->page_table_lock);
2161
2130 perf_event_mmap(vma); 2162 perf_event_mmap(vma);
2131 } 2163 }
2132 } 2164 }
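
The comments added in expand_upwards() and expand_downwards() explain the locking trick: stack growth runs with mmap_sem held only for read, yet vma_gap_update() rewrites the rb-tree augmentation and must not race with a second expansion, so the patch borrows mm->page_table_lock instead of relying on an anon_vma lock that different growable vmas may not share. Below is a minimal userspace analogue of the same pattern, a readers-writer lock held shared plus a separate mutex for the one non-atomic derived update; compile with -pthread, and note that all names and values are illustrative.

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;  /* ~ mmap_sem */
    static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ page_table_lock */

    static unsigned long vm_end = 0x1000;         /* toy "stack vma" end          */
    static unsigned long highest_vm_end = 0x1000; /* derived value, like gap info */

    static void *grow(void *arg)
    {
        unsigned long grow_by = (unsigned long)arg;

        pthread_rwlock_rdlock(&map_lock);     /* shared, like down_read(mmap_sem) */

        pthread_mutex_lock(&update_lock);     /* serialize concurrent expansions */
        vm_end += grow_by;
        if (vm_end > highest_vm_end)          /* non-atomic derived update */
            highest_vm_end = vm_end;
        pthread_mutex_unlock(&update_lock);

        pthread_rwlock_unlock(&map_lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t a, b;

        pthread_create(&a, NULL, grow, (void *)0x1000UL);
        pthread_create(&b, NULL, grow, (void *)0x2000UL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);

        printf("vm_end=%#lx highest_vm_end=%#lx\n", vm_end, highest_vm_end);
        return 0;
    }
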
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a40992610ab6..e8c3938db6fa 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -90,7 +90,7 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
90 next = pmd_addr_end(addr, end); 90 next = pmd_addr_end(addr, end);
91 if (pmd_trans_huge(*pmd)) { 91 if (pmd_trans_huge(*pmd)) {
92 if (next - addr != HPAGE_PMD_SIZE) 92 if (next - addr != HPAGE_PMD_SIZE)
93 split_huge_page_pmd(vma->vm_mm, pmd); 93 split_huge_page_pmd(vma, addr, pmd);
94 else if (change_huge_pmd(vma, pmd, addr, newprot)) 94 else if (change_huge_pmd(vma, pmd, addr, newprot))
95 continue; 95 continue;
96 /* fall through */ 96 /* fall through */
diff --git a/mm/mremap.c b/mm/mremap.c
index 1b61c2d3307a..eabb24da6c9e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -182,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
182 need_flush = true; 182 need_flush = true;
183 continue; 183 continue;
184 } else if (!err) { 184 } else if (!err) {
185 split_huge_page_pmd(vma->vm_mm, old_pmd); 185 split_huge_page_pmd(vma, old_addr, old_pmd);
186 } 186 }
187 VM_BUG_ON(pmd_trans_huge(*old_pmd)); 187 VM_BUG_ON(pmd_trans_huge(*old_pmd));
188 } 188 }
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bd82f6b31411..b8294fc03df8 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -137,6 +137,22 @@ unsigned long __init free_low_memory_core_early(int nodeid)
137 return count; 137 return count;
138} 138}
139 139
140static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
141{
142 struct zone *z;
143
144 /*
145 * In free_area_init_core(), highmem zone's managed_pages is set to
146 * present_pages, and bootmem allocator doesn't allocate from highmem
147 * zones. So there's no need to recalculate managed_pages because all
148 * highmem pages will be managed by the buddy system. Here highmem
149 * zone also includes highmem movable zone.
150 */
151 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
152 if (!is_highmem(z))
153 z->managed_pages = 0;
154}
155
140/** 156/**
141 * free_all_bootmem_node - release a node's free pages to the buddy allocator 157 * free_all_bootmem_node - release a node's free pages to the buddy allocator
142 * @pgdat: node to be released 158 * @pgdat: node to be released
@@ -146,6 +162,7 @@ unsigned long __init free_low_memory_core_early(int nodeid)
146unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 162unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
147{ 163{
148 register_page_bootmem_info_node(pgdat); 164 register_page_bootmem_info_node(pgdat);
165 reset_node_lowmem_managed_pages(pgdat);
149 166
150 /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ 167 /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
151 return 0; 168 return 0;
@@ -158,6 +175,11 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
158 */ 175 */
159unsigned long __init free_all_bootmem(void) 176unsigned long __init free_all_bootmem(void)
160{ 177{
178 struct pglist_data *pgdat;
179
180 for_each_online_pgdat(pgdat)
181 reset_node_lowmem_managed_pages(pgdat);
182
161 /* 183 /*
162 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 184 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
163 * because in some case like Node0 doesn't have RAM installed 185 * because in some case like Node0 doesn't have RAM installed
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 18f1ae2b45de..0399f146ae49 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -215,7 +215,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
215 * the page allocator means a mempolicy is in effect. Cpuset policy 215 * the page allocator means a mempolicy is in effect. Cpuset policy
216 * is enforced in get_page_from_freelist(). 216 * is enforced in get_page_from_freelist().
217 */ 217 */
218 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { 218 if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
219 *totalpages = total_swap_pages; 219 *totalpages = total_swap_pages;
220 for_each_node_mask(nid, *nodemask) 220 for_each_node_mask(nid, *nodemask)
221 *totalpages += node_spanned_pages(nid); 221 *totalpages += node_spanned_pages(nid);
@@ -591,43 +591,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
591 spin_unlock(&zone_scan_lock); 591 spin_unlock(&zone_scan_lock);
592} 592}
593 593
594/*
595 * Try to acquire the oom killer lock for all system zones. Returns zero if a
596 * parallel oom killing is taking place, otherwise locks all zones and returns
597 * non-zero.
598 */
599static int try_set_system_oom(void)
600{
601 struct zone *zone;
602 int ret = 1;
603
604 spin_lock(&zone_scan_lock);
605 for_each_populated_zone(zone)
606 if (zone_is_oom_locked(zone)) {
607 ret = 0;
608 goto out;
609 }
610 for_each_populated_zone(zone)
611 zone_set_flag(zone, ZONE_OOM_LOCKED);
612out:
613 spin_unlock(&zone_scan_lock);
614 return ret;
615}
616
617/*
618 * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
619 * attempts or page faults may now recall the oom killer, if necessary.
620 */
621static void clear_system_oom(void)
622{
623 struct zone *zone;
624
625 spin_lock(&zone_scan_lock);
626 for_each_populated_zone(zone)
627 zone_clear_flag(zone, ZONE_OOM_LOCKED);
628 spin_unlock(&zone_scan_lock);
629}
630
631/** 594/**
632 * out_of_memory - kill the "best" process when we run out of memory 595 * out_of_memory - kill the "best" process when we run out of memory
633 * @zonelist: zonelist pointer 596 * @zonelist: zonelist pointer
@@ -708,15 +671,16 @@ out:
708 671
709/* 672/*
710 * The pagefault handler calls here because it is out of memory, so kill a 673 * The pagefault handler calls here because it is out of memory, so kill a
711 * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel 674 * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
712 * oom killing is already in progress so do nothing. If a task is found with 675 * parallel oom killing is already in progress so do nothing.
713 * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
714 */ 676 */
715void pagefault_out_of_memory(void) 677void pagefault_out_of_memory(void)
716{ 678{
717 if (try_set_system_oom()) { 679 struct zonelist *zonelist = node_zonelist(first_online_node,
680 GFP_KERNEL);
681
682 if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
718 out_of_memory(NULL, 0, 0, NULL, false); 683 out_of_memory(NULL, 0, 0, NULL, false);
719 clear_system_oom(); 684 clear_zonelist_oom(zonelist, GFP_KERNEL);
720 } 685 }
721 schedule_timeout_killable(1);
722} 686}
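
The replacement relies on try_set_zonelist_oom()/clear_zonelist_oom(), which implement the same all-or-nothing trylock the deleted helpers did, just over the zones of one zonelist instead of every populated zone in the system. A rough standalone model of that pattern follows; it is not the kernel API, a pthread mutex stands in for zone_scan_lock, and the zone count and flag array are illustrative.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 3

static pthread_mutex_t zone_scan_lock = PTHREAD_MUTEX_INITIALIZER;
static bool zone_oom_locked[NR_ZONES];

/* Returns true only if we now "own" OOM killing for every zone in the set. */
static bool try_set_oom(void)
{
	bool ret = true;
	int i;

	pthread_mutex_lock(&zone_scan_lock);
	for (i = 0; i < NR_ZONES; i++)
		if (zone_oom_locked[i]) {
			ret = false;	/* someone else is already OOM killing */
			goto out;
		}
	for (i = 0; i < NR_ZONES; i++)
		zone_oom_locked[i] = true;
out:
	pthread_mutex_unlock(&zone_scan_lock);
	return ret;
}

static void clear_oom(void)
{
	int i;

	pthread_mutex_lock(&zone_scan_lock);
	for (i = 0; i < NR_ZONES; i++)
		zone_oom_locked[i] = false;
	pthread_mutex_unlock(&zone_scan_lock);
}

int main(void)
{
	if (try_set_oom()) {
		printf("would call out_of_memory() here\n");
		clear_oom();
	} else {
		printf("parallel OOM kill in progress, do nothing\n");
	}
	return 0;
}

The point of the pattern is that either every zone in the set gets marked ZONE_OOM_LOCKED under the lock or none do; a page-fault OOM that loses the race simply returns and lets the winner do the killing. The patch only narrows the set from all populated zones to the GFP_KERNEL zonelist of the first online node.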
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eee3efa58c91..83637dfba110 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
90#ifdef CONFIG_HIGHMEM 90#ifdef CONFIG_HIGHMEM
91 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 91 [N_HIGH_MEMORY] = { { [0] = 1UL } },
92#endif 92#endif
93#ifdef CONFIG_MOVABLE_NODE
94 [N_MEMORY] = { { [0] = 1UL } },
95#endif
93 [N_CPU] = { { [0] = 1UL } }, 96 [N_CPU] = { { [0] = 1UL } },
94#endif /* NUMA */ 97#endif /* NUMA */
95}; 98};
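
The new [N_MEMORY] entry is guarded by CONFIG_MOVABLE_NODE because, when that option is off, N_MEMORY is expected to alias an existing node state rather than occupy a slot of its own; the same aliasing is what lets check_for_memory() further down short-circuit on N_MEMORY == N_NORMAL_MEMORY. The sketch below shows the enum layout the series appears to rely on in include/linux/nodemask.h; that hunk is not part of this excerpt, so treat the exact definition as an assumption. The CONFIG_* macros are toggled by hand here purely so the sketch compiles and prints the resulting values.

#include <stdio.h>

/* Toggle these to mimic the relevant Kconfig options. */
#define CONFIG_HIGHMEM 1
/* #define CONFIG_MOVABLE_NODE 1 */

enum node_states {
	N_POSSIBLE,		/* the node could become online at some point */
	N_ONLINE,		/* the node is online */
	N_NORMAL_MEMORY,	/* the node has regular (lowmem) memory */
#ifdef CONFIG_HIGHMEM
	N_HIGH_MEMORY,		/* the node has regular or high memory */
#else
	N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
#ifdef CONFIG_MOVABLE_NODE
	N_MEMORY,		/* the node has regular, high or movable memory */
#else
	N_MEMORY = N_HIGH_MEMORY,
#endif
	N_CPU,			/* the node has one or more cpus */
	NR_NODE_STATES
};

int main(void)
{
	printf("N_NORMAL_MEMORY=%d N_HIGH_MEMORY=%d N_MEMORY=%d\n",
	       N_NORMAL_MEMORY, N_HIGH_MEMORY, N_MEMORY);
	return 0;
}

With neither HIGHMEM nor MOVABLE_NODE enabled all three states collapse to the same value, so a loop over N_MEMORY degenerates to a loop over nodes with normal memory on such configurations.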
@@ -732,6 +735,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
732 local_irq_restore(flags); 735 local_irq_restore(flags);
733} 736}
734 737
738/*
739 * Read access to zone->managed_pages is safe because it's unsigned long,
740 * but we still need to serialize writers. Currently all callers of
741 * __free_pages_bootmem() other than put_page_bootmem() run only at
742 * boot time, so to keep boot time short we shift the burden of
743 * serializing writers to put_page_bootmem().
744 */
735void __meminit __free_pages_bootmem(struct page *page, unsigned int order) 745void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
736{ 746{
737 unsigned int nr_pages = 1 << order; 747 unsigned int nr_pages = 1 << order;
@@ -747,6 +757,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
747 set_page_count(p, 0); 757 set_page_count(p, 0);
748 } 758 }
749 759
760 page_zone(page)->managed_pages += 1 << order;
750 set_page_refcounted(page); 761 set_page_refcounted(page);
751 __free_pages(page, order); 762 __free_pages(page, order);
752} 763}
@@ -1695,7 +1706,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1695 * 1706 *
1696 * If the zonelist cache is present in the passed in zonelist, then 1707 * If the zonelist cache is present in the passed in zonelist, then
1697 * returns a pointer to the allowed node mask (either the current 1708 * returns a pointer to the allowed node mask (either the current
1698 * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) 1709 * tasks mems_allowed, or node_states[N_MEMORY].)
1699 * 1710 *
1700 * If the zonelist cache is not available for this zonelist, does 1711 * If the zonelist cache is not available for this zonelist, does
1701 * nothing and returns NULL. 1712 * nothing and returns NULL.
@@ -1724,7 +1735,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1724 1735
1725 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1736 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1726 &cpuset_current_mems_allowed : 1737 &cpuset_current_mems_allowed :
1727 &node_states[N_HIGH_MEMORY]; 1738 &node_states[N_MEMORY];
1728 return allowednodes; 1739 return allowednodes;
1729} 1740}
1730 1741
@@ -2981,6 +2992,7 @@ void show_free_areas(unsigned int filter)
2981 " isolated(anon):%lukB" 2992 " isolated(anon):%lukB"
2982 " isolated(file):%lukB" 2993 " isolated(file):%lukB"
2983 " present:%lukB" 2994 " present:%lukB"
2995 " managed:%lukB"
2984 " mlocked:%lukB" 2996 " mlocked:%lukB"
2985 " dirty:%lukB" 2997 " dirty:%lukB"
2986 " writeback:%lukB" 2998 " writeback:%lukB"
@@ -3010,6 +3022,7 @@ void show_free_areas(unsigned int filter)
3010 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3022 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3011 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3023 K(zone_page_state(zone, NR_ISOLATED_FILE)),
3012 K(zone->present_pages), 3024 K(zone->present_pages),
3025 K(zone->managed_pages),
3013 K(zone_page_state(zone, NR_MLOCK)), 3026 K(zone_page_state(zone, NR_MLOCK)),
3014 K(zone_page_state(zone, NR_FILE_DIRTY)), 3027 K(zone_page_state(zone, NR_FILE_DIRTY)),
3015 K(zone_page_state(zone, NR_WRITEBACK)), 3028 K(zone_page_state(zone, NR_WRITEBACK)),
@@ -3238,7 +3251,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
3238 return node; 3251 return node;
3239 } 3252 }
3240 3253
3241 for_each_node_state(n, N_HIGH_MEMORY) { 3254 for_each_node_state(n, N_MEMORY) {
3242 3255
3243 /* Don't want a node to appear more than once */ 3256 /* Don't want a node to appear more than once */
3244 if (node_isset(n, *used_node_mask)) 3257 if (node_isset(n, *used_node_mask))
@@ -3380,7 +3393,7 @@ static int default_zonelist_order(void)
3380 * local memory, NODE_ORDER may be suitable. 3393 * local memory, NODE_ORDER may be suitable.
3381 */ 3394 */
3382 average_size = total_size / 3395 average_size = total_size /
3383 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); 3396 (nodes_weight(node_states[N_MEMORY]) + 1);
3384 for_each_online_node(nid) { 3397 for_each_online_node(nid) {
3385 low_kmem_size = 0; 3398 low_kmem_size = 0;
3386 total_size = 0; 3399 total_size = 0;
@@ -4476,6 +4489,26 @@ void __init set_pageblock_order(void)
4476 4489
4477#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4490#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4478 4491
4492static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4493 unsigned long present_pages)
4494{
4495 unsigned long pages = spanned_pages;
4496
4497 /*
4498 * Provide a more accurate estimation if there are holes within
4499 * the zone and SPARSEMEM is in use. If there are holes within the
4500 * zone, each populated memory region may cost us one or two extra
4501 * memmap pages due to alignment because the memmap pages for each
4502 * populated region may not be naturally aligned on a page boundary.
4503 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4504 */
4505 if (spanned_pages > present_pages + (present_pages >> 4) &&
4506 IS_ENABLED(CONFIG_SPARSEMEM))
4507 pages = present_pages;
4508
4509 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4510}
4511
4479/* 4512/*
4480 * Set up the zone data structures: 4513 * Set up the zone data structures:
4481 * - mark all pages reserved 4514 * - mark all pages reserved
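
calc_memmap_size() charges memmap either against the whole spanned range or, when SPARSEMEM is enabled and more than roughly one sixteenth of the span is holes, against present pages only. A worked standalone model of the arithmetic follows; the 4 KiB page size, the 64-byte struct page and the zone figures are illustrative assumptions, not values taken from the patch.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)		/* assume 4 KiB pages */
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define STRUCT_PAGE_SIZE 64UL				/* assumed sizeof(struct page) */
#define SPARSEMEM	1				/* assume CONFIG_SPARSEMEM=y */

static unsigned long calc_memmap_size(unsigned long spanned, unsigned long present)
{
	unsigned long pages = spanned;

	/* Heavily holed zone: charge memmap against present pages only. */
	if (spanned > present + (present >> 4) && SPARSEMEM)
		pages = present;

	return PAGE_ALIGN(pages * STRUCT_PAGE_SIZE) >> PAGE_SHIFT;
}

int main(void)
{
	/* 4 GiB spanned, only 2 GiB actually present (lots of holes). */
	unsigned long spanned = 1UL << 20;	/* pages */
	unsigned long present = 1UL << 19;	/* pages */

	printf("memmap pages charged: %lu\n", calc_memmap_size(spanned, present));
	printf("without the heuristic: %lu\n",
	       PAGE_ALIGN(spanned * STRUCT_PAGE_SIZE) >> PAGE_SHIFT);
	return 0;
}

For this half-populated zone the heuristic charges 8192 memmap pages instead of 16384, which is the "more accurate estimation" the comment refers to.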
@@ -4499,48 +4532,56 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4499 4532
4500 for (j = 0; j < MAX_NR_ZONES; j++) { 4533 for (j = 0; j < MAX_NR_ZONES; j++) {
4501 struct zone *zone = pgdat->node_zones + j; 4534 struct zone *zone = pgdat->node_zones + j;
4502 unsigned long size, realsize, memmap_pages; 4535 unsigned long size, realsize, freesize, memmap_pages;
4503 4536
4504 size = zone_spanned_pages_in_node(nid, j, zones_size); 4537 size = zone_spanned_pages_in_node(nid, j, zones_size);
4505 realsize = size - zone_absent_pages_in_node(nid, j, 4538 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4506 zholes_size); 4539 zholes_size);
4507 4540
4508 /* 4541 /*
4509 * Adjust realsize so that it accounts for how much memory 4542 * Adjust freesize so that it accounts for how much memory
4510 * is used by this zone for memmap. This affects the watermark 4543 * is used by this zone for memmap. This affects the watermark
4511 * and per-cpu initialisations 4544 * and per-cpu initialisations
4512 */ 4545 */
4513 memmap_pages = 4546 memmap_pages = calc_memmap_size(size, realsize);
4514 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 4547 if (freesize >= memmap_pages) {
4515 if (realsize >= memmap_pages) { 4548 freesize -= memmap_pages;
4516 realsize -= memmap_pages;
4517 if (memmap_pages) 4549 if (memmap_pages)
4518 printk(KERN_DEBUG 4550 printk(KERN_DEBUG
4519 " %s zone: %lu pages used for memmap\n", 4551 " %s zone: %lu pages used for memmap\n",
4520 zone_names[j], memmap_pages); 4552 zone_names[j], memmap_pages);
4521 } else 4553 } else
4522 printk(KERN_WARNING 4554 printk(KERN_WARNING
4523 " %s zone: %lu pages exceeds realsize %lu\n", 4555 " %s zone: %lu pages exceeds freesize %lu\n",
4524 zone_names[j], memmap_pages, realsize); 4556 zone_names[j], memmap_pages, freesize);
4525 4557
4526 /* Account for reserved pages */ 4558 /* Account for reserved pages */
4527 if (j == 0 && realsize > dma_reserve) { 4559 if (j == 0 && freesize > dma_reserve) {
4528 realsize -= dma_reserve; 4560 freesize -= dma_reserve;
4529 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4561 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4530 zone_names[0], dma_reserve); 4562 zone_names[0], dma_reserve);
4531 } 4563 }
4532 4564
4533 if (!is_highmem_idx(j)) 4565 if (!is_highmem_idx(j))
4534 nr_kernel_pages += realsize; 4566 nr_kernel_pages += freesize;
4535 nr_all_pages += realsize; 4567 /* Charge for highmem memmap if there are enough kernel pages */
4568 else if (nr_kernel_pages > memmap_pages * 2)
4569 nr_kernel_pages -= memmap_pages;
4570 nr_all_pages += freesize;
4536 4571
4537 zone->spanned_pages = size; 4572 zone->spanned_pages = size;
4538 zone->present_pages = realsize; 4573 zone->present_pages = freesize;
4574 /*
4575 * Set an approximate value for lowmem here; it will be adjusted
4576 * when the bootmem allocator frees pages into the buddy system.
4577 * All highmem pages will be managed by the buddy system.
4578 */
4579 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4539#ifdef CONFIG_NUMA 4580#ifdef CONFIG_NUMA
4540 zone->node = nid; 4581 zone->node = nid;
4541 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4582 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4542 / 100; 4583 / 100;
4543 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 4584 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4544#endif 4585#endif
4545 zone->name = zone_names[j]; 4586 zone->name = zone_names[j];
4546 spin_lock_init(&zone->lock); 4587 spin_lock_init(&zone->lock);
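
The rewrite above splits the old realsize into realsize (present pages) and freesize (present pages minus the memmap charge and, for zone 0, dma_reserve), and seeds zone->managed_pages from freesize for lowmem or realsize for highmem. A compact userspace model of that bookkeeping for a single lowmem zone follows; all of the figures, including the memmap estimate, are made up.

#include <stdio.h>

int main(void)
{
	/* Illustrative zone 0 (DMA/lowmem) figures, in pages. */
	unsigned long spanned = 262144;		/* size  */
	unsigned long absent  = 4096;		/* holes */
	unsigned long memmap_pages = 4032;	/* as if from calc_memmap_size() */
	unsigned long dma_reserve  = 1024;

	unsigned long realsize = spanned - absent;	/* present pages */
	unsigned long freesize = realsize;

	if (freesize >= memmap_pages)
		freesize -= memmap_pages;		/* charge the memmap */
	if (freesize > dma_reserve)
		freesize -= dma_reserve;		/* zone 0 only */

	printf("spanned=%lu present=%lu freesize=%lu\n", spanned, realsize, freesize);
	/* Lowmem: managed_pages is seeded with freesize and refined when bootmem
	 * releases pages; highmem: seeded with realsize, all buddy-managed. */
	printf("managed (lowmem seed)=%lu\n", freesize);
	return 0;
}

The seed is deliberately approximate for lowmem: as the comment in the hunk says, it is corrected when the bootmem allocator frees pages into the buddy system.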
@@ -4731,7 +4772,7 @@ unsigned long __init find_min_pfn_with_active_regions(void)
4731/* 4772/*
4732 * early_calculate_totalpages() 4773 * early_calculate_totalpages()
4733 * Sum pages in active regions for movable zone. 4774 * Sum pages in active regions for movable zone.
4734 * Populate N_HIGH_MEMORY for calculating usable_nodes. 4775 * Populate N_MEMORY for calculating usable_nodes.
4735 */ 4776 */
4736static unsigned long __init early_calculate_totalpages(void) 4777static unsigned long __init early_calculate_totalpages(void)
4737{ 4778{
@@ -4744,7 +4785,7 @@ static unsigned long __init early_calculate_totalpages(void)
4744 4785
4745 totalpages += pages; 4786 totalpages += pages;
4746 if (pages) 4787 if (pages)
4747 node_set_state(nid, N_HIGH_MEMORY); 4788 node_set_state(nid, N_MEMORY);
4748 } 4789 }
4749 return totalpages; 4790 return totalpages;
4750} 4791}
@@ -4761,9 +4802,9 @@ static void __init find_zone_movable_pfns_for_nodes(void)
4761 unsigned long usable_startpfn; 4802 unsigned long usable_startpfn;
4762 unsigned long kernelcore_node, kernelcore_remaining; 4803 unsigned long kernelcore_node, kernelcore_remaining;
4763 /* save the state before borrow the nodemask */ 4804 /* save the state before borrow the nodemask */
4764 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; 4805 nodemask_t saved_node_state = node_states[N_MEMORY];
4765 unsigned long totalpages = early_calculate_totalpages(); 4806 unsigned long totalpages = early_calculate_totalpages();
4766 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 4807 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
4767 4808
4768 /* 4809 /*
4769 * If movablecore was specified, calculate what size of 4810 * If movablecore was specified, calculate what size of
@@ -4798,7 +4839,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
4798restart: 4839restart:
4799 /* Spread kernelcore memory as evenly as possible throughout nodes */ 4840 /* Spread kernelcore memory as evenly as possible throughout nodes */
4800 kernelcore_node = required_kernelcore / usable_nodes; 4841 kernelcore_node = required_kernelcore / usable_nodes;
4801 for_each_node_state(nid, N_HIGH_MEMORY) { 4842 for_each_node_state(nid, N_MEMORY) {
4802 unsigned long start_pfn, end_pfn; 4843 unsigned long start_pfn, end_pfn;
4803 4844
4804 /* 4845 /*
@@ -4890,23 +4931,27 @@ restart:
4890 4931
4891out: 4932out:
4892 /* restore the node_state */ 4933 /* restore the node_state */
4893 node_states[N_HIGH_MEMORY] = saved_node_state; 4934 node_states[N_MEMORY] = saved_node_state;
4894} 4935}
4895 4936
4896/* Any regular memory on that node ? */ 4937/* Any regular or high memory on that node ? */
4897static void __init check_for_regular_memory(pg_data_t *pgdat) 4938static void check_for_memory(pg_data_t *pgdat, int nid)
4898{ 4939{
4899#ifdef CONFIG_HIGHMEM
4900 enum zone_type zone_type; 4940 enum zone_type zone_type;
4901 4941
4902 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 4942 if (N_MEMORY == N_NORMAL_MEMORY)
4943 return;
4944
4945 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
4903 struct zone *zone = &pgdat->node_zones[zone_type]; 4946 struct zone *zone = &pgdat->node_zones[zone_type];
4904 if (zone->present_pages) { 4947 if (zone->present_pages) {
4905 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 4948 node_set_state(nid, N_HIGH_MEMORY);
4949 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
4950 zone_type <= ZONE_NORMAL)
4951 node_set_state(nid, N_NORMAL_MEMORY);
4906 break; 4952 break;
4907 } 4953 }
4908 } 4954 }
4909#endif
4910} 4955}
4911 4956
4912/** 4957/**
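
check_for_memory() now walks the zones below ZONE_MOVABLE and promotes the node to N_HIGH_MEMORY as soon as one of them is populated, additionally to N_NORMAL_MEMORY when that zone is at or below ZONE_NORMAL, and returns early on configurations where the states alias each other. A small standalone model of that decision follows; the zone layout and the populated counts are invented.

#include <stdbool.h>
#include <stdio.h>

enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, ZONE_MOVABLE, MAX_NR_ZONES };

int main(void)
{
	/* A node whose only populated zone below ZONE_MOVABLE is highmem. */
	unsigned long present[MAX_NR_ZONES] = {
		[ZONE_DMA] = 0, [ZONE_NORMAL] = 0,
		[ZONE_HIGHMEM] = 65536, [ZONE_MOVABLE] = 0,
	};
	bool has_high = false, has_normal = false;
	int zone_type;

	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
		if (present[zone_type]) {
			has_high = true;		/* N_HIGH_MEMORY */
			if (zone_type <= ZONE_NORMAL)
				has_normal = true;	/* N_NORMAL_MEMORY */
			break;
		}
	}
	printf("N_HIGH_MEMORY=%d N_NORMAL_MEMORY=%d\n", has_high, has_normal);
	return 0;
}

This node is marked as having high memory but not normal memory, which is exactly the distinction the old HIGHMEM-only check_for_regular_memory() could not record for the node as a whole.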
@@ -4989,8 +5034,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4989 5034
4990 /* Any memory on that node */ 5035 /* Any memory on that node */
4991 if (pgdat->node_present_pages) 5036 if (pgdat->node_present_pages)
4992 node_set_state(nid, N_HIGH_MEMORY); 5037 node_set_state(nid, N_MEMORY);
4993 check_for_regular_memory(pgdat); 5038 check_for_memory(pgdat, nid);
4994 } 5039 }
4995} 5040}
4996 5041
@@ -5727,7 +5772,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5727 unsigned int tries = 0; 5772 unsigned int tries = 0;
5728 int ret = 0; 5773 int ret = 0;
5729 5774
5730 migrate_prep_local(); 5775 migrate_prep();
5731 5776
5732 while (pfn < end || !list_empty(&cc->migratepages)) { 5777 while (pfn < end || !list_empty(&cc->migratepages)) {
5733 if (fatal_signal_pending(current)) { 5778 if (fatal_signal_pending(current)) {
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 44db00e253ed..6d757e3a872a 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -274,7 +274,7 @@ void __init page_cgroup_init(void)
274 if (mem_cgroup_disabled()) 274 if (mem_cgroup_disabled())
275 return; 275 return;
276 276
277 for_each_node_state(nid, N_HIGH_MEMORY) { 277 for_each_node_state(nid, N_MEMORY) {
278 unsigned long start_pfn, end_pfn; 278 unsigned long start_pfn, end_pfn;
279 279
280 start_pfn = node_start_pfn(nid); 280 start_pfn = node_start_pfn(nid);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 6c118d012bb5..35aa294656cd 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ again:
58 if (!walk->pte_entry) 58 if (!walk->pte_entry)
59 continue; 59 continue;
60 60
61 split_huge_page_pmd(walk->mm, pmd); 61 split_huge_page_pmd_mm(walk->mm, addr, pmd);
62 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 62 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
diff --git a/mm/rmap.c b/mm/rmap.c
index cf7e99a87c32..face808a489e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1249,12 +1249,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1249 update_hiwater_rss(mm); 1249 update_hiwater_rss(mm);
1250 1250
1251 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 1251 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
1252 if (PageAnon(page)) 1252 if (!PageHuge(page)) {
1253 dec_mm_counter(mm, MM_ANONPAGES); 1253 if (PageAnon(page))
1254 else 1254 dec_mm_counter(mm, MM_ANONPAGES);
1255 dec_mm_counter(mm, MM_FILEPAGES); 1255 else
1256 dec_mm_counter(mm, MM_FILEPAGES);
1257 }
1256 set_pte_at(mm, address, pte, 1258 set_pte_at(mm, address, pte,
1257 swp_entry_to_pte(make_hwpoison_entry(page))); 1259 swp_entry_to_pte(make_hwpoison_entry(page)));
1258 } else if (PageAnon(page)) { 1260 } else if (PageAnon(page)) {
1259 swp_entry_t entry = { .val = page_private(page) }; 1261 swp_entry_t entry = { .val = page_private(page) };
1260 1262
diff --git a/mm/shmem.c b/mm/shmem.c
index 50c5b8f3a359..03f9ba8fb8e5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1715,6 +1715,96 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1715 return error; 1715 return error;
1716} 1716}
1717 1717
1718/*
1719 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
1720 */
1721static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
1722 pgoff_t index, pgoff_t end, int origin)
1723{
1724 struct page *page;
1725 struct pagevec pvec;
1726 pgoff_t indices[PAGEVEC_SIZE];
1727 bool done = false;
1728 int i;
1729
1730 pagevec_init(&pvec, 0);
1731 pvec.nr = 1; /* start small: we may be there already */
1732 while (!done) {
1733 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
1734 pvec.nr, pvec.pages, indices);
1735 if (!pvec.nr) {
1736 if (origin == SEEK_DATA)
1737 index = end;
1738 break;
1739 }
1740 for (i = 0; i < pvec.nr; i++, index++) {
1741 if (index < indices[i]) {
1742 if (origin == SEEK_HOLE) {
1743 done = true;
1744 break;
1745 }
1746 index = indices[i];
1747 }
1748 page = pvec.pages[i];
1749 if (page && !radix_tree_exceptional_entry(page)) {
1750 if (!PageUptodate(page))
1751 page = NULL;
1752 }
1753 if (index >= end ||
1754 (page && origin == SEEK_DATA) ||
1755 (!page && origin == SEEK_HOLE)) {
1756 done = true;
1757 break;
1758 }
1759 }
1760 shmem_deswap_pagevec(&pvec);
1761 pagevec_release(&pvec);
1762 pvec.nr = PAGEVEC_SIZE;
1763 cond_resched();
1764 }
1765 return index;
1766}
1767
1768static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
1769{
1770 struct address_space *mapping = file->f_mapping;
1771 struct inode *inode = mapping->host;
1772 pgoff_t start, end;
1773 loff_t new_offset;
1774
1775 if (origin != SEEK_DATA && origin != SEEK_HOLE)
1776 return generic_file_llseek_size(file, offset, origin,
1777 MAX_LFS_FILESIZE, i_size_read(inode));
1778 mutex_lock(&inode->i_mutex);
1779 /* We're holding i_mutex so we can access i_size directly */
1780
1781 if (offset < 0)
1782 offset = -EINVAL;
1783 else if (offset >= inode->i_size)
1784 offset = -ENXIO;
1785 else {
1786 start = offset >> PAGE_CACHE_SHIFT;
1787 end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1788 new_offset = shmem_seek_hole_data(mapping, start, end, origin);
1789 new_offset <<= PAGE_CACHE_SHIFT;
1790 if (new_offset > offset) {
1791 if (new_offset < inode->i_size)
1792 offset = new_offset;
1793 else if (origin == SEEK_DATA)
1794 offset = -ENXIO;
1795 else
1796 offset = inode->i_size;
1797 }
1798 }
1799
1800 if (offset >= 0 && offset != file->f_pos) {
1801 file->f_pos = offset;
1802 file->f_version = 0;
1803 }
1804 mutex_unlock(&inode->i_mutex);
1805 return offset;
1806}
1807
1718static long shmem_fallocate(struct file *file, int mode, loff_t offset, 1808static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1719 loff_t len) 1809 loff_t len)
1720{ 1810{
@@ -2586,7 +2676,7 @@ static const struct address_space_operations shmem_aops = {
2586static const struct file_operations shmem_file_operations = { 2676static const struct file_operations shmem_file_operations = {
2587 .mmap = shmem_mmap, 2677 .mmap = shmem_mmap,
2588#ifdef CONFIG_TMPFS 2678#ifdef CONFIG_TMPFS
2589 .llseek = generic_file_llseek, 2679 .llseek = shmem_file_llseek,
2590 .read = do_sync_read, 2680 .read = do_sync_read,
2591 .write = do_sync_write, 2681 .write = do_sync_write,
2592 .aio_read = shmem_file_aio_read, 2682 .aio_read = shmem_file_aio_read,
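
With shmem_file_llseek() wired into shmem_file_operations, tmpfs files gain real SEEK_DATA/SEEK_HOLE support instead of the generic_file_llseek() fallback. The sketch below shows how the new behaviour could be exercised from userspace; the /dev/shm path is assumed to be a tmpfs mount, the offsets are arbitrary, and SEEK_DATA/SEEK_HOLE need _GNU_SOURCE plus a kernel carrying this change.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/dev/shm/seek-demo";	/* assumed tmpfs mount */
	char buf[4096] = { 1 };
	int fd = open(path, O_CREAT | O_RDWR | O_TRUNC, 0600);

	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}
	/* One data page at offset 1 MiB inside an otherwise sparse 4 MiB file. */
	if (pwrite(fd, buf, sizeof(buf), 1024 * 1024) < 0 ||
	    ftruncate(fd, 4 * 1024 * 1024) < 0) {
		perror("setup");
		return EXIT_FAILURE;
	}

	printf("first data at %lld\n", (long long)lseek(fd, 0, SEEK_DATA));
	printf("hole after it at %lld\n",
	       (long long)lseek(fd, 1024 * 1024, SEEK_HOLE));

	close(fd);
	unlink(path);
	return 0;
}

On a kernel with this change the first call should report data at the 1 MiB mark and the second a hole immediately after the single written page.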
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 157bb116dec8..7f3096137b8a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3131,7 +3131,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
3131 int nid; 3131 int nid;
3132 3132
3133 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 3133 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
3134 for_each_node_state(nid, N_HIGH_MEMORY) { 3134 for_each_node_state(nid, N_MEMORY) {
3135 pg_data_t *pgdat = NODE_DATA(nid); 3135 pg_data_t *pgdat = NODE_DATA(nid);
3136 const struct cpumask *mask; 3136 const struct cpumask *mask;
3137 3137
@@ -3187,7 +3187,7 @@ static int __init kswapd_init(void)
3187 int nid; 3187 int nid;
3188 3188
3189 swap_setup(); 3189 swap_setup();
3190 for_each_node_state(nid, N_HIGH_MEMORY) 3190 for_each_node_state(nid, N_MEMORY)
3191 kswapd_run(nid); 3191 kswapd_run(nid);
3192 hotcpu_notifier(cpu_callback, 0); 3192 hotcpu_notifier(cpu_callback, 0);
3193 return 0; 3193 return 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c7370579111b..df14808f0a36 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -801,6 +801,8 @@ const char * const vmstat_text[] = {
801 "thp_collapse_alloc", 801 "thp_collapse_alloc",
802 "thp_collapse_alloc_failed", 802 "thp_collapse_alloc_failed",
803 "thp_split", 803 "thp_split",
804 "thp_zero_page_alloc",
805 "thp_zero_page_alloc_failed",
804#endif 806#endif
805 807
806#endif /* CONFIG_VM_EVENTS_COUNTERS */ 808#endif /* CONFIG_VM_EVENTS_COUNTERS */
@@ -930,7 +932,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
930 pg_data_t *pgdat = (pg_data_t *)arg; 932 pg_data_t *pgdat = (pg_data_t *)arg;
931 933
932 /* check memoryless node */ 934 /* check memoryless node */
933 if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) 935 if (!node_state(pgdat->node_id, N_MEMORY))
934 return 0; 936 return 0;
935 937
936 seq_printf(m, "Page block order: %d\n", pageblock_order); 938 seq_printf(m, "Page block order: %d\n", pageblock_order);
@@ -992,14 +994,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
992 "\n high %lu" 994 "\n high %lu"
993 "\n scanned %lu" 995 "\n scanned %lu"
994 "\n spanned %lu" 996 "\n spanned %lu"
995 "\n present %lu", 997 "\n present %lu"
998 "\n managed %lu",
996 zone_page_state(zone, NR_FREE_PAGES), 999 zone_page_state(zone, NR_FREE_PAGES),
997 min_wmark_pages(zone), 1000 min_wmark_pages(zone),
998 low_wmark_pages(zone), 1001 low_wmark_pages(zone),
999 high_wmark_pages(zone), 1002 high_wmark_pages(zone),
1000 zone->pages_scanned, 1003 zone->pages_scanned,
1001 zone->spanned_pages, 1004 zone->spanned_pages,
1002 zone->present_pages); 1005 zone->present_pages,
1006 zone->managed_pages);
1003 1007
1004 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 1008 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1005 seq_printf(m, "\n %-12s %lu", vmstat_text[i], 1009 seq_printf(m, "\n %-12s %lu", vmstat_text[i],
@@ -1292,7 +1296,7 @@ static int unusable_show(struct seq_file *m, void *arg)
1292 pg_data_t *pgdat = (pg_data_t *)arg; 1296 pg_data_t *pgdat = (pg_data_t *)arg;
1293 1297
1294 /* check memoryless node */ 1298 /* check memoryless node */
1295 if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) 1299 if (!node_state(pgdat->node_id, N_MEMORY))
1296 return 0; 1300 return 0;
1297 1301
1298 walk_zones_in_node(m, pgdat, unusable_show_print); 1302 walk_zones_in_node(m, pgdat, unusable_show_print);
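
Both the show_free_areas() output and /proc/zoneinfo now print managed next to present, so the new counter is visible from userspace. Below is a small, deliberately naive reader that echoes the per-zone spanned/present/managed lines; the field names follow the format strings in the hunks above, and the managed line only appears on kernels that carry this series.

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/zoneinfo", "r");
	char line[256];

	if (!f) {
		perror("/proc/zoneinfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* Keep the "Node N, zone X" headers and the three size counters. */
		if (!strncmp(line, "Node", 4) ||
		    strstr(line, "spanned") ||
		    strstr(line, "present") ||
		    strstr(line, "managed"))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}

Comparing present against managed per zone shows how many pages are spoken for by memmap, DMA reserves and the boot allocator rather than being available to the buddy system.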