52 files changed, 996 insertions, 422 deletions
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt index cefd3d8bbd11..12e01d432bfe 100644 --- a/Documentation/cgroups/cpusets.txt +++ b/Documentation/cgroups/cpusets.txt | |||
@@ -218,7 +218,7 @@ and name space for cpusets, with a minimum of additional kernel code. | |||
218 | The cpus and mems files in the root (top_cpuset) cpuset are | 218 | The cpus and mems files in the root (top_cpuset) cpuset are |
219 | read-only. The cpus file automatically tracks the value of | 219 | read-only. The cpus file automatically tracks the value of |
220 | cpu_online_mask using a CPU hotplug notifier, and the mems file | 220 | cpu_online_mask using a CPU hotplug notifier, and the mems file |
221 | automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e., | 221 | automatically tracks the value of node_states[N_MEMORY]--i.e., |
222 | nodes with memory--using the cpuset_track_online_nodes() hook. | 222 | nodes with memory--using the cpuset_track_online_nodes() hook. |
223 | 223 | ||
224 | 224 | ||
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index c6f993d491b5..8e5eacbdcfa3 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt | |||
@@ -390,6 +390,7 @@ struct memory_notify { | |||
390 | unsigned long start_pfn; | 390 | unsigned long start_pfn; |
391 | unsigned long nr_pages; | 391 | unsigned long nr_pages; |
392 | int status_change_nid_normal; | 392 | int status_change_nid_normal; |
393 | int status_change_nid_high; | ||
393 | int status_change_nid; | 394 | int status_change_nid; |
394 | } | 395 | } |
395 | 396 | ||
@@ -397,7 +398,9 @@ start_pfn is start_pfn of online/offline memory. | |||
397 | nr_pages is # of pages of online/offline memory. | 398 | nr_pages is # of pages of online/offline memory. |
398 | status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask | 399 | status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask |
399 | is (will be) set/clear, if this is -1, then nodemask status is not changed. | 400 | is (will be) set/clear, if this is -1, then nodemask status is not changed. |
400 | status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be) | 401 | status_change_nid_high is set node id when N_HIGH_MEMORY of nodemask |
402 | is (will be) set/clear, if this is -1, then nodemask status is not changed. | ||
403 | status_change_nid is set node id when N_MEMORY of nodemask is (will be) | ||
401 | set/clear. It means a new(memoryless) node gets new memory by online and a | 404 | set/clear. It means a new(memoryless) node gets new memory by online and a |
402 | node loses all memory. If this is -1, then nodemask status is not changed. | 405 | node loses all memory. If this is -1, then nodemask status is not changed. |
403 | If status_changed_nid* >= 0, callback should create/discard structures for the | 406 | If status_changed_nid* >= 0, callback should create/discard structures for the |
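For illustration, here is a hedged sketch of a hotplug callback that consumes these fields. The callback and the two per-node helpers are hypothetical; struct memory_notify, MEM_ONLINE/MEM_OFFLINE, hotplug_memory_notifier() and NOTIFY_OK are the existing interfaces the text above describes.

    #include <linux/memory.h>
    #include <linux/notifier.h>

    /* Hypothetical per-node bookkeeping: placeholders for whatever state a
     * subsystem creates or discards when a node gains or loses memory. */
    static void example_alloc_per_node(int nid) { }
    static void example_free_per_node(int nid) { }

    static int example_memory_callback(struct notifier_block *self,
                                       unsigned long action, void *arg)
    {
            struct memory_notify *mn = arg;

            switch (action) {
            case MEM_ONLINE:
                    /* status_change_nid >= 0: a memoryless node just gained
                     * its first memory. */
                    if (mn->status_change_nid >= 0)
                            example_alloc_per_node(mn->status_change_nid);
                    break;
            case MEM_OFFLINE:
                    /* status_change_nid >= 0: the node lost all its memory. */
                    if (mn->status_change_nid >= 0)
                            example_free_per_node(mn->status_change_nid);
                    break;
            }
            return NOTIFY_OK;
    }

    /* Registered once at init time, e.g.:
     *      hotplug_memory_notifier(example_memory_callback, 0);
     */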
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index f734bb2a78dc..8785fb87d9c7 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt | |||
@@ -116,6 +116,13 @@ echo always >/sys/kernel/mm/transparent_hugepage/defrag | |||
116 | echo madvise >/sys/kernel/mm/transparent_hugepage/defrag | 116 | echo madvise >/sys/kernel/mm/transparent_hugepage/defrag |
117 | echo never >/sys/kernel/mm/transparent_hugepage/defrag | 117 | echo never >/sys/kernel/mm/transparent_hugepage/defrag |
118 | 118 | ||
119 | By default kernel tries to use huge zero page on read page fault. | ||
120 | It's possible to disable huge zero page by writing 0 or enable it | ||
121 | back by writing 1: | ||
122 | |||
123 | echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page | ||
124 | echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page | ||
125 | |||
119 | khugepaged will be automatically started when | 126 | khugepaged will be automatically started when |
120 | transparent_hugepage/enabled is set to "always" or "madvise, and it'll | 127 | transparent_hugepage/enabled is set to "always" or "madvise, and it'll |
121 | be automatically shutdown if it's set to "never". | 128 | be automatically shutdown if it's set to "never". |
@@ -197,6 +204,14 @@ thp_split is incremented every time a huge page is split into base | |||
197 | pages. This can happen for a variety of reasons but a common | 204 | pages. This can happen for a variety of reasons but a common |
198 | reason is that a huge page is old and is being reclaimed. | 205 | reason is that a huge page is old and is being reclaimed. |
199 | 206 | ||
207 | thp_zero_page_alloc is incremented every time a huge zero page is | ||
208 | successfully allocated. It includes allocations which where | ||
209 | dropped due race with other allocation. Note, it doesn't count | ||
210 | every map of the huge zero page, only its allocation. | ||
211 | |||
212 | thp_zero_page_alloc_failed is incremented if kernel fails to allocate | ||
213 | huge zero page and falls back to using small pages. | ||
214 | |||
200 | As the system ages, allocating huge pages may be expensive as the | 215 | As the system ages, allocating huge pages may be expensive as the |
201 | system uses memory compaction to copy data around memory to free a | 216 | system uses memory compaction to copy data around memory to free a |
202 | huge page for use. There are some counters in /proc/vmstat to help | 217 | huge page for use. There are some counters in /proc/vmstat to help |
@@ -276,7 +291,7 @@ unaffected. libhugetlbfs will also work fine as usual. | |||
276 | == Graceful fallback == | 291 | == Graceful fallback == |
277 | 292 | ||
278 | Code walking pagetables but unware about huge pmds can simply call | 293 | Code walking pagetables but unware about huge pmds can simply call |
279 | split_huge_page_pmd(mm, pmd) where the pmd is the one returned by | 294 | split_huge_page_pmd(vma, addr, pmd) where the pmd is the one returned by |
280 | pmd_offset. It's trivial to make the code transparent hugepage aware | 295 | pmd_offset. It's trivial to make the code transparent hugepage aware |
281 | by just grepping for "pmd_offset" and adding split_huge_page_pmd where | 296 | by just grepping for "pmd_offset" and adding split_huge_page_pmd where |
282 | missing after pmd_offset returns the pmd. Thanks to the graceful | 297 | missing after pmd_offset returns the pmd. Thanks to the graceful |
@@ -299,7 +314,7 @@ diff --git a/mm/mremap.c b/mm/mremap.c | |||
299 | return NULL; | 314 | return NULL; |
300 | 315 | ||
301 | pmd = pmd_offset(pud, addr); | 316 | pmd = pmd_offset(pud, addr); |
302 | + split_huge_page_pmd(mm, pmd); | 317 | + split_huge_page_pmd(vma, addr, pmd); |
303 | if (pmd_none_or_clear_bad(pmd)) | 318 | if (pmd_none_or_clear_bad(pmd)) |
304 | return NULL; | 319 | return NULL; |
305 | 320 | ||
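The mm/mremap.c excerpt above shows the pattern in diff form; as a further illustration, a minimal sketch of a pte lookup made huge-pmd safe by the new three-argument split_huge_page_pmd(). The function name is invented; the page-table helpers are the standard ones the paragraph refers to.

    #include <linux/mm.h>
    #include <linux/huge_mm.h>

    /* Walk to the pte for 'addr', splitting a transparent huge pmd on the
     * way so the ordinary pte path below keeps working unchanged. */
    static pte_t *example_walk_to_pte(struct vm_area_struct *vma,
                                      unsigned long addr, spinlock_t **ptlp)
    {
            struct mm_struct *mm = vma->vm_mm;
            pgd_t *pgd;
            pud_t *pud;
            pmd_t *pmd;

            pgd = pgd_offset(mm, addr);
            if (pgd_none_or_clear_bad(pgd))
                    return NULL;
            pud = pud_offset(pgd, addr);
            if (pud_none_or_clear_bad(pud))
                    return NULL;
            pmd = pmd_offset(pud, addr);
            /* No-op unless *pmd is transparent huge; needs the vma now. */
            split_huge_page_pmd(vma, addr, pmd);
            if (pmd_none_or_clear_bad(pmd))
                    return NULL;
            return pte_offset_map_lock(mm, pmd, addr, ptlp);
    }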
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index c02158be836c..14490e9443af 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h | |||
@@ -76,16 +76,7 @@ extern unsigned long zero_page_mask; | |||
76 | 76 | ||
77 | #define ZERO_PAGE(vaddr) \ | 77 | #define ZERO_PAGE(vaddr) \ |
78 | (virt_to_page((void *)(empty_zero_page + (((unsigned long)(vaddr)) & zero_page_mask)))) | 78 | (virt_to_page((void *)(empty_zero_page + (((unsigned long)(vaddr)) & zero_page_mask)))) |
79 | 79 | #define __HAVE_COLOR_ZERO_PAGE | |
80 | #define is_zero_pfn is_zero_pfn | ||
81 | static inline int is_zero_pfn(unsigned long pfn) | ||
82 | { | ||
83 | extern unsigned long zero_pfn; | ||
84 | unsigned long offset_from_zero_pfn = pfn - zero_pfn; | ||
85 | return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); | ||
86 | } | ||
87 | |||
88 | #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) | ||
89 | 80 | ||
90 | extern void paging_init(void); | 81 | extern void paging_init(void); |
91 | 82 | ||
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 0a6b28336eb0..3a8489a354e9 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c | |||
@@ -113,19 +113,6 @@ static int store_updates_sp(struct pt_regs *regs) | |||
113 | #define MM_FAULT_CONTINUE -1 | 113 | #define MM_FAULT_CONTINUE -1 |
114 | #define MM_FAULT_ERR(sig) (sig) | 114 | #define MM_FAULT_ERR(sig) (sig) |
115 | 115 | ||
116 | static int out_of_memory(struct pt_regs *regs) | ||
117 | { | ||
118 | /* | ||
119 | * We ran out of memory, or some other thing happened to us that made | ||
120 | * us unable to handle the page fault gracefully. | ||
121 | */ | ||
122 | up_read(¤t->mm->mmap_sem); | ||
123 | if (!user_mode(regs)) | ||
124 | return MM_FAULT_ERR(SIGKILL); | ||
125 | pagefault_out_of_memory(); | ||
126 | return MM_FAULT_RETURN; | ||
127 | } | ||
128 | |||
129 | static int do_sigbus(struct pt_regs *regs, unsigned long address) | 116 | static int do_sigbus(struct pt_regs *regs, unsigned long address) |
130 | { | 117 | { |
131 | siginfo_t info; | 118 | siginfo_t info; |
@@ -169,8 +156,18 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) | |||
169 | return MM_FAULT_CONTINUE; | 156 | return MM_FAULT_CONTINUE; |
170 | 157 | ||
171 | /* Out of memory */ | 158 | /* Out of memory */ |
172 | if (fault & VM_FAULT_OOM) | 159 | if (fault & VM_FAULT_OOM) { |
173 | return out_of_memory(regs); | 160 | up_read(¤t->mm->mmap_sem); |
161 | |||
162 | /* | ||
163 | * We ran out of memory, or some other thing happened to us that | ||
164 | * made us unable to handle the page fault gracefully. | ||
165 | */ | ||
166 | if (!user_mode(regs)) | ||
167 | return MM_FAULT_ERR(SIGKILL); | ||
168 | pagefault_out_of_memory(); | ||
169 | return MM_FAULT_RETURN; | ||
170 | } | ||
174 | 171 | ||
175 | /* Bus error. x86 handles HWPOISON here, we'll add this if/when | 172 | /* Bus error. x86 handles HWPOISON here, we'll add this if/when |
176 | * we support the feature in HW | 173 | * we support the feature in HW |
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2d3b7cb26005..c814e6f5b57d 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h | |||
@@ -55,16 +55,7 @@ extern unsigned long zero_page_mask; | |||
55 | #define ZERO_PAGE(vaddr) \ | 55 | #define ZERO_PAGE(vaddr) \ |
56 | (virt_to_page((void *)(empty_zero_page + \ | 56 | (virt_to_page((void *)(empty_zero_page + \ |
57 | (((unsigned long)(vaddr)) &zero_page_mask)))) | 57 | (((unsigned long)(vaddr)) &zero_page_mask)))) |
58 | 58 | #define __HAVE_COLOR_ZERO_PAGE | |
59 | #define is_zero_pfn is_zero_pfn | ||
60 | static inline int is_zero_pfn(unsigned long pfn) | ||
61 | { | ||
62 | extern unsigned long zero_pfn; | ||
63 | unsigned long offset_from_zero_pfn = pfn - zero_pfn; | ||
64 | return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); | ||
65 | } | ||
66 | |||
67 | #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) | ||
68 | 59 | ||
69 | #endif /* !__ASSEMBLY__ */ | 60 | #endif /* !__ASSEMBLY__ */ |
70 | 61 | ||
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index cbbdcad8fcb3..1f49c28affa9 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c | |||
@@ -301,17 +301,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, | |||
301 | __bad_area(regs, error_code, address, SEGV_ACCERR); | 301 | __bad_area(regs, error_code, address, SEGV_ACCERR); |
302 | } | 302 | } |
303 | 303 | ||
304 | static void out_of_memory(void) | ||
305 | { | ||
306 | /* | ||
307 | * We ran out of memory, call the OOM killer, and return the userspace | ||
308 | * (which will retry the fault, or kill us if we got oom-killed): | ||
309 | */ | ||
310 | up_read(¤t->mm->mmap_sem); | ||
311 | |||
312 | pagefault_out_of_memory(); | ||
313 | } | ||
314 | |||
315 | static void | 304 | static void |
316 | do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) | 305 | do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) |
317 | { | 306 | { |
@@ -353,8 +342,14 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, | |||
353 | no_context(regs, error_code, address); | 342 | no_context(regs, error_code, address); |
354 | return 1; | 343 | return 1; |
355 | } | 344 | } |
345 | up_read(¤t->mm->mmap_sem); | ||
356 | 346 | ||
357 | out_of_memory(); | 347 | /* |
348 | * We ran out of memory, call the OOM killer, and return the | ||
349 | * userspace (which will retry the fault, or kill us if we got | ||
350 | * oom-killed): | ||
351 | */ | ||
352 | pagefault_out_of_memory(); | ||
358 | } else { | 353 | } else { |
359 | if (fault & VM_FAULT_SIGBUS) | 354 | if (fault & VM_FAULT_SIGBUS) |
360 | do_sigbus(regs, error_code, address); | 355 | do_sigbus(regs, error_code, address); |
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5c9687b1bde6..1dfe69cc78a8 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c | |||
@@ -182,7 +182,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) | |||
182 | if (pud_none_or_clear_bad(pud)) | 182 | if (pud_none_or_clear_bad(pud)) |
183 | goto out; | 183 | goto out; |
184 | pmd = pmd_offset(pud, 0xA0000); | 184 | pmd = pmd_offset(pud, 0xA0000); |
185 | split_huge_page_pmd(mm, pmd); | 185 | split_huge_page_pmd_mm(mm, 0xA0000, pmd); |
186 | if (pmd_none_or_clear_bad(pmd)) | 186 | if (pmd_none_or_clear_bad(pmd)) |
187 | goto out; | 187 | goto out; |
188 | pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); | 188 | pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); |
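This call site only has the mm, not a vma, which is what the new split_huge_page_pmd_mm() helper is for. Its body is not part of this listing; a sketch consistent with the usage above, assuming it merely resolves the vma with find_vma() and defers to the vma-based macro:

    /* Sketch only: split a huge pmd given just the mm and the address. */
    void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
                                pmd_t *pmd)
    {
            struct vm_area_struct *vma;

            vma = find_vma(mm, address);
            if (vma && vma->vm_start <= address)
                    split_huge_page_pmd(vma, address, pmd);
    }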
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7a529cbab7ad..027088f2f7dd 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -803,20 +803,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, | |||
803 | __bad_area(regs, error_code, address, SEGV_ACCERR); | 803 | __bad_area(regs, error_code, address, SEGV_ACCERR); |
804 | } | 804 | } |
805 | 805 | ||
806 | /* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ | ||
807 | static void | ||
808 | out_of_memory(struct pt_regs *regs, unsigned long error_code, | ||
809 | unsigned long address) | ||
810 | { | ||
811 | /* | ||
812 | * We ran out of memory, call the OOM killer, and return the userspace | ||
813 | * (which will retry the fault, or kill us if we got oom-killed): | ||
814 | */ | ||
815 | up_read(¤t->mm->mmap_sem); | ||
816 | |||
817 | pagefault_out_of_memory(); | ||
818 | } | ||
819 | |||
820 | static void | 806 | static void |
821 | do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, | 807 | do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, |
822 | unsigned int fault) | 808 | unsigned int fault) |
@@ -879,7 +865,14 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, | |||
879 | return 1; | 865 | return 1; |
880 | } | 866 | } |
881 | 867 | ||
882 | out_of_memory(regs, error_code, address); | 868 | up_read(¤t->mm->mmap_sem); |
869 | |||
870 | /* | ||
871 | * We ran out of memory, call the OOM killer, and return the | ||
872 | * userspace (which will retry the fault, or kill us if we got | ||
873 | * oom-killed): | ||
874 | */ | ||
875 | pagefault_out_of_memory(); | ||
883 | } else { | 876 | } else { |
884 | if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| | 877 | if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| |
885 | VM_FAULT_HWPOISON_LARGE)) | 878 | VM_FAULT_HWPOISON_LARGE)) |
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3baff255adac..2ead3c8a4c84 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -630,7 +630,9 @@ void __init paging_init(void) | |||
630 | * numa support is not compiled in, and later node_set_state | 630 | * numa support is not compiled in, and later node_set_state |
631 | * will not set it back. | 631 | * will not set it back. |
632 | */ | 632 | */ |
633 | node_clear_state(0, N_NORMAL_MEMORY); | 633 | node_clear_state(0, N_MEMORY); |
634 | if (N_MEMORY != N_NORMAL_MEMORY) | ||
635 | node_clear_state(0, N_NORMAL_MEMORY); | ||
634 | 636 | ||
635 | zone_sizes_init(); | 637 | zone_sizes_init(); |
636 | } | 638 | } |
diff --git a/drivers/base/node.c b/drivers/base/node.c index 294e31626210..fac124a7e1c5 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c | |||
@@ -227,7 +227,7 @@ static node_registration_func_t __hugetlb_unregister_node; | |||
227 | static inline bool hugetlb_register_node(struct node *node) | 227 | static inline bool hugetlb_register_node(struct node *node) |
228 | { | 228 | { |
229 | if (__hugetlb_register_node && | 229 | if (__hugetlb_register_node && |
230 | node_state(node->dev.id, N_HIGH_MEMORY)) { | 230 | node_state(node->dev.id, N_MEMORY)) { |
231 | __hugetlb_register_node(node); | 231 | __hugetlb_register_node(node); |
232 | return true; | 232 | return true; |
233 | } | 233 | } |
@@ -644,6 +644,9 @@ static struct node_attr node_state_attr[] = { | |||
644 | #ifdef CONFIG_HIGHMEM | 644 | #ifdef CONFIG_HIGHMEM |
645 | [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY), | 645 | [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY), |
646 | #endif | 646 | #endif |
647 | #ifdef CONFIG_MOVABLE_NODE | ||
648 | [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY), | ||
649 | #endif | ||
647 | [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), | 650 | [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), |
648 | }; | 651 | }; |
649 | 652 | ||
@@ -654,6 +657,9 @@ static struct attribute *node_state_attrs[] = { | |||
654 | #ifdef CONFIG_HIGHMEM | 657 | #ifdef CONFIG_HIGHMEM |
655 | &node_state_attr[N_HIGH_MEMORY].attr.attr, | 658 | &node_state_attr[N_HIGH_MEMORY].attr.attr, |
656 | #endif | 659 | #endif |
660 | #ifdef CONFIG_MOVABLE_NODE | ||
661 | &node_state_attr[N_MEMORY].attr.attr, | ||
662 | #endif | ||
657 | &node_state_attr[N_CPU].attr.attr, | 663 | &node_state_attr[N_CPU].attr.attr, |
658 | NULL | 664 | NULL |
659 | }; | 665 | }; |
diff --git a/fs/buffer.c b/fs/buffer.c index 6e9ed48064fc..c017a2dfb909 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -46,8 +46,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); | |||
46 | 46 | ||
47 | #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) | 47 | #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) |
48 | 48 | ||
49 | inline void | 49 | void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) |
50 | init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) | ||
51 | { | 50 | { |
52 | bh->b_end_io = handler; | 51 | bh->b_end_io = handler; |
53 | bh->b_private = private; | 52 | bh->b_private = private; |
@@ -850,13 +849,10 @@ try_again: | |||
850 | if (!bh) | 849 | if (!bh) |
851 | goto no_grow; | 850 | goto no_grow; |
852 | 851 | ||
853 | bh->b_bdev = NULL; | ||
854 | bh->b_this_page = head; | 852 | bh->b_this_page = head; |
855 | bh->b_blocknr = -1; | 853 | bh->b_blocknr = -1; |
856 | head = bh; | 854 | head = bh; |
857 | 855 | ||
858 | bh->b_state = 0; | ||
859 | atomic_set(&bh->b_count, 0); | ||
860 | bh->b_size = size; | 856 | bh->b_size = size; |
861 | 857 | ||
862 | /* Link the buffer to its page */ | 858 | /* Link the buffer to its page */ |
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3e3422f7f0a4..310972b72a66 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -1034,7 +1034,7 @@ int bdi_writeback_thread(void *data) | |||
1034 | while (!kthread_freezable_should_stop(NULL)) { | 1034 | while (!kthread_freezable_should_stop(NULL)) { |
1035 | /* | 1035 | /* |
1036 | * Remove own delayed wake-up timer, since we are already awake | 1036 | * Remove own delayed wake-up timer, since we are already awake |
1037 | * and we'll take care of the preriodic write-back. | 1037 | * and we'll take care of the periodic write-back. |
1038 | */ | 1038 | */ |
1039 | del_timer(&wb->wakeup_timer); | 1039 | del_timer(&wb->wakeup_timer); |
1040 | 1040 | ||
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 86c67eee439f..e96d4f18ca3a 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c | |||
@@ -249,7 +249,7 @@ static int kcore_update_ram(void) | |||
249 | /* Not inialized....update now */ | 249 | /* Not inialized....update now */ |
250 | /* find out "max pfn" */ | 250 | /* find out "max pfn" */ |
251 | end_pfn = 0; | 251 | end_pfn = 0; |
252 | for_each_node_state(nid, N_HIGH_MEMORY) { | 252 | for_each_node_state(nid, N_MEMORY) { |
253 | unsigned long node_end; | 253 | unsigned long node_end; |
254 | node_end = NODE_DATA(nid)->node_start_pfn + | 254 | node_end = NODE_DATA(nid)->node_start_pfn + |
255 | NODE_DATA(nid)->node_spanned_pages; | 255 | NODE_DATA(nid)->node_spanned_pages; |
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 90c63f9392a5..48775628abbf 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -643,7 +643,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, | |||
643 | spinlock_t *ptl; | 643 | spinlock_t *ptl; |
644 | struct page *page; | 644 | struct page *page; |
645 | 645 | ||
646 | split_huge_page_pmd(walk->mm, pmd); | 646 | split_huge_page_pmd(vma, addr, pmd); |
647 | if (pmd_trans_unstable(pmd)) | 647 | if (pmd_trans_unstable(pmd)) |
648 | return 0; | 648 | return 0; |
649 | 649 | ||
@@ -1126,7 +1126,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, | |||
1126 | return NULL; | 1126 | return NULL; |
1127 | 1127 | ||
1128 | nid = page_to_nid(page); | 1128 | nid = page_to_nid(page); |
1129 | if (!node_isset(nid, node_states[N_HIGH_MEMORY])) | 1129 | if (!node_isset(nid, node_states[N_MEMORY])) |
1130 | return NULL; | 1130 | return NULL; |
1131 | 1131 | ||
1132 | return page; | 1132 | return page; |
@@ -1279,7 +1279,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) | |||
1279 | if (md->writeback) | 1279 | if (md->writeback) |
1280 | seq_printf(m, " writeback=%lu", md->writeback); | 1280 | seq_printf(m, " writeback=%lu", md->writeback); |
1281 | 1281 | ||
1282 | for_each_node_state(n, N_HIGH_MEMORY) | 1282 | for_each_node_state(n, N_MEMORY) |
1283 | if (md->node[n]) | 1283 | if (md->node[n]) |
1284 | seq_printf(m, " N%d=%lu", n, md->node[n]); | 1284 | seq_printf(m, " N%d=%lu", n, md->node[n]); |
1285 | out: | 1285 | out: |
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index b36ce40bd1c6..284e80831d2c 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
@@ -449,6 +449,32 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, | |||
449 | unsigned long size); | 449 | unsigned long size); |
450 | #endif | 450 | #endif |
451 | 451 | ||
452 | #ifdef __HAVE_COLOR_ZERO_PAGE | ||
453 | static inline int is_zero_pfn(unsigned long pfn) | ||
454 | { | ||
455 | extern unsigned long zero_pfn; | ||
456 | unsigned long offset_from_zero_pfn = pfn - zero_pfn; | ||
457 | return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); | ||
458 | } | ||
459 | |||
460 | static inline unsigned long my_zero_pfn(unsigned long addr) | ||
461 | { | ||
462 | return page_to_pfn(ZERO_PAGE(addr)); | ||
463 | } | ||
464 | #else | ||
465 | static inline int is_zero_pfn(unsigned long pfn) | ||
466 | { | ||
467 | extern unsigned long zero_pfn; | ||
468 | return pfn == zero_pfn; | ||
469 | } | ||
470 | |||
471 | static inline unsigned long my_zero_pfn(unsigned long addr) | ||
472 | { | ||
473 | extern unsigned long zero_pfn; | ||
474 | return zero_pfn; | ||
475 | } | ||
476 | #endif | ||
477 | |||
452 | #ifdef CONFIG_MMU | 478 | #ifdef CONFIG_MMU |
453 | 479 | ||
454 | #ifndef CONFIG_TRANSPARENT_HUGEPAGE | 480 | #ifndef CONFIG_TRANSPARENT_HUGEPAGE |
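As a usage illustration (not part of this patch), a short helper that recognises zero-page mappings in either configuration; the name is invented, pte_present() and pte_pfn() are standard helpers:

    /* True when the pte maps one of the kernel's zero pages, whether the
     * architecture has a single zero page or a coloured set
     * (__HAVE_COLOR_ZERO_PAGE, as on mips and s390 above). */
    static inline bool example_pte_is_zero_page(pte_t pte)
    {
            return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
    }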
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 7b74452c5317..3f778c27f825 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h | |||
@@ -137,9 +137,6 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, | |||
137 | #define alloc_bootmem_low_pages_node(pgdat, x) \ | 137 | #define alloc_bootmem_low_pages_node(pgdat, x) \ |
138 | __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) | 138 | __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) |
139 | 139 | ||
140 | extern int reserve_bootmem_generic(unsigned long addr, unsigned long size, | ||
141 | int flags); | ||
142 | |||
143 | #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP | 140 | #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP |
144 | extern void *alloc_remap(int nid, unsigned long size); | 141 | extern void *alloc_remap(int nid, unsigned long size); |
145 | #else | 142 | #else |
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 838320fc3d1d..8c8a60d29407 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h | |||
@@ -144,7 +144,7 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) | |||
144 | return node_possible_map; | 144 | return node_possible_map; |
145 | } | 145 | } |
146 | 146 | ||
147 | #define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY]) | 147 | #define cpuset_current_mems_allowed (node_states[N_MEMORY]) |
148 | static inline void cpuset_init_current_mems_allowed(void) {} | 148 | static inline void cpuset_init_current_mems_allowed(void) {} |
149 | 149 | ||
150 | static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) | 150 | static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) |
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 31e8041274f6..f74856e17e48 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h | |||
@@ -34,6 +34,7 @@ struct vm_area_struct; | |||
34 | #define ___GFP_NO_KSWAPD 0x400000u | 34 | #define ___GFP_NO_KSWAPD 0x400000u |
35 | #define ___GFP_OTHER_NODE 0x800000u | 35 | #define ___GFP_OTHER_NODE 0x800000u |
36 | #define ___GFP_WRITE 0x1000000u | 36 | #define ___GFP_WRITE 0x1000000u |
37 | /* If the above are modified, __GFP_BITS_SHIFT may need updating */ | ||
37 | 38 | ||
38 | /* | 39 | /* |
39 | * GFP bitmasks.. | 40 | * GFP bitmasks.. |
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1af477552459..092dc5305a32 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -39,6 +39,7 @@ enum transparent_hugepage_flag { | |||
39 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, | 39 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, |
40 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, | 40 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, |
41 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, | 41 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, |
42 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, | ||
42 | #ifdef CONFIG_DEBUG_VM | 43 | #ifdef CONFIG_DEBUG_VM |
43 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, | 44 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, |
44 | #endif | 45 | #endif |
@@ -78,6 +79,9 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); | |||
78 | (transparent_hugepage_flags & \ | 79 | (transparent_hugepage_flags & \ |
79 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) && \ | 80 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) && \ |
80 | (__vma)->vm_flags & VM_HUGEPAGE)) | 81 | (__vma)->vm_flags & VM_HUGEPAGE)) |
82 | #define transparent_hugepage_use_zero_page() \ | ||
83 | (transparent_hugepage_flags & \ | ||
84 | (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) | ||
81 | #ifdef CONFIG_DEBUG_VM | 85 | #ifdef CONFIG_DEBUG_VM |
82 | #define transparent_hugepage_debug_cow() \ | 86 | #define transparent_hugepage_debug_cow() \ |
83 | (transparent_hugepage_flags & \ | 87 | (transparent_hugepage_flags & \ |
@@ -95,12 +99,14 @@ extern int handle_pte_fault(struct mm_struct *mm, | |||
95 | struct vm_area_struct *vma, unsigned long address, | 99 | struct vm_area_struct *vma, unsigned long address, |
96 | pte_t *pte, pmd_t *pmd, unsigned int flags); | 100 | pte_t *pte, pmd_t *pmd, unsigned int flags); |
97 | extern int split_huge_page(struct page *page); | 101 | extern int split_huge_page(struct page *page); |
98 | extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); | 102 | extern void __split_huge_page_pmd(struct vm_area_struct *vma, |
99 | #define split_huge_page_pmd(__mm, __pmd) \ | 103 | unsigned long address, pmd_t *pmd); |
104 | #define split_huge_page_pmd(__vma, __address, __pmd) \ | ||
100 | do { \ | 105 | do { \ |
101 | pmd_t *____pmd = (__pmd); \ | 106 | pmd_t *____pmd = (__pmd); \ |
102 | if (unlikely(pmd_trans_huge(*____pmd))) \ | 107 | if (unlikely(pmd_trans_huge(*____pmd))) \ |
103 | __split_huge_page_pmd(__mm, ____pmd); \ | 108 | __split_huge_page_pmd(__vma, __address, \ |
109 | ____pmd); \ | ||
104 | } while (0) | 110 | } while (0) |
105 | #define wait_split_huge_page(__anon_vma, __pmd) \ | 111 | #define wait_split_huge_page(__anon_vma, __pmd) \ |
106 | do { \ | 112 | do { \ |
@@ -110,6 +116,8 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); | |||
110 | BUG_ON(pmd_trans_splitting(*____pmd) || \ | 116 | BUG_ON(pmd_trans_splitting(*____pmd) || \ |
111 | pmd_trans_huge(*____pmd)); \ | 117 | pmd_trans_huge(*____pmd)); \ |
112 | } while (0) | 118 | } while (0) |
119 | extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, | ||
120 | pmd_t *pmd); | ||
113 | #if HPAGE_PMD_ORDER > MAX_ORDER | 121 | #if HPAGE_PMD_ORDER > MAX_ORDER |
114 | #error "hugepages can't be allocated by the buddy allocator" | 122 | #error "hugepages can't be allocated by the buddy allocator" |
115 | #endif | 123 | #endif |
@@ -177,10 +185,12 @@ static inline int split_huge_page(struct page *page) | |||
177 | { | 185 | { |
178 | return 0; | 186 | return 0; |
179 | } | 187 | } |
180 | #define split_huge_page_pmd(__mm, __pmd) \ | 188 | #define split_huge_page_pmd(__vma, __address, __pmd) \ |
181 | do { } while (0) | 189 | do { } while (0) |
182 | #define wait_split_huge_page(__anon_vma, __pmd) \ | 190 | #define wait_split_huge_page(__anon_vma, __pmd) \ |
183 | do { } while (0) | 191 | do { } while (0) |
192 | #define split_huge_page_pmd_mm(__mm, __address, __pmd) \ | ||
193 | do { } while (0) | ||
184 | #define compound_trans_head(page) compound_head(page) | 194 | #define compound_trans_head(page) compound_head(page) |
185 | static inline int hugepage_madvise(struct vm_area_struct *vma, | 195 | static inline int hugepage_madvise(struct vm_area_struct *vma, |
186 | unsigned long *vm_flags, int advice) | 196 | unsigned long *vm_flags, int advice) |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 11ddc7ffeba8..e98a74c0c9c0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -181,7 +181,14 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
181 | gfp_t gfp_mask, | 181 | gfp_t gfp_mask, |
182 | unsigned long *total_scanned); | 182 | unsigned long *total_scanned); |
183 | 183 | ||
184 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); | 184 | void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); |
185 | static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, | ||
186 | enum vm_event_item idx) | ||
187 | { | ||
188 | if (mem_cgroup_disabled()) | ||
189 | return; | ||
190 | __mem_cgroup_count_vm_event(mm, idx); | ||
191 | } | ||
185 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 192 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
186 | void mem_cgroup_split_huge_fixup(struct page *head); | 193 | void mem_cgroup_split_huge_fixup(struct page *head); |
187 | #endif | 194 | #endif |
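The hunk above makes mem_cgroup_count_vm_event() a static inline so the common mem_cgroup_disabled() case costs only an inlined test, and only active memcg setups reach the out-of-line __mem_cgroup_count_vm_event(). A typical call site looks like the existing fault accounting (illustrative fragment, not from this diff):

    /* Inside a major-fault path: bump the global counter and, if memcg is
     * enabled, the per-cgroup counter as well. */
    count_vm_event(PGMAJFAULT);
    mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);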
diff --git a/include/linux/memory.h b/include/linux/memory.h index a09216d0dcc7..45e93b468878 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h | |||
@@ -54,6 +54,7 @@ struct memory_notify { | |||
54 | unsigned long start_pfn; | 54 | unsigned long start_pfn; |
55 | unsigned long nr_pages; | 55 | unsigned long nr_pages; |
56 | int status_change_nid_normal; | 56 | int status_change_nid_normal; |
57 | int status_change_nid_high; | ||
57 | int status_change_nid; | 58 | int status_change_nid; |
58 | }; | 59 | }; |
59 | 60 | ||
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0c0b1d608a69..cd55dad56aac 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -460,17 +460,44 @@ struct zone { | |||
460 | unsigned long zone_start_pfn; | 460 | unsigned long zone_start_pfn; |
461 | 461 | ||
462 | /* | 462 | /* |
463 | * zone_start_pfn, spanned_pages and present_pages are all | 463 | * spanned_pages is the total pages spanned by the zone, including |
464 | * protected by span_seqlock. It is a seqlock because it has | 464 | * holes, which is calculated as: |
465 | * to be read outside of zone->lock, and it is done in the main | 465 | * spanned_pages = zone_end_pfn - zone_start_pfn; |
466 | * allocator path. But, it is written quite infrequently. | ||
467 | * | 466 | * |
468 | * The lock is declared along with zone->lock because it is | 467 | * present_pages is physical pages existing within the zone, which |
468 | * is calculated as: | ||
469 | * present_pages = spanned_pages - absent_pages(pags in holes); | ||
470 | * | ||
471 | * managed_pages is present pages managed by the buddy system, which | ||
472 | * is calculated as (reserved_pages includes pages allocated by the | ||
473 | * bootmem allocator): | ||
474 | * managed_pages = present_pages - reserved_pages; | ||
475 | * | ||
476 | * So present_pages may be used by memory hotplug or memory power | ||
477 | * management logic to figure out unmanaged pages by checking | ||
478 | * (present_pages - managed_pages). And managed_pages should be used | ||
479 | * by page allocator and vm scanner to calculate all kinds of watermarks | ||
480 | * and thresholds. | ||
481 | * | ||
482 | * Locking rules: | ||
483 | * | ||
484 | * zone_start_pfn and spanned_pages are protected by span_seqlock. | ||
485 | * It is a seqlock because it has to be read outside of zone->lock, | ||
486 | * and it is done in the main allocator path. But, it is written | ||
487 | * quite infrequently. | ||
488 | * | ||
489 | * The span_seq lock is declared along with zone->lock because it is | ||
469 | * frequently read in proximity to zone->lock. It's good to | 490 | * frequently read in proximity to zone->lock. It's good to |
470 | * give them a chance of being in the same cacheline. | 491 | * give them a chance of being in the same cacheline. |
492 | * | ||
493 | * Write access to present_pages and managed_pages at runtime should | ||
494 | * be protected by lock_memory_hotplug()/unlock_memory_hotplug(). | ||
495 | * Any reader who can't tolerant drift of present_pages and | ||
496 | * managed_pages should hold memory hotplug lock to get a stable value. | ||
471 | */ | 497 | */ |
472 | unsigned long spanned_pages; /* total size, including holes */ | 498 | unsigned long spanned_pages; |
473 | unsigned long present_pages; /* amount of memory (excluding holes) */ | 499 | unsigned long present_pages; |
500 | unsigned long managed_pages; | ||
474 | 501 | ||
475 | /* | 502 | /* |
476 | * rarely used fields: | 503 | * rarely used fields: |
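The relationships spelled out in the new comment, restated as a tiny helper; zone_unmanaged_pages() is made up for illustration and is not part of the patch:

    /*
     * spanned_pages = zone_end_pfn - zone_start_pfn      (includes holes)
     * present_pages = spanned_pages - absent_pages       (pages that exist)
     * managed_pages = present_pages - reserved_pages     (owned by buddy)
     */
    static inline unsigned long zone_unmanaged_pages(struct zone *zone)
    {
            /* e.g. bootmem/memblock reservations that never reach the buddy
             * allocator; hold lock_memory_hotplug() if a stable value is
             * needed at runtime. */
            return zone->present_pages - zone->managed_pages;
    }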
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 7afc36334d52..4e2cbfa640b7 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h | |||
@@ -380,6 +380,11 @@ enum node_states { | |||
380 | #else | 380 | #else |
381 | N_HIGH_MEMORY = N_NORMAL_MEMORY, | 381 | N_HIGH_MEMORY = N_NORMAL_MEMORY, |
382 | #endif | 382 | #endif |
383 | #ifdef CONFIG_MOVABLE_NODE | ||
384 | N_MEMORY, /* The node has memory(regular, high, movable) */ | ||
385 | #else | ||
386 | N_MEMORY = N_HIGH_MEMORY, | ||
387 | #endif | ||
383 | N_CPU, /* The node has one or more cpus */ | 388 | N_CPU, /* The node has one or more cpus */ |
384 | NR_NODE_STATES | 389 | NR_NODE_STATES |
385 | }; | 390 | }; |
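A sketch of the intended iteration over the new state; for_each_node_state() and NODE_DATA() are the existing helpers (the fs/proc/kcore.c hunk above uses the same pattern), the loop body is illustrative:

    int nid;

    /* Visit every node that has memory of any kind (regular, high or
     * movable), not just nodes with regular/high memory. */
    for_each_node_state(nid, N_MEMORY)
            pr_info("node %d spans %lu pages\n", nid,
                    NODE_DATA(nid)->node_spanned_pages);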
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 7d7fbe2ef782..6f54e40fa218 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h | |||
@@ -74,14 +74,9 @@ ssize_t res_counter_read(struct res_counter *counter, int member, | |||
74 | const char __user *buf, size_t nbytes, loff_t *pos, | 74 | const char __user *buf, size_t nbytes, loff_t *pos, |
75 | int (*read_strategy)(unsigned long long val, char *s)); | 75 | int (*read_strategy)(unsigned long long val, char *s)); |
76 | 76 | ||
77 | typedef int (*write_strategy_fn)(const char *buf, unsigned long long *val); | ||
78 | |||
79 | int res_counter_memparse_write_strategy(const char *buf, | 77 | int res_counter_memparse_write_strategy(const char *buf, |
80 | unsigned long long *res); | 78 | unsigned long long *res); |
81 | 79 | ||
82 | int res_counter_write(struct res_counter *counter, int member, | ||
83 | const char *buffer, write_strategy_fn write_strategy); | ||
84 | |||
85 | /* | 80 | /* |
86 | * the field descriptors. one for each member of res_counter | 81 | * the field descriptors. one for each member of res_counter |
87 | */ | 82 | */ |
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 3d3114594370..fe786f07d2bd 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h | |||
@@ -58,6 +58,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
58 | THP_COLLAPSE_ALLOC, | 58 | THP_COLLAPSE_ALLOC, |
59 | THP_COLLAPSE_ALLOC_FAILED, | 59 | THP_COLLAPSE_ALLOC_FAILED, |
60 | THP_SPLIT, | 60 | THP_SPLIT, |
61 | THP_ZERO_PAGE_ALLOC, | ||
62 | THP_ZERO_PAGE_ALLOC_FAILED, | ||
61 | #endif | 63 | #endif |
62 | NR_VM_EVENT_ITEMS | 64 | NR_VM_EVENT_ITEMS |
63 | }; | 65 | }; |
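These two events back the thp_zero_page_alloc and thp_zero_page_alloc_failed counters documented in the transhuge.txt hunk earlier in this diff. The huge zero page allocator itself is outside this excerpt; a hedged sketch of how the events would typically be accounted:

    /* Sketch only: allocate the shared huge zero page and account the
     * outcome; the real code also publishes the page and frees it again
     * if another CPU won the race. */
    static bool example_alloc_huge_zero_page(void)
    {
            struct page *zero_page;

            zero_page = alloc_pages(GFP_TRANSHUGE | __GFP_ZERO,
                                    HPAGE_PMD_ORDER);
            if (!zero_page) {
                    count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
                    return false;           /* fall back to small pages */
            }
            count_vm_event(THP_ZERO_PAGE_ALLOC);
            return true;
    }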
diff --git a/init/main.c b/init/main.c index e33e09df3cbc..63ae904a99a8 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -857,7 +857,7 @@ static void __init kernel_init_freeable(void) | |||
857 | /* | 857 | /* |
858 | * init can allocate pages on any node | 858 | * init can allocate pages on any node |
859 | */ | 859 | */ |
860 | set_mems_allowed(node_states[N_HIGH_MEMORY]); | 860 | set_mems_allowed(node_states[N_MEMORY]); |
861 | /* | 861 | /* |
862 | * init can run on any cpu. | 862 | * init can run on any cpu. |
863 | */ | 863 | */ |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b017887d632f..7bb63eea6eb8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -302,10 +302,10 @@ static void guarantee_online_cpus(const struct cpuset *cs, | |||
302 | * are online, with memory. If none are online with memory, walk | 302 | * are online, with memory. If none are online with memory, walk |
303 | * up the cpuset hierarchy until we find one that does have some | 303 | * up the cpuset hierarchy until we find one that does have some |
304 | * online mems. If we get all the way to the top and still haven't | 304 | * online mems. If we get all the way to the top and still haven't |
305 | * found any online mems, return node_states[N_HIGH_MEMORY]. | 305 | * found any online mems, return node_states[N_MEMORY]. |
306 | * | 306 | * |
307 | * One way or another, we guarantee to return some non-empty subset | 307 | * One way or another, we guarantee to return some non-empty subset |
308 | * of node_states[N_HIGH_MEMORY]. | 308 | * of node_states[N_MEMORY]. |
309 | * | 309 | * |
310 | * Call with callback_mutex held. | 310 | * Call with callback_mutex held. |
311 | */ | 311 | */ |
@@ -313,14 +313,14 @@ static void guarantee_online_cpus(const struct cpuset *cs, | |||
313 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 313 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
314 | { | 314 | { |
315 | while (cs && !nodes_intersects(cs->mems_allowed, | 315 | while (cs && !nodes_intersects(cs->mems_allowed, |
316 | node_states[N_HIGH_MEMORY])) | 316 | node_states[N_MEMORY])) |
317 | cs = cs->parent; | 317 | cs = cs->parent; |
318 | if (cs) | 318 | if (cs) |
319 | nodes_and(*pmask, cs->mems_allowed, | 319 | nodes_and(*pmask, cs->mems_allowed, |
320 | node_states[N_HIGH_MEMORY]); | 320 | node_states[N_MEMORY]); |
321 | else | 321 | else |
322 | *pmask = node_states[N_HIGH_MEMORY]; | 322 | *pmask = node_states[N_MEMORY]; |
323 | BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); | 323 | BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY])); |
324 | } | 324 | } |
325 | 325 | ||
326 | /* | 326 | /* |
@@ -1100,7 +1100,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1100 | return -ENOMEM; | 1100 | return -ENOMEM; |
1101 | 1101 | ||
1102 | /* | 1102 | /* |
1103 | * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; | 1103 | * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; |
1104 | * it's read-only | 1104 | * it's read-only |
1105 | */ | 1105 | */ |
1106 | if (cs == &top_cpuset) { | 1106 | if (cs == &top_cpuset) { |
@@ -1122,7 +1122,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1122 | goto done; | 1122 | goto done; |
1123 | 1123 | ||
1124 | if (!nodes_subset(trialcs->mems_allowed, | 1124 | if (!nodes_subset(trialcs->mems_allowed, |
1125 | node_states[N_HIGH_MEMORY])) { | 1125 | node_states[N_MEMORY])) { |
1126 | retval = -EINVAL; | 1126 | retval = -EINVAL; |
1127 | goto done; | 1127 | goto done; |
1128 | } | 1128 | } |
@@ -2026,7 +2026,7 @@ static struct cpuset *cpuset_next(struct list_head *queue) | |||
2026 | * before dropping down to the next. It always processes a node before | 2026 | * before dropping down to the next. It always processes a node before |
2027 | * any of its children. | 2027 | * any of its children. |
2028 | * | 2028 | * |
2029 | * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY | 2029 | * In the case of memory hot-unplug, it will remove nodes from N_MEMORY |
2030 | * if all present pages from a node are offlined. | 2030 | * if all present pages from a node are offlined. |
2031 | */ | 2031 | */ |
2032 | static void | 2032 | static void |
@@ -2065,7 +2065,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) | |||
2065 | 2065 | ||
2066 | /* Continue past cpusets with all mems online */ | 2066 | /* Continue past cpusets with all mems online */ |
2067 | if (nodes_subset(cp->mems_allowed, | 2067 | if (nodes_subset(cp->mems_allowed, |
2068 | node_states[N_HIGH_MEMORY])) | 2068 | node_states[N_MEMORY])) |
2069 | continue; | 2069 | continue; |
2070 | 2070 | ||
2071 | oldmems = cp->mems_allowed; | 2071 | oldmems = cp->mems_allowed; |
@@ -2073,7 +2073,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) | |||
2073 | /* Remove offline mems from this cpuset. */ | 2073 | /* Remove offline mems from this cpuset. */ |
2074 | mutex_lock(&callback_mutex); | 2074 | mutex_lock(&callback_mutex); |
2075 | nodes_and(cp->mems_allowed, cp->mems_allowed, | 2075 | nodes_and(cp->mems_allowed, cp->mems_allowed, |
2076 | node_states[N_HIGH_MEMORY]); | 2076 | node_states[N_MEMORY]); |
2077 | mutex_unlock(&callback_mutex); | 2077 | mutex_unlock(&callback_mutex); |
2078 | 2078 | ||
2079 | /* Move tasks from the empty cpuset to a parent */ | 2079 | /* Move tasks from the empty cpuset to a parent */ |
@@ -2126,8 +2126,8 @@ void cpuset_update_active_cpus(bool cpu_online) | |||
2126 | 2126 | ||
2127 | #ifdef CONFIG_MEMORY_HOTPLUG | 2127 | #ifdef CONFIG_MEMORY_HOTPLUG |
2128 | /* | 2128 | /* |
2129 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. | 2129 | * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. |
2130 | * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. | 2130 | * Call this routine anytime after node_states[N_MEMORY] changes. |
2131 | * See cpuset_update_active_cpus() for CPU hotplug handling. | 2131 | * See cpuset_update_active_cpus() for CPU hotplug handling. |
2132 | */ | 2132 | */ |
2133 | static int cpuset_track_online_nodes(struct notifier_block *self, | 2133 | static int cpuset_track_online_nodes(struct notifier_block *self, |
@@ -2140,7 +2140,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, | |||
2140 | case MEM_ONLINE: | 2140 | case MEM_ONLINE: |
2141 | oldmems = top_cpuset.mems_allowed; | 2141 | oldmems = top_cpuset.mems_allowed; |
2142 | mutex_lock(&callback_mutex); | 2142 | mutex_lock(&callback_mutex); |
2143 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2143 | top_cpuset.mems_allowed = node_states[N_MEMORY]; |
2144 | mutex_unlock(&callback_mutex); | 2144 | mutex_unlock(&callback_mutex); |
2145 | update_tasks_nodemask(&top_cpuset, &oldmems, NULL); | 2145 | update_tasks_nodemask(&top_cpuset, &oldmems, NULL); |
2146 | break; | 2146 | break; |
@@ -2169,7 +2169,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, | |||
2169 | void __init cpuset_init_smp(void) | 2169 | void __init cpuset_init_smp(void) |
2170 | { | 2170 | { |
2171 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2171 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); |
2172 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2172 | top_cpuset.mems_allowed = node_states[N_MEMORY]; |
2173 | 2173 | ||
2174 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); | 2174 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); |
2175 | 2175 | ||
@@ -2237,7 +2237,7 @@ void cpuset_init_current_mems_allowed(void) | |||
2237 | * | 2237 | * |
2238 | * Description: Returns the nodemask_t mems_allowed of the cpuset | 2238 | * Description: Returns the nodemask_t mems_allowed of the cpuset |
2239 | * attached to the specified @tsk. Guaranteed to return some non-empty | 2239 | * attached to the specified @tsk. Guaranteed to return some non-empty |
2240 | * subset of node_states[N_HIGH_MEMORY], even if this means going outside the | 2240 | * subset of node_states[N_MEMORY], even if this means going outside the |
2241 | * tasks cpuset. | 2241 | * tasks cpuset. |
2242 | **/ | 2242 | **/ |
2243 | 2243 | ||
diff --git a/kernel/kthread.c b/kernel/kthread.c index 29fb60caecb5..691dc2ef9baf 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -428,7 +428,7 @@ int kthreadd(void *unused) | |||
428 | set_task_comm(tsk, "kthreadd"); | 428 | set_task_comm(tsk, "kthreadd"); |
429 | ignore_signals(tsk); | 429 | ignore_signals(tsk); |
430 | set_cpus_allowed_ptr(tsk, cpu_all_mask); | 430 | set_cpus_allowed_ptr(tsk, cpu_all_mask); |
431 | set_mems_allowed(node_states[N_HIGH_MEMORY]); | 431 | set_mems_allowed(node_states[N_MEMORY]); |
432 | 432 | ||
433 | current->flags |= PF_NOFREEZE; | 433 | current->flags |= PF_NOFREEZE; |
434 | 434 | ||
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index ad581aa2369a..3920d593e63c 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -192,25 +192,3 @@ int res_counter_memparse_write_strategy(const char *buf, | |||
192 | *res = PAGE_ALIGN(*res); | 192 | *res = PAGE_ALIGN(*res); |
193 | return 0; | 193 | return 0; |
194 | } | 194 | } |
195 | |||
196 | int res_counter_write(struct res_counter *counter, int member, | ||
197 | const char *buf, write_strategy_fn write_strategy) | ||
198 | { | ||
199 | char *end; | ||
200 | unsigned long flags; | ||
201 | unsigned long long tmp, *val; | ||
202 | |||
203 | if (write_strategy) { | ||
204 | if (write_strategy(buf, &tmp)) | ||
205 | return -EINVAL; | ||
206 | } else { | ||
207 | tmp = simple_strtoull(buf, &end, 10); | ||
208 | if (*end != '\0') | ||
209 | return -EINVAL; | ||
210 | } | ||
211 | spin_lock_irqsave(&counter->lock, flags); | ||
212 | val = res_counter_member(counter, member); | ||
213 | *val = tmp; | ||
214 | spin_unlock_irqrestore(&counter->lock, flags); | ||
215 | return 0; | ||
216 | } | ||
diff --git a/mm/Kconfig b/mm/Kconfig index e6651c5de14f..71259e052ce8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -143,6 +143,14 @@ config NO_BOOTMEM | |||
143 | config MEMORY_ISOLATION | 143 | config MEMORY_ISOLATION |
144 | boolean | 144 | boolean |
145 | 145 | ||
146 | config MOVABLE_NODE | ||
147 | boolean "Enable to assign a node which has only movable memory" | ||
148 | depends on HAVE_MEMBLOCK | ||
149 | depends on NO_BOOTMEM | ||
150 | depends on X86_64 | ||
151 | depends on NUMA | ||
152 | depends on BROKEN | ||
153 | |||
146 | # eventually, we can have this option just 'select SPARSEMEM' | 154 | # eventually, we can have this option just 'select SPARSEMEM' |
147 | config MEMORY_HOTPLUG | 155 | config MEMORY_HOTPLUG |
148 | bool "Allow for memory hot-add" | 156 | bool "Allow for memory hot-add" |
diff --git a/mm/bootmem.c b/mm/bootmem.c index ecc45958ac0c..1324cd74faec 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -229,6 +229,22 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
229 | return count; | 229 | return count; |
230 | } | 230 | } |
231 | 231 | ||
232 | static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) | ||
233 | { | ||
234 | struct zone *z; | ||
235 | |||
236 | /* | ||
237 | * In free_area_init_core(), highmem zone's managed_pages is set to | ||
238 | * present_pages, and bootmem allocator doesn't allocate from highmem | ||
239 | * zones. So there's no need to recalculate managed_pages because all | ||
240 | * highmem pages will be managed by the buddy system. Here highmem | ||
241 | * zone also includes highmem movable zone. | ||
242 | */ | ||
243 | for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) | ||
244 | if (!is_highmem(z)) | ||
245 | z->managed_pages = 0; | ||
246 | } | ||
247 | |||
232 | /** | 248 | /** |
233 | * free_all_bootmem_node - release a node's free pages to the buddy allocator | 249 | * free_all_bootmem_node - release a node's free pages to the buddy allocator |
234 | * @pgdat: node to be released | 250 | * @pgdat: node to be released |
@@ -238,6 +254,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
238 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | 254 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) |
239 | { | 255 | { |
240 | register_page_bootmem_info_node(pgdat); | 256 | register_page_bootmem_info_node(pgdat); |
257 | reset_node_lowmem_managed_pages(pgdat); | ||
241 | return free_all_bootmem_core(pgdat->bdata); | 258 | return free_all_bootmem_core(pgdat->bdata); |
242 | } | 259 | } |
243 | 260 | ||
@@ -250,6 +267,10 @@ unsigned long __init free_all_bootmem(void) | |||
250 | { | 267 | { |
251 | unsigned long total_pages = 0; | 268 | unsigned long total_pages = 0; |
252 | bootmem_data_t *bdata; | 269 | bootmem_data_t *bdata; |
270 | struct pglist_data *pgdat; | ||
271 | |||
272 | for_each_online_pgdat(pgdat) | ||
273 | reset_node_lowmem_managed_pages(pgdat); | ||
253 | 274 | ||
254 | list_for_each_entry(bdata, &bdata_list, list) | 275 | list_for_each_entry(bdata, &bdata_list, list) |
255 | total_pages += free_all_bootmem_core(bdata); | 276 | total_pages += free_all_bootmem_core(bdata); |
@@ -439,12 +460,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, | |||
439 | return mark_bootmem(start, end, 1, flags); | 460 | return mark_bootmem(start, end, 1, flags); |
440 | } | 461 | } |
441 | 462 | ||
442 | int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | ||
443 | int flags) | ||
444 | { | ||
445 | return reserve_bootmem(phys, len, flags); | ||
446 | } | ||
447 | |||
448 | static unsigned long __init align_idx(struct bootmem_data *bdata, | 463 | static unsigned long __init align_idx(struct bootmem_data *bdata, |
449 | unsigned long idx, unsigned long step) | 464 | unsigned long idx, unsigned long step) |
450 | { | 465 | { |
@@ -575,27 +590,6 @@ find_block: | |||
575 | return NULL; | 590 | return NULL; |
576 | } | 591 | } |
577 | 592 | ||
578 | static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, | ||
579 | unsigned long size, unsigned long align, | ||
580 | unsigned long goal, unsigned long limit) | ||
581 | { | ||
582 | if (WARN_ON_ONCE(slab_is_available())) | ||
583 | return kzalloc(size, GFP_NOWAIT); | ||
584 | |||
585 | #ifdef CONFIG_HAVE_ARCH_BOOTMEM | ||
586 | { | ||
587 | bootmem_data_t *p_bdata; | ||
588 | |||
589 | p_bdata = bootmem_arch_preferred_node(bdata, size, align, | ||
590 | goal, limit); | ||
591 | if (p_bdata) | ||
592 | return alloc_bootmem_bdata(p_bdata, size, align, | ||
593 | goal, limit); | ||
594 | } | ||
595 | #endif | ||
596 | return NULL; | ||
597 | } | ||
598 | |||
599 | static void * __init alloc_bootmem_core(unsigned long size, | 593 | static void * __init alloc_bootmem_core(unsigned long size, |
600 | unsigned long align, | 594 | unsigned long align, |
601 | unsigned long goal, | 595 | unsigned long goal, |
@@ -604,9 +598,8 @@ static void * __init alloc_bootmem_core(unsigned long size, | |||
604 | bootmem_data_t *bdata; | 598 | bootmem_data_t *bdata; |
605 | void *region; | 599 | void *region; |
606 | 600 | ||
607 | region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); | 601 | if (WARN_ON_ONCE(slab_is_available())) |
608 | if (region) | 602 | return kzalloc(size, GFP_NOWAIT); |
609 | return region; | ||
610 | 603 | ||
611 | list_for_each_entry(bdata, &bdata_list, list) { | 604 | list_for_each_entry(bdata, &bdata_list, list) { |
612 | if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) | 605 | if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) |
@@ -704,11 +697,9 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, | |||
704 | { | 697 | { |
705 | void *ptr; | 698 | void *ptr; |
706 | 699 | ||
700 | if (WARN_ON_ONCE(slab_is_available())) | ||
701 | return kzalloc(size, GFP_NOWAIT); | ||
707 | again: | 702 | again: |
708 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, | ||
709 | align, goal, limit); | ||
710 | if (ptr) | ||
711 | return ptr; | ||
712 | 703 | ||
713 | /* do not panic in alloc_bootmem_bdata() */ | 704 | /* do not panic in alloc_bootmem_bdata() */ |
714 | if (limit && goal + size > limit) | 705 | if (limit && goal + size > limit) |
diff --git a/mm/compaction.c b/mm/compaction.c index d24dd2d7bad4..129791218226 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -215,60 +215,6 @@ static bool suitable_migration_target(struct page *page) | |||
215 | return false; | 215 | return false; |
216 | } | 216 | } |
217 | 217 | ||
218 | static void compact_capture_page(struct compact_control *cc) | ||
219 | { | ||
220 | unsigned long flags; | ||
221 | int mtype, mtype_low, mtype_high; | ||
222 | |||
223 | if (!cc->page || *cc->page) | ||
224 | return; | ||
225 | |||
226 | /* | ||
227 | * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP | ||
228 | * regardless of the migratetype of the freelist is is captured from. | ||
229 | * This is fine because the order for a high-order MIGRATE_MOVABLE | ||
230 | * allocation is typically at least a pageblock size and overall | ||
231 | * fragmentation is not impaired. Other allocation types must | ||
232 | * capture pages from their own migratelist because otherwise they | ||
233 | * could pollute other pageblocks like MIGRATE_MOVABLE with | ||
234 | * difficult to move pages and making fragmentation worse overall. | ||
235 | */ | ||
236 | if (cc->migratetype == MIGRATE_MOVABLE) { | ||
237 | mtype_low = 0; | ||
238 | mtype_high = MIGRATE_PCPTYPES; | ||
239 | } else { | ||
240 | mtype_low = cc->migratetype; | ||
241 | mtype_high = cc->migratetype + 1; | ||
242 | } | ||
243 | |||
244 | /* Speculatively examine the free lists without zone lock */ | ||
245 | for (mtype = mtype_low; mtype < mtype_high; mtype++) { | ||
246 | int order; | ||
247 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
248 | struct page *page; | ||
249 | struct free_area *area; | ||
250 | area = &(cc->zone->free_area[order]); | ||
251 | if (list_empty(&area->free_list[mtype])) | ||
252 | continue; | ||
253 | |||
254 | /* Take the lock and attempt capture of the page */ | ||
255 | if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) | ||
256 | return; | ||
257 | if (!list_empty(&area->free_list[mtype])) { | ||
258 | page = list_entry(area->free_list[mtype].next, | ||
259 | struct page, lru); | ||
260 | if (capture_free_page(page, cc->order, mtype)) { | ||
261 | spin_unlock_irqrestore(&cc->zone->lock, | ||
262 | flags); | ||
263 | *cc->page = page; | ||
264 | return; | ||
265 | } | ||
266 | } | ||
267 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
268 | } | ||
269 | } | ||
270 | } | ||
271 | |||
272 | /* | 218 | /* |
273 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. | 219 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. |
274 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free | 220 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free |
@@ -953,6 +899,60 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
953 | return COMPACT_CONTINUE; | 899 | return COMPACT_CONTINUE; |
954 | } | 900 | } |
955 | 901 | ||
902 | static void compact_capture_page(struct compact_control *cc) | ||
903 | { | ||
904 | unsigned long flags; | ||
905 | int mtype, mtype_low, mtype_high; | ||
906 | |||
907 | if (!cc->page || *cc->page) | ||
908 | return; | ||
909 | |||
910 | /* | ||
911 | * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP | ||
912 | * regardless of the migratetype of the freelist it is captured from. | ||
913 | * This is fine because the order for a high-order MIGRATE_MOVABLE | ||
914 | * allocation is typically at least a pageblock size and overall | ||
915 | * fragmentation is not impaired. Other allocation types must | ||
916 | * capture pages from their own migratelist because otherwise they | ||
917 | * could pollute other pageblocks like MIGRATE_MOVABLE with | ||
918 | * difficult to move pages and making fragmentation worse overall. | ||
919 | */ | ||
920 | if (cc->migratetype == MIGRATE_MOVABLE) { | ||
921 | mtype_low = 0; | ||
922 | mtype_high = MIGRATE_PCPTYPES; | ||
923 | } else { | ||
924 | mtype_low = cc->migratetype; | ||
925 | mtype_high = cc->migratetype + 1; | ||
926 | } | ||
927 | |||
928 | /* Speculatively examine the free lists without zone lock */ | ||
929 | for (mtype = mtype_low; mtype < mtype_high; mtype++) { | ||
930 | int order; | ||
931 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
932 | struct page *page; | ||
933 | struct free_area *area; | ||
934 | area = &(cc->zone->free_area[order]); | ||
935 | if (list_empty(&area->free_list[mtype])) | ||
936 | continue; | ||
937 | |||
938 | /* Take the lock and attempt capture of the page */ | ||
939 | if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) | ||
940 | return; | ||
941 | if (!list_empty(&area->free_list[mtype])) { | ||
942 | page = list_entry(area->free_list[mtype].next, | ||
943 | struct page, lru); | ||
944 | if (capture_free_page(page, cc->order, mtype)) { | ||
945 | spin_unlock_irqrestore(&cc->zone->lock, | ||
946 | flags); | ||
947 | *cc->page = page; | ||
948 | return; | ||
949 | } | ||
950 | } | ||
951 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
952 | } | ||
953 | } | ||
954 | } | ||
955 | |||
956 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 956 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
957 | { | 957 | { |
958 | int ret; | 958 | int ret; |
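
Note on the two compaction.c hunks: compact_capture_page() is only moved below compaction_suitable(); its body is unchanged. The shape of its inner loop is a speculative, lockless peek at a free list followed by a trylock and a re-check before anything is taken. A minimal userspace model of that pattern (the list type and helpers are illustrative stand-ins, not kernel APIs):

    #include <pthread.h>
    #include <stddef.h>

    struct capture_list {
            pthread_mutex_t lock;
            void **items;
            size_t nr;
    };

    /* Lockless peek, then trylock and re-check -- the shape of the loop in
     * compact_capture_page() above, with the kernel details stripped away. */
    void *try_capture(struct capture_list *cl)
    {
            void *item = NULL;

            if (cl->nr == 0)                        /* speculative check; may race */
                    return NULL;
            if (pthread_mutex_trylock(&cl->lock))   /* never block the caller */
                    return NULL;
            if (cl->nr > 0)                         /* re-check under the lock */
                    item = cl->items[--cl->nr];
            pthread_mutex_unlock(&cl->lock);
            return item;
    }

The unlocked check keeps the common empty case cheap, the trylock keeps compaction from stalling on the zone lock, and the re-check under the lock closes the race window.
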
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5f902e20e8c0..827d9c813051 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -12,12 +12,14 @@ | |||
12 | #include <linux/mmu_notifier.h> | 12 | #include <linux/mmu_notifier.h> |
13 | #include <linux/rmap.h> | 13 | #include <linux/rmap.h> |
14 | #include <linux/swap.h> | 14 | #include <linux/swap.h> |
15 | #include <linux/shrinker.h> | ||
15 | #include <linux/mm_inline.h> | 16 | #include <linux/mm_inline.h> |
16 | #include <linux/kthread.h> | 17 | #include <linux/kthread.h> |
17 | #include <linux/khugepaged.h> | 18 | #include <linux/khugepaged.h> |
18 | #include <linux/freezer.h> | 19 | #include <linux/freezer.h> |
19 | #include <linux/mman.h> | 20 | #include <linux/mman.h> |
20 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | |||
21 | #include <asm/tlb.h> | 23 | #include <asm/tlb.h> |
22 | #include <asm/pgalloc.h> | 24 | #include <asm/pgalloc.h> |
23 | #include "internal.h" | 25 | #include "internal.h" |
@@ -37,7 +39,8 @@ unsigned long transparent_hugepage_flags __read_mostly = | |||
37 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| | 39 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| |
38 | #endif | 40 | #endif |
39 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| | 41 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| |
40 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | 42 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| |
43 | (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | ||
41 | 44 | ||
42 | /* default scan 8*512 pte (or vmas) every 30 second */ | 45 | /* default scan 8*512 pte (or vmas) every 30 second */ |
43 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; | 46 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; |
@@ -159,6 +162,77 @@ static int start_khugepaged(void) | |||
159 | return err; | 162 | return err; |
160 | } | 163 | } |
161 | 164 | ||
165 | static atomic_t huge_zero_refcount; | ||
166 | static unsigned long huge_zero_pfn __read_mostly; | ||
167 | |||
168 | static inline bool is_huge_zero_pfn(unsigned long pfn) | ||
169 | { | ||
170 | unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); | ||
171 | return zero_pfn && pfn == zero_pfn; | ||
172 | } | ||
173 | |||
174 | static inline bool is_huge_zero_pmd(pmd_t pmd) | ||
175 | { | ||
176 | return is_huge_zero_pfn(pmd_pfn(pmd)); | ||
177 | } | ||
178 | |||
179 | static unsigned long get_huge_zero_page(void) | ||
180 | { | ||
181 | struct page *zero_page; | ||
182 | retry: | ||
183 | if (likely(atomic_inc_not_zero(&huge_zero_refcount))) | ||
184 | return ACCESS_ONCE(huge_zero_pfn); | ||
185 | |||
186 | zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, | ||
187 | HPAGE_PMD_ORDER); | ||
188 | if (!zero_page) { | ||
189 | count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); | ||
190 | return 0; | ||
191 | } | ||
192 | count_vm_event(THP_ZERO_PAGE_ALLOC); | ||
193 | preempt_disable(); | ||
194 | if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { | ||
195 | preempt_enable(); | ||
196 | __free_page(zero_page); | ||
197 | goto retry; | ||
198 | } | ||
199 | |||
200 | /* We take an additional reference here. It will be put back by the shrinker */ | ||
201 | atomic_set(&huge_zero_refcount, 2); | ||
202 | preempt_enable(); | ||
203 | return ACCESS_ONCE(huge_zero_pfn); | ||
204 | } | ||
205 | |||
206 | static void put_huge_zero_page(void) | ||
207 | { | ||
208 | /* | ||
209 | * Counter should never go to zero here. Only shrinker can put | ||
210 | * last reference. | ||
211 | */ | ||
212 | BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); | ||
213 | } | ||
214 | |||
215 | static int shrink_huge_zero_page(struct shrinker *shrink, | ||
216 | struct shrink_control *sc) | ||
217 | { | ||
218 | if (!sc->nr_to_scan) | ||
219 | /* we can free zero page only if last reference remains */ | ||
220 | return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; | ||
221 | |||
222 | if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { | ||
223 | unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); | ||
224 | BUG_ON(zero_pfn == 0); | ||
225 | __free_page(__pfn_to_page(zero_pfn)); | ||
226 | } | ||
227 | |||
228 | return 0; | ||
229 | } | ||
230 | |||
231 | static struct shrinker huge_zero_page_shrinker = { | ||
232 | .shrink = shrink_huge_zero_page, | ||
233 | .seeks = DEFAULT_SEEKS, | ||
234 | }; | ||
235 | |||
162 | #ifdef CONFIG_SYSFS | 236 | #ifdef CONFIG_SYSFS |
163 | 237 | ||
164 | static ssize_t double_flag_show(struct kobject *kobj, | 238 | static ssize_t double_flag_show(struct kobject *kobj, |
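
The block above introduces the lazily allocated huge zero page: the first get_huge_zero_page() caller allocates it and parks an extra cached reference, later callers just bump the refcount, and the registered shrinker may free the page only once that cached reference is the last one left. A compact userspace model of the same lifetime, using a mutex where the kernel relies on atomic_inc_not_zero() and cmpxchg() (names are ours, not kernel symbols):

    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t zp_lock = PTHREAD_MUTEX_INITIALIZER;
    static void *zp_page;           /* stands in for huge_zero_pfn */
    static long zp_refcount;        /* 0 == not allocated */

    void *zp_get(void)
    {
            pthread_mutex_lock(&zp_lock);
            if (!zp_refcount) {
                    zp_page = calloc(1, 2ul << 20); /* one zeroed "huge page" */
                    if (!zp_page) {
                            pthread_mutex_unlock(&zp_lock);
                            return NULL;
                    }
                    zp_refcount = 1;        /* cached reference, owned by the shrinker */
            }
            zp_refcount++;                  /* caller's reference */
            pthread_mutex_unlock(&zp_lock);
            return zp_page;
    }

    void zp_put(void)
    {
            pthread_mutex_lock(&zp_lock);
            zp_refcount--;                  /* never drops the cached reference */
            pthread_mutex_unlock(&zp_lock);
    }

    /* Shrinker side: free the page only when the cached reference is the last. */
    void zp_shrink(void)
    {
            pthread_mutex_lock(&zp_lock);
            if (zp_refcount == 1) {
                    free(zp_page);
                    zp_page = NULL;
                    zp_refcount = 0;
            }
            pthread_mutex_unlock(&zp_lock);
    }

zp_put() never frees anything, mirroring put_huge_zero_page(); only reclaim pressure, via zp_shrink(), retires the page.
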
@@ -284,6 +358,20 @@ static ssize_t defrag_store(struct kobject *kobj, | |||
284 | static struct kobj_attribute defrag_attr = | 358 | static struct kobj_attribute defrag_attr = |
285 | __ATTR(defrag, 0644, defrag_show, defrag_store); | 359 | __ATTR(defrag, 0644, defrag_show, defrag_store); |
286 | 360 | ||
361 | static ssize_t use_zero_page_show(struct kobject *kobj, | ||
362 | struct kobj_attribute *attr, char *buf) | ||
363 | { | ||
364 | return single_flag_show(kobj, attr, buf, | ||
365 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | ||
366 | } | ||
367 | static ssize_t use_zero_page_store(struct kobject *kobj, | ||
368 | struct kobj_attribute *attr, const char *buf, size_t count) | ||
369 | { | ||
370 | return single_flag_store(kobj, attr, buf, count, | ||
371 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | ||
372 | } | ||
373 | static struct kobj_attribute use_zero_page_attr = | ||
374 | __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); | ||
287 | #ifdef CONFIG_DEBUG_VM | 375 | #ifdef CONFIG_DEBUG_VM |
288 | static ssize_t debug_cow_show(struct kobject *kobj, | 376 | static ssize_t debug_cow_show(struct kobject *kobj, |
289 | struct kobj_attribute *attr, char *buf) | 377 | struct kobj_attribute *attr, char *buf) |
@@ -305,6 +393,7 @@ static struct kobj_attribute debug_cow_attr = | |||
305 | static struct attribute *hugepage_attr[] = { | 393 | static struct attribute *hugepage_attr[] = { |
306 | &enabled_attr.attr, | 394 | &enabled_attr.attr, |
307 | &defrag_attr.attr, | 395 | &defrag_attr.attr, |
396 | &use_zero_page_attr.attr, | ||
308 | #ifdef CONFIG_DEBUG_VM | 397 | #ifdef CONFIG_DEBUG_VM |
309 | &debug_cow_attr.attr, | 398 | &debug_cow_attr.attr, |
310 | #endif | 399 | #endif |
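
With use_zero_page_attr added to hugepage_attr[] above, the zero-page behaviour becomes a runtime toggle in the THP sysfs directory. A small sketch of flipping it from a program (assumes the usual /sys/kernel/mm/transparent_hugepage layout and sufficient privileges; writing "0" or "1" with echo does the same):

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            /* "1" enables the huge zero page for read faults, "0" disables it. */
            int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
                          O_WRONLY);
            if (fd < 0)
                    return 1;
            if (write(fd, "1", 1) != 1) {
                    close(fd);
                    return 1;
            }
            close(fd);
            return 0;
    }
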
@@ -550,6 +639,8 @@ static int __init hugepage_init(void) | |||
550 | goto out; | 639 | goto out; |
551 | } | 640 | } |
552 | 641 | ||
642 | register_shrinker(&huge_zero_page_shrinker); | ||
643 | |||
553 | /* | 644 | /* |
554 | * By default disable transparent hugepages on smaller systems, | 645 | * By default disable transparent hugepages on smaller systems, |
555 | * where the extra memory used could hurt more than TLB overhead | 646 | * where the extra memory used could hurt more than TLB overhead |
@@ -678,6 +769,22 @@ static inline struct page *alloc_hugepage(int defrag) | |||
678 | } | 769 | } |
679 | #endif | 770 | #endif |
680 | 771 | ||
772 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | ||
773 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | ||
774 | unsigned long zero_pfn) | ||
775 | { | ||
776 | pmd_t entry; | ||
777 | if (!pmd_none(*pmd)) | ||
778 | return false; | ||
779 | entry = pfn_pmd(zero_pfn, vma->vm_page_prot); | ||
780 | entry = pmd_wrprotect(entry); | ||
781 | entry = pmd_mkhuge(entry); | ||
782 | set_pmd_at(mm, haddr, pmd, entry); | ||
783 | pgtable_trans_huge_deposit(mm, pgtable); | ||
784 | mm->nr_ptes++; | ||
785 | return true; | ||
786 | } | ||
787 | |||
681 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 788 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, |
682 | unsigned long address, pmd_t *pmd, | 789 | unsigned long address, pmd_t *pmd, |
683 | unsigned int flags) | 790 | unsigned int flags) |
@@ -691,6 +798,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
691 | return VM_FAULT_OOM; | 798 | return VM_FAULT_OOM; |
692 | if (unlikely(khugepaged_enter(vma))) | 799 | if (unlikely(khugepaged_enter(vma))) |
693 | return VM_FAULT_OOM; | 800 | return VM_FAULT_OOM; |
801 | if (!(flags & FAULT_FLAG_WRITE) && | ||
802 | transparent_hugepage_use_zero_page()) { | ||
803 | pgtable_t pgtable; | ||
804 | unsigned long zero_pfn; | ||
805 | bool set; | ||
806 | pgtable = pte_alloc_one(mm, haddr); | ||
807 | if (unlikely(!pgtable)) | ||
808 | return VM_FAULT_OOM; | ||
809 | zero_pfn = get_huge_zero_page(); | ||
810 | if (unlikely(!zero_pfn)) { | ||
811 | pte_free(mm, pgtable); | ||
812 | count_vm_event(THP_FAULT_FALLBACK); | ||
813 | goto out; | ||
814 | } | ||
815 | spin_lock(&mm->page_table_lock); | ||
816 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, | ||
817 | zero_pfn); | ||
818 | spin_unlock(&mm->page_table_lock); | ||
819 | if (!set) { | ||
820 | pte_free(mm, pgtable); | ||
821 | put_huge_zero_page(); | ||
822 | } | ||
823 | return 0; | ||
824 | } | ||
694 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 825 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
695 | vma, haddr, numa_node_id(), 0); | 826 | vma, haddr, numa_node_id(), 0); |
696 | if (unlikely(!page)) { | 827 | if (unlikely(!page)) { |
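
The branch added above changes what a plain read of untouched anonymous THP memory costs: the fault is satisfied by the shared huge zero page, and a private huge page is only allocated once the task writes, via the copy-on-write paths further down. A rough userspace demonstration (error handling trimmed; for dependable THP backing the mapping would normally be 2MB-aligned):

    #include <stdio.h>
    #include <sys/mman.h>

    #define SZ (4UL << 20)  /* a couple of 2MB huge pages worth of anon memory */

    int main(void)
    {
            char *p = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED)
                    return 1;
            madvise(p, SZ, MADV_HUGEPAGE);

            /* Read fault: with use_zero_page enabled this should map the shared
             * huge zero page read-only instead of allocating a huge page. */
            volatile char c = p[0];
            (void)c;

            /* Write fault: the wp paths below replace the zero pmd with a freshly
             * cleared private huge page (or fall back to small pages). */
            p[0] = 1;

            printf("read then write done; compare AnonHugePages in smaps\n");
            munmap(p, SZ);
            return 0;
    }
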
@@ -755,6 +886,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
755 | pte_free(dst_mm, pgtable); | 886 | pte_free(dst_mm, pgtable); |
756 | goto out_unlock; | 887 | goto out_unlock; |
757 | } | 888 | } |
889 | /* | ||
890 | * mm->page_table_lock is enough to be sure that the huge zero pmd is not | ||
891 | * under splitting, since we don't split the page itself, only the pmd | ||
892 | * into a page table. | ||
893 | */ | ||
894 | if (is_huge_zero_pmd(pmd)) { | ||
895 | unsigned long zero_pfn; | ||
896 | bool set; | ||
897 | /* | ||
898 | * get_huge_zero_page() will never allocate a new page here, | ||
899 | * since we already have a zero page to copy. It just takes a | ||
900 | * reference. | ||
901 | */ | ||
902 | zero_pfn = get_huge_zero_page(); | ||
903 | set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, | ||
904 | zero_pfn); | ||
905 | BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ | ||
906 | ret = 0; | ||
907 | goto out_unlock; | ||
908 | } | ||
758 | if (unlikely(pmd_trans_splitting(pmd))) { | 909 | if (unlikely(pmd_trans_splitting(pmd))) { |
759 | /* split huge page running from under us */ | 910 | /* split huge page running from under us */ |
760 | spin_unlock(&src_mm->page_table_lock); | 911 | spin_unlock(&src_mm->page_table_lock); |
@@ -806,6 +957,80 @@ unlock: | |||
806 | spin_unlock(&mm->page_table_lock); | 957 | spin_unlock(&mm->page_table_lock); |
807 | } | 958 | } |
808 | 959 | ||
960 | static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, | ||
961 | struct vm_area_struct *vma, unsigned long address, | ||
962 | pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) | ||
963 | { | ||
964 | pgtable_t pgtable; | ||
965 | pmd_t _pmd; | ||
966 | struct page *page; | ||
967 | int i, ret = 0; | ||
968 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
969 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
970 | |||
971 | page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
972 | if (!page) { | ||
973 | ret |= VM_FAULT_OOM; | ||
974 | goto out; | ||
975 | } | ||
976 | |||
977 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { | ||
978 | put_page(page); | ||
979 | ret |= VM_FAULT_OOM; | ||
980 | goto out; | ||
981 | } | ||
982 | |||
983 | clear_user_highpage(page, address); | ||
984 | __SetPageUptodate(page); | ||
985 | |||
986 | mmun_start = haddr; | ||
987 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
988 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
989 | |||
990 | spin_lock(&mm->page_table_lock); | ||
991 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | ||
992 | goto out_free_page; | ||
993 | |||
994 | pmdp_clear_flush(vma, haddr, pmd); | ||
995 | /* leave pmd empty until pte is filled */ | ||
996 | |||
997 | pgtable = pgtable_trans_huge_withdraw(mm); | ||
998 | pmd_populate(mm, &_pmd, pgtable); | ||
999 | |||
1000 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | ||
1001 | pte_t *pte, entry; | ||
1002 | if (haddr == (address & PAGE_MASK)) { | ||
1003 | entry = mk_pte(page, vma->vm_page_prot); | ||
1004 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
1005 | page_add_new_anon_rmap(page, vma, haddr); | ||
1006 | } else { | ||
1007 | entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); | ||
1008 | entry = pte_mkspecial(entry); | ||
1009 | } | ||
1010 | pte = pte_offset_map(&_pmd, haddr); | ||
1011 | VM_BUG_ON(!pte_none(*pte)); | ||
1012 | set_pte_at(mm, haddr, pte, entry); | ||
1013 | pte_unmap(pte); | ||
1014 | } | ||
1015 | smp_wmb(); /* make pte visible before pmd */ | ||
1016 | pmd_populate(mm, pmd, pgtable); | ||
1017 | spin_unlock(&mm->page_table_lock); | ||
1018 | put_huge_zero_page(); | ||
1019 | inc_mm_counter(mm, MM_ANONPAGES); | ||
1020 | |||
1021 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1022 | |||
1023 | ret |= VM_FAULT_WRITE; | ||
1024 | out: | ||
1025 | return ret; | ||
1026 | out_free_page: | ||
1027 | spin_unlock(&mm->page_table_lock); | ||
1028 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1029 | mem_cgroup_uncharge_page(page); | ||
1030 | put_page(page); | ||
1031 | goto out; | ||
1032 | } | ||
1033 | |||
809 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | 1034 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, |
810 | struct vm_area_struct *vma, | 1035 | struct vm_area_struct *vma, |
811 | unsigned long address, | 1036 | unsigned long address, |
@@ -912,19 +1137,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
912 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) | 1137 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) |
913 | { | 1138 | { |
914 | int ret = 0; | 1139 | int ret = 0; |
915 | struct page *page, *new_page; | 1140 | struct page *page = NULL, *new_page; |
916 | unsigned long haddr; | 1141 | unsigned long haddr; |
917 | unsigned long mmun_start; /* For mmu_notifiers */ | 1142 | unsigned long mmun_start; /* For mmu_notifiers */ |
918 | unsigned long mmun_end; /* For mmu_notifiers */ | 1143 | unsigned long mmun_end; /* For mmu_notifiers */ |
919 | 1144 | ||
920 | VM_BUG_ON(!vma->anon_vma); | 1145 | VM_BUG_ON(!vma->anon_vma); |
1146 | haddr = address & HPAGE_PMD_MASK; | ||
1147 | if (is_huge_zero_pmd(orig_pmd)) | ||
1148 | goto alloc; | ||
921 | spin_lock(&mm->page_table_lock); | 1149 | spin_lock(&mm->page_table_lock); |
922 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 1150 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
923 | goto out_unlock; | 1151 | goto out_unlock; |
924 | 1152 | ||
925 | page = pmd_page(orig_pmd); | 1153 | page = pmd_page(orig_pmd); |
926 | VM_BUG_ON(!PageCompound(page) || !PageHead(page)); | 1154 | VM_BUG_ON(!PageCompound(page) || !PageHead(page)); |
927 | haddr = address & HPAGE_PMD_MASK; | ||
928 | if (page_mapcount(page) == 1) { | 1155 | if (page_mapcount(page) == 1) { |
929 | pmd_t entry; | 1156 | pmd_t entry; |
930 | entry = pmd_mkyoung(orig_pmd); | 1157 | entry = pmd_mkyoung(orig_pmd); |
@@ -936,7 +1163,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
936 | } | 1163 | } |
937 | get_page(page); | 1164 | get_page(page); |
938 | spin_unlock(&mm->page_table_lock); | 1165 | spin_unlock(&mm->page_table_lock); |
939 | 1166 | alloc: | |
940 | if (transparent_hugepage_enabled(vma) && | 1167 | if (transparent_hugepage_enabled(vma) && |
941 | !transparent_hugepage_debug_cow()) | 1168 | !transparent_hugepage_debug_cow()) |
942 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 1169 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
@@ -946,24 +1173,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
946 | 1173 | ||
947 | if (unlikely(!new_page)) { | 1174 | if (unlikely(!new_page)) { |
948 | count_vm_event(THP_FAULT_FALLBACK); | 1175 | count_vm_event(THP_FAULT_FALLBACK); |
949 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, | 1176 | if (is_huge_zero_pmd(orig_pmd)) { |
950 | pmd, orig_pmd, page, haddr); | 1177 | ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, |
951 | if (ret & VM_FAULT_OOM) | 1178 | address, pmd, orig_pmd, haddr); |
952 | split_huge_page(page); | 1179 | } else { |
953 | put_page(page); | 1180 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, |
1181 | pmd, orig_pmd, page, haddr); | ||
1182 | if (ret & VM_FAULT_OOM) | ||
1183 | split_huge_page(page); | ||
1184 | put_page(page); | ||
1185 | } | ||
954 | goto out; | 1186 | goto out; |
955 | } | 1187 | } |
956 | count_vm_event(THP_FAULT_ALLOC); | 1188 | count_vm_event(THP_FAULT_ALLOC); |
957 | 1189 | ||
958 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 1190 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { |
959 | put_page(new_page); | 1191 | put_page(new_page); |
960 | split_huge_page(page); | 1192 | if (page) { |
961 | put_page(page); | 1193 | split_huge_page(page); |
1194 | put_page(page); | ||
1195 | } | ||
962 | ret |= VM_FAULT_OOM; | 1196 | ret |= VM_FAULT_OOM; |
963 | goto out; | 1197 | goto out; |
964 | } | 1198 | } |
965 | 1199 | ||
966 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | 1200 | if (is_huge_zero_pmd(orig_pmd)) |
1201 | clear_huge_page(new_page, haddr, HPAGE_PMD_NR); | ||
1202 | else | ||
1203 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | ||
967 | __SetPageUptodate(new_page); | 1204 | __SetPageUptodate(new_page); |
968 | 1205 | ||
969 | mmun_start = haddr; | 1206 | mmun_start = haddr; |
@@ -971,7 +1208,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
971 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 1208 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
972 | 1209 | ||
973 | spin_lock(&mm->page_table_lock); | 1210 | spin_lock(&mm->page_table_lock); |
974 | put_page(page); | 1211 | if (page) |
1212 | put_page(page); | ||
975 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 1213 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
976 | spin_unlock(&mm->page_table_lock); | 1214 | spin_unlock(&mm->page_table_lock); |
977 | mem_cgroup_uncharge_page(new_page); | 1215 | mem_cgroup_uncharge_page(new_page); |
@@ -979,14 +1217,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
979 | goto out_mn; | 1217 | goto out_mn; |
980 | } else { | 1218 | } else { |
981 | pmd_t entry; | 1219 | pmd_t entry; |
982 | VM_BUG_ON(!PageHead(page)); | ||
983 | entry = mk_huge_pmd(new_page, vma); | 1220 | entry = mk_huge_pmd(new_page, vma); |
984 | pmdp_clear_flush(vma, haddr, pmd); | 1221 | pmdp_clear_flush(vma, haddr, pmd); |
985 | page_add_new_anon_rmap(new_page, vma, haddr); | 1222 | page_add_new_anon_rmap(new_page, vma, haddr); |
986 | set_pmd_at(mm, haddr, pmd, entry); | 1223 | set_pmd_at(mm, haddr, pmd, entry); |
987 | update_mmu_cache_pmd(vma, address, pmd); | 1224 | update_mmu_cache_pmd(vma, address, pmd); |
988 | page_remove_rmap(page); | 1225 | if (is_huge_zero_pmd(orig_pmd)) { |
989 | put_page(page); | 1226 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
1227 | put_huge_zero_page(); | ||
1228 | } else { | ||
1229 | VM_BUG_ON(!PageHead(page)); | ||
1230 | page_remove_rmap(page); | ||
1231 | put_page(page); | ||
1232 | } | ||
990 | ret |= VM_FAULT_WRITE; | 1233 | ret |= VM_FAULT_WRITE; |
991 | } | 1234 | } |
992 | spin_unlock(&mm->page_table_lock); | 1235 | spin_unlock(&mm->page_table_lock); |
@@ -1055,15 +1298,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1055 | pmd_t orig_pmd; | 1298 | pmd_t orig_pmd; |
1056 | pgtable = pgtable_trans_huge_withdraw(tlb->mm); | 1299 | pgtable = pgtable_trans_huge_withdraw(tlb->mm); |
1057 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); | 1300 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); |
1058 | page = pmd_page(orig_pmd); | ||
1059 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1301 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1060 | page_remove_rmap(page); | 1302 | if (is_huge_zero_pmd(orig_pmd)) { |
1061 | VM_BUG_ON(page_mapcount(page) < 0); | 1303 | tlb->mm->nr_ptes--; |
1062 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | 1304 | spin_unlock(&tlb->mm->page_table_lock); |
1063 | VM_BUG_ON(!PageHead(page)); | 1305 | put_huge_zero_page(); |
1064 | tlb->mm->nr_ptes--; | 1306 | } else { |
1065 | spin_unlock(&tlb->mm->page_table_lock); | 1307 | page = pmd_page(orig_pmd); |
1066 | tlb_remove_page(tlb, page); | 1308 | page_remove_rmap(page); |
1309 | VM_BUG_ON(page_mapcount(page) < 0); | ||
1310 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | ||
1311 | VM_BUG_ON(!PageHead(page)); | ||
1312 | tlb->mm->nr_ptes--; | ||
1313 | spin_unlock(&tlb->mm->page_table_lock); | ||
1314 | tlb_remove_page(tlb, page); | ||
1315 | } | ||
1067 | pte_free(tlb->mm, pgtable); | 1316 | pte_free(tlb->mm, pgtable); |
1068 | ret = 1; | 1317 | ret = 1; |
1069 | } | 1318 | } |
@@ -1135,6 +1384,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1135 | pmd_t entry; | 1384 | pmd_t entry; |
1136 | entry = pmdp_get_and_clear(mm, addr, pmd); | 1385 | entry = pmdp_get_and_clear(mm, addr, pmd); |
1137 | entry = pmd_modify(entry, newprot); | 1386 | entry = pmd_modify(entry, newprot); |
1387 | BUG_ON(pmd_write(entry)); | ||
1138 | set_pmd_at(mm, addr, pmd, entry); | 1388 | set_pmd_at(mm, addr, pmd, entry); |
1139 | spin_unlock(&vma->vm_mm->page_table_lock); | 1389 | spin_unlock(&vma->vm_mm->page_table_lock); |
1140 | ret = 1; | 1390 | ret = 1; |
@@ -1477,6 +1727,7 @@ int split_huge_page(struct page *page) | |||
1477 | struct anon_vma *anon_vma; | 1727 | struct anon_vma *anon_vma; |
1478 | int ret = 1; | 1728 | int ret = 1; |
1479 | 1729 | ||
1730 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); | ||
1480 | BUG_ON(!PageAnon(page)); | 1731 | BUG_ON(!PageAnon(page)); |
1481 | anon_vma = page_lock_anon_vma(page); | 1732 | anon_vma = page_lock_anon_vma(page); |
1482 | if (!anon_vma) | 1733 | if (!anon_vma) |
@@ -2336,19 +2587,65 @@ static int khugepaged(void *none) | |||
2336 | return 0; | 2587 | return 0; |
2337 | } | 2588 | } |
2338 | 2589 | ||
2339 | void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) | 2590 | static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, |
2591 | unsigned long haddr, pmd_t *pmd) | ||
2592 | { | ||
2593 | struct mm_struct *mm = vma->vm_mm; | ||
2594 | pgtable_t pgtable; | ||
2595 | pmd_t _pmd; | ||
2596 | int i; | ||
2597 | |||
2598 | pmdp_clear_flush(vma, haddr, pmd); | ||
2599 | /* leave pmd empty until pte is filled */ | ||
2600 | |||
2601 | pgtable = pgtable_trans_huge_withdraw(mm); | ||
2602 | pmd_populate(mm, &_pmd, pgtable); | ||
2603 | |||
2604 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | ||
2605 | pte_t *pte, entry; | ||
2606 | entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); | ||
2607 | entry = pte_mkspecial(entry); | ||
2608 | pte = pte_offset_map(&_pmd, haddr); | ||
2609 | VM_BUG_ON(!pte_none(*pte)); | ||
2610 | set_pte_at(mm, haddr, pte, entry); | ||
2611 | pte_unmap(pte); | ||
2612 | } | ||
2613 | smp_wmb(); /* make pte visible before pmd */ | ||
2614 | pmd_populate(mm, pmd, pgtable); | ||
2615 | put_huge_zero_page(); | ||
2616 | } | ||
2617 | |||
2618 | void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, | ||
2619 | pmd_t *pmd) | ||
2340 | { | 2620 | { |
2341 | struct page *page; | 2621 | struct page *page; |
2622 | struct mm_struct *mm = vma->vm_mm; | ||
2623 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
2624 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
2625 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
2626 | |||
2627 | BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); | ||
2342 | 2628 | ||
2629 | mmun_start = haddr; | ||
2630 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
2631 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
2343 | spin_lock(&mm->page_table_lock); | 2632 | spin_lock(&mm->page_table_lock); |
2344 | if (unlikely(!pmd_trans_huge(*pmd))) { | 2633 | if (unlikely(!pmd_trans_huge(*pmd))) { |
2345 | spin_unlock(&mm->page_table_lock); | 2634 | spin_unlock(&mm->page_table_lock); |
2635 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2636 | return; | ||
2637 | } | ||
2638 | if (is_huge_zero_pmd(*pmd)) { | ||
2639 | __split_huge_zero_page_pmd(vma, haddr, pmd); | ||
2640 | spin_unlock(&mm->page_table_lock); | ||
2641 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2346 | return; | 2642 | return; |
2347 | } | 2643 | } |
2348 | page = pmd_page(*pmd); | 2644 | page = pmd_page(*pmd); |
2349 | VM_BUG_ON(!page_count(page)); | 2645 | VM_BUG_ON(!page_count(page)); |
2350 | get_page(page); | 2646 | get_page(page); |
2351 | spin_unlock(&mm->page_table_lock); | 2647 | spin_unlock(&mm->page_table_lock); |
2648 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2352 | 2649 | ||
2353 | split_huge_page(page); | 2650 | split_huge_page(page); |
2354 | 2651 | ||
@@ -2356,6 +2653,16 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) | |||
2356 | BUG_ON(pmd_trans_huge(*pmd)); | 2653 | BUG_ON(pmd_trans_huge(*pmd)); |
2357 | } | 2654 | } |
2358 | 2655 | ||
2656 | void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, | ||
2657 | pmd_t *pmd) | ||
2658 | { | ||
2659 | struct vm_area_struct *vma; | ||
2660 | |||
2661 | vma = find_vma(mm, address); | ||
2662 | BUG_ON(vma == NULL); | ||
2663 | split_huge_page_pmd(vma, address, pmd); | ||
2664 | } | ||
2665 | |||
2359 | static void split_huge_page_address(struct mm_struct *mm, | 2666 | static void split_huge_page_address(struct mm_struct *mm, |
2360 | unsigned long address) | 2667 | unsigned long address) |
2361 | { | 2668 | { |
@@ -2370,7 +2677,7 @@ static void split_huge_page_address(struct mm_struct *mm, | |||
2370 | * Caller holds the mmap_sem write mode, so a huge pmd cannot | 2677 | * Caller holds the mmap_sem write mode, so a huge pmd cannot |
2371 | * materialize from under us. | 2678 | * materialize from under us. |
2372 | */ | 2679 | */ |
2373 | split_huge_page_pmd(mm, pmd); | 2680 | split_huge_page_pmd_mm(mm, address, pmd); |
2374 | } | 2681 | } |
2375 | 2682 | ||
2376 | void __vma_adjust_trans_huge(struct vm_area_struct *vma, | 2683 | void __vma_adjust_trans_huge(struct vm_area_struct *vma, |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 34f372ad89d0..88e7293b96bd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -1057,7 +1057,7 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
1057 | * on-line nodes with memory and will handle the hstate accounting. | 1057 | * on-line nodes with memory and will handle the hstate accounting. |
1058 | */ | 1058 | */ |
1059 | while (nr_pages--) { | 1059 | while (nr_pages--) { |
1060 | if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) | 1060 | if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) |
1061 | break; | 1061 | break; |
1062 | } | 1062 | } |
1063 | } | 1063 | } |
@@ -1180,14 +1180,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1180 | int __weak alloc_bootmem_huge_page(struct hstate *h) | 1180 | int __weak alloc_bootmem_huge_page(struct hstate *h) |
1181 | { | 1181 | { |
1182 | struct huge_bootmem_page *m; | 1182 | struct huge_bootmem_page *m; |
1183 | int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | 1183 | int nr_nodes = nodes_weight(node_states[N_MEMORY]); |
1184 | 1184 | ||
1185 | while (nr_nodes) { | 1185 | while (nr_nodes) { |
1186 | void *addr; | 1186 | void *addr; |
1187 | 1187 | ||
1188 | addr = __alloc_bootmem_node_nopanic( | 1188 | addr = __alloc_bootmem_node_nopanic( |
1189 | NODE_DATA(hstate_next_node_to_alloc(h, | 1189 | NODE_DATA(hstate_next_node_to_alloc(h, |
1190 | &node_states[N_HIGH_MEMORY])), | 1190 | &node_states[N_MEMORY])), |
1191 | huge_page_size(h), huge_page_size(h), 0); | 1191 | huge_page_size(h), huge_page_size(h), 0); |
1192 | 1192 | ||
1193 | if (addr) { | 1193 | if (addr) { |
@@ -1259,7 +1259,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | |||
1259 | if (!alloc_bootmem_huge_page(h)) | 1259 | if (!alloc_bootmem_huge_page(h)) |
1260 | break; | 1260 | break; |
1261 | } else if (!alloc_fresh_huge_page(h, | 1261 | } else if (!alloc_fresh_huge_page(h, |
1262 | &node_states[N_HIGH_MEMORY])) | 1262 | &node_states[N_MEMORY])) |
1263 | break; | 1263 | break; |
1264 | } | 1264 | } |
1265 | h->max_huge_pages = i; | 1265 | h->max_huge_pages = i; |
@@ -1527,7 +1527,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
1527 | if (!(obey_mempolicy && | 1527 | if (!(obey_mempolicy && |
1528 | init_nodemask_of_mempolicy(nodes_allowed))) { | 1528 | init_nodemask_of_mempolicy(nodes_allowed))) { |
1529 | NODEMASK_FREE(nodes_allowed); | 1529 | NODEMASK_FREE(nodes_allowed); |
1530 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | 1530 | nodes_allowed = &node_states[N_MEMORY]; |
1531 | } | 1531 | } |
1532 | } else if (nodes_allowed) { | 1532 | } else if (nodes_allowed) { |
1533 | /* | 1533 | /* |
@@ -1537,11 +1537,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
1537 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; | 1537 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; |
1538 | init_nodemask_of_node(nodes_allowed, nid); | 1538 | init_nodemask_of_node(nodes_allowed, nid); |
1539 | } else | 1539 | } else |
1540 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | 1540 | nodes_allowed = &node_states[N_MEMORY]; |
1541 | 1541 | ||
1542 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); | 1542 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); |
1543 | 1543 | ||
1544 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | 1544 | if (nodes_allowed != &node_states[N_MEMORY]) |
1545 | NODEMASK_FREE(nodes_allowed); | 1545 | NODEMASK_FREE(nodes_allowed); |
1546 | 1546 | ||
1547 | return len; | 1547 | return len; |
@@ -1844,7 +1844,7 @@ static void hugetlb_register_all_nodes(void) | |||
1844 | { | 1844 | { |
1845 | int nid; | 1845 | int nid; |
1846 | 1846 | ||
1847 | for_each_node_state(nid, N_HIGH_MEMORY) { | 1847 | for_each_node_state(nid, N_MEMORY) { |
1848 | struct node *node = node_devices[nid]; | 1848 | struct node *node = node_devices[nid]; |
1849 | if (node->dev.id == nid) | 1849 | if (node->dev.id == nid) |
1850 | hugetlb_register_node(node); | 1850 | hugetlb_register_node(node); |
@@ -1939,8 +1939,8 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1939 | for (i = 0; i < MAX_NUMNODES; ++i) | 1939 | for (i = 0; i < MAX_NUMNODES; ++i) |
1940 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1940 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
1941 | INIT_LIST_HEAD(&h->hugepage_activelist); | 1941 | INIT_LIST_HEAD(&h->hugepage_activelist); |
1942 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); | 1942 | h->next_nid_to_alloc = first_node(node_states[N_MEMORY]); |
1943 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); | 1943 | h->next_nid_to_free = first_node(node_states[N_MEMORY]); |
1944 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1944 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1945 | huge_page_size(h)/1024); | 1945 | huge_page_size(h)/1024); |
1946 | /* | 1946 | /* |
@@ -2035,11 +2035,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
2035 | if (!(obey_mempolicy && | 2035 | if (!(obey_mempolicy && |
2036 | init_nodemask_of_mempolicy(nodes_allowed))) { | 2036 | init_nodemask_of_mempolicy(nodes_allowed))) { |
2037 | NODEMASK_FREE(nodes_allowed); | 2037 | NODEMASK_FREE(nodes_allowed); |
2038 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | 2038 | nodes_allowed = &node_states[N_MEMORY]; |
2039 | } | 2039 | } |
2040 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); | 2040 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); |
2041 | 2041 | ||
2042 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | 2042 | if (nodes_allowed != &node_states[N_MEMORY]) |
2043 | NODEMASK_FREE(nodes_allowed); | 2043 | NODEMASK_FREE(nodes_allowed); |
2044 | } | 2044 | } |
2045 | out: | 2045 | out: |
@@ -2386,8 +2386,10 @@ again: | |||
2386 | /* | 2386 | /* |
2387 | * HWPoisoned hugepage is already unmapped and dropped reference | 2387 | * HWPoisoned hugepage is already unmapped and dropped reference |
2388 | */ | 2388 | */ |
2389 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) | 2389 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { |
2390 | pte_clear(mm, address, ptep); | ||
2390 | continue; | 2391 | continue; |
2392 | } | ||
2391 | 2393 | ||
2392 | page = pte_page(pte); | 2394 | page = pte_page(pte); |
2393 | /* | 2395 | /* |
@@ -3170,7 +3172,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) | |||
3170 | 3172 | ||
3171 | spin_lock(&hugetlb_lock); | 3173 | spin_lock(&hugetlb_lock); |
3172 | if (is_hugepage_on_freelist(hpage)) { | 3174 | if (is_hugepage_on_freelist(hpage)) { |
3173 | list_del(&hpage->lru); | 3175 | /* |
3176 | * Hwpoisoned hugepage isn't linked to activelist or freelist, | ||
3177 | * but dangling hpage->lru can trigger list-debug warnings | ||
3178 | * (this happens when we call unpoison_memory() on it), | ||
3179 | * so let it point to itself with list_del_init(). | ||
3180 | */ | ||
3181 | list_del_init(&hpage->lru); | ||
3174 | set_page_refcounted(hpage); | 3182 | set_page_refcounted(hpage); |
3175 | h->free_huge_pages--; | 3183 | h->free_huge_pages--; |
3176 | h->free_huge_pages_node[nid]--; | 3184 | h->free_huge_pages_node[nid]--; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 12307b3838fb..6c055929c8cc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -59,6 +59,8 @@ | |||
59 | #include <trace/events/vmscan.h> | 59 | #include <trace/events/vmscan.h> |
60 | 60 | ||
61 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 61 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
62 | EXPORT_SYMBOL(mem_cgroup_subsys); | ||
63 | |||
62 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 64 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
63 | static struct mem_cgroup *root_mem_cgroup __read_mostly; | 65 | static struct mem_cgroup *root_mem_cgroup __read_mostly; |
64 | 66 | ||
@@ -800,7 +802,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, | |||
800 | int nid; | 802 | int nid; |
801 | u64 total = 0; | 803 | u64 total = 0; |
802 | 804 | ||
803 | for_each_node_state(nid, N_HIGH_MEMORY) | 805 | for_each_node_state(nid, N_MEMORY) |
804 | total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); | 806 | total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); |
805 | return total; | 807 | return total; |
806 | } | 808 | } |
@@ -1015,13 +1017,10 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, | |||
1015 | iter != NULL; \ | 1017 | iter != NULL; \ |
1016 | iter = mem_cgroup_iter(NULL, iter, NULL)) | 1018 | iter = mem_cgroup_iter(NULL, iter, NULL)) |
1017 | 1019 | ||
1018 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | 1020 | void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) |
1019 | { | 1021 | { |
1020 | struct mem_cgroup *memcg; | 1022 | struct mem_cgroup *memcg; |
1021 | 1023 | ||
1022 | if (!mm) | ||
1023 | return; | ||
1024 | |||
1025 | rcu_read_lock(); | 1024 | rcu_read_lock(); |
1026 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1025 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
1027 | if (unlikely(!memcg)) | 1026 | if (unlikely(!memcg)) |
@@ -1040,7 +1039,7 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | |||
1040 | out: | 1039 | out: |
1041 | rcu_read_unlock(); | 1040 | rcu_read_unlock(); |
1042 | } | 1041 | } |
1043 | EXPORT_SYMBOL(mem_cgroup_count_vm_event); | 1042 | EXPORT_SYMBOL(__mem_cgroup_count_vm_event); |
1044 | 1043 | ||
1045 | /** | 1044 | /** |
1046 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg | 1045 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg |
@@ -1644,9 +1643,9 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) | |||
1644 | return; | 1643 | return; |
1645 | 1644 | ||
1646 | /* make a nodemask where this memcg uses memory from */ | 1645 | /* make a nodemask where this memcg uses memory from */ |
1647 | memcg->scan_nodes = node_states[N_HIGH_MEMORY]; | 1646 | memcg->scan_nodes = node_states[N_MEMORY]; |
1648 | 1647 | ||
1649 | for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { | 1648 | for_each_node_mask(nid, node_states[N_MEMORY]) { |
1650 | 1649 | ||
1651 | if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) | 1650 | if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) |
1652 | node_clear(nid, memcg->scan_nodes); | 1651 | node_clear(nid, memcg->scan_nodes); |
@@ -1717,7 +1716,7 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | |||
1717 | /* | 1716 | /* |
1718 | * Check rest of nodes. | 1717 | * Check rest of nodes. |
1719 | */ | 1718 | */ |
1720 | for_each_node_state(nid, N_HIGH_MEMORY) { | 1719 | for_each_node_state(nid, N_MEMORY) { |
1721 | if (node_isset(nid, memcg->scan_nodes)) | 1720 | if (node_isset(nid, memcg->scan_nodes)) |
1722 | continue; | 1721 | continue; |
1723 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | 1722 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) |
@@ -3776,7 +3775,7 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) | |||
3776 | lru_add_drain_all(); | 3775 | lru_add_drain_all(); |
3777 | drain_all_stock_sync(memcg); | 3776 | drain_all_stock_sync(memcg); |
3778 | mem_cgroup_start_move(memcg); | 3777 | mem_cgroup_start_move(memcg); |
3779 | for_each_node_state(node, N_HIGH_MEMORY) { | 3778 | for_each_node_state(node, N_MEMORY) { |
3780 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 3779 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
3781 | enum lru_list lru; | 3780 | enum lru_list lru; |
3782 | for_each_lru(lru) { | 3781 | for_each_lru(lru) { |
@@ -4122,7 +4121,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4122 | 4121 | ||
4123 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); | 4122 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); |
4124 | seq_printf(m, "total=%lu", total_nr); | 4123 | seq_printf(m, "total=%lu", total_nr); |
4125 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4124 | for_each_node_state(nid, N_MEMORY) { |
4126 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); | 4125 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); |
4127 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4126 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4128 | } | 4127 | } |
@@ -4130,7 +4129,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4130 | 4129 | ||
4131 | file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); | 4130 | file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); |
4132 | seq_printf(m, "file=%lu", file_nr); | 4131 | seq_printf(m, "file=%lu", file_nr); |
4133 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4132 | for_each_node_state(nid, N_MEMORY) { |
4134 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 4133 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4135 | LRU_ALL_FILE); | 4134 | LRU_ALL_FILE); |
4136 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4135 | seq_printf(m, " N%d=%lu", nid, node_nr); |
@@ -4139,7 +4138,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4139 | 4138 | ||
4140 | anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); | 4139 | anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); |
4141 | seq_printf(m, "anon=%lu", anon_nr); | 4140 | seq_printf(m, "anon=%lu", anon_nr); |
4142 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4141 | for_each_node_state(nid, N_MEMORY) { |
4143 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 4142 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4144 | LRU_ALL_ANON); | 4143 | LRU_ALL_ANON); |
4145 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4144 | seq_printf(m, " N%d=%lu", nid, node_nr); |
@@ -4148,7 +4147,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4148 | 4147 | ||
4149 | unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); | 4148 | unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); |
4150 | seq_printf(m, "unevictable=%lu", unevictable_nr); | 4149 | seq_printf(m, "unevictable=%lu", unevictable_nr); |
4151 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4150 | for_each_node_state(nid, N_MEMORY) { |
4152 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 4151 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4153 | BIT(LRU_UNEVICTABLE)); | 4152 | BIT(LRU_UNEVICTABLE)); |
4154 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4153 | seq_printf(m, " N%d=%lu", nid, node_nr); |
diff --git a/mm/memory.c b/mm/memory.c index 765377385632..db2e9e797a05 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -717,20 +717,6 @@ static inline bool is_cow_mapping(vm_flags_t flags) | |||
717 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 717 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
718 | } | 718 | } |
719 | 719 | ||
720 | #ifndef is_zero_pfn | ||
721 | static inline int is_zero_pfn(unsigned long pfn) | ||
722 | { | ||
723 | return pfn == zero_pfn; | ||
724 | } | ||
725 | #endif | ||
726 | |||
727 | #ifndef my_zero_pfn | ||
728 | static inline unsigned long my_zero_pfn(unsigned long addr) | ||
729 | { | ||
730 | return zero_pfn; | ||
731 | } | ||
732 | #endif | ||
733 | |||
734 | /* | 720 | /* |
735 | * vm_normal_page -- This function gets the "struct page" associated with a pte. | 721 | * vm_normal_page -- This function gets the "struct page" associated with a pte. |
736 | * | 722 | * |
@@ -1250,7 +1236,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1250 | BUG(); | 1236 | BUG(); |
1251 | } | 1237 | } |
1252 | #endif | 1238 | #endif |
1253 | split_huge_page_pmd(vma->vm_mm, pmd); | 1239 | split_huge_page_pmd(vma, addr, pmd); |
1254 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) | 1240 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) |
1255 | goto next; | 1241 | goto next; |
1256 | /* fall through */ | 1242 | /* fall through */ |
@@ -1519,7 +1505,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1519 | } | 1505 | } |
1520 | if (pmd_trans_huge(*pmd)) { | 1506 | if (pmd_trans_huge(*pmd)) { |
1521 | if (flags & FOLL_SPLIT) { | 1507 | if (flags & FOLL_SPLIT) { |
1522 | split_huge_page_pmd(mm, pmd); | 1508 | split_huge_page_pmd(vma, address, pmd); |
1523 | goto split_fallthrough; | 1509 | goto split_fallthrough; |
1524 | } | 1510 | } |
1525 | spin_lock(&mm->page_table_lock); | 1511 | spin_lock(&mm->page_table_lock); |
@@ -2794,13 +2780,8 @@ unlock: | |||
2794 | oom_free_new: | 2780 | oom_free_new: |
2795 | page_cache_release(new_page); | 2781 | page_cache_release(new_page); |
2796 | oom: | 2782 | oom: |
2797 | if (old_page) { | 2783 | if (old_page) |
2798 | if (page_mkwrite) { | ||
2799 | unlock_page(old_page); | ||
2800 | page_cache_release(old_page); | ||
2801 | } | ||
2802 | page_cache_release(old_page); | 2784 | page_cache_release(old_page); |
2803 | } | ||
2804 | return VM_FAULT_OOM; | 2785 | return VM_FAULT_OOM; |
2805 | 2786 | ||
2806 | unwritable_page: | 2787 | unwritable_page: |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c3e66ae411fd..518baa896e83 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page, | |||
106 | void __ref put_page_bootmem(struct page *page) | 106 | void __ref put_page_bootmem(struct page *page) |
107 | { | 107 | { |
108 | unsigned long type; | 108 | unsigned long type; |
109 | static DEFINE_MUTEX(ppb_lock); | ||
109 | 110 | ||
110 | type = (unsigned long) page->lru.next; | 111 | type = (unsigned long) page->lru.next; |
111 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || | 112 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || |
@@ -115,7 +116,14 @@ void __ref put_page_bootmem(struct page *page) | |||
115 | ClearPagePrivate(page); | 116 | ClearPagePrivate(page); |
116 | set_page_private(page, 0); | 117 | set_page_private(page, 0); |
117 | INIT_LIST_HEAD(&page->lru); | 118 | INIT_LIST_HEAD(&page->lru); |
119 | |||
120 | /* | ||
121 | * Please refer to comment for __free_pages_bootmem() | ||
122 | * for why we serialize here. | ||
123 | */ | ||
124 | mutex_lock(&ppb_lock); | ||
118 | __free_pages_bootmem(page, 0); | 125 | __free_pages_bootmem(page, 0); |
126 | mutex_unlock(&ppb_lock); | ||
119 | } | 127 | } |
120 | 128 | ||
121 | } | 129 | } |
@@ -581,11 +589,19 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | |||
581 | return 0; | 589 | return 0; |
582 | } | 590 | } |
583 | 591 | ||
592 | #ifdef CONFIG_MOVABLE_NODE | ||
593 | /* when CONFIG_MOVABLE_NODE is set, we allow onlining a node that has no normal memory */ | ||
594 | static bool can_online_high_movable(struct zone *zone) | ||
595 | { | ||
596 | return true; | ||
597 | } | ||
598 | #else /* #ifdef CONFIG_MOVABLE_NODE */ | ||
584 | /* ensure every online node has NORMAL memory */ | 599 | /* ensure every online node has NORMAL memory */ |
585 | static bool can_online_high_movable(struct zone *zone) | 600 | static bool can_online_high_movable(struct zone *zone) |
586 | { | 601 | { |
587 | return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); | 602 | return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); |
588 | } | 603 | } |
604 | #endif /* #ifdef CONFIG_MOVABLE_NODE */ | ||
589 | 605 | ||
590 | /* check which state of node_states will be changed when online memory */ | 606 | /* check which state of node_states will be changed when online memory */ |
591 | static void node_states_check_changes_online(unsigned long nr_pages, | 607 | static void node_states_check_changes_online(unsigned long nr_pages, |
@@ -595,13 +611,15 @@ static void node_states_check_changes_online(unsigned long nr_pages, | |||
595 | enum zone_type zone_last = ZONE_NORMAL; | 611 | enum zone_type zone_last = ZONE_NORMAL; |
596 | 612 | ||
597 | /* | 613 | /* |
598 | * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | 614 | * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] |
599 | * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. | 615 | * contains nodes which have zones of 0...ZONE_NORMAL, |
616 | * set zone_last to ZONE_NORMAL. | ||
600 | * | 617 | * |
601 | * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | 618 | * If we don't have HIGHMEM nor movable node, |
602 | * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | 619 | * node_states[N_NORMAL_MEMORY] contains nodes which have zones of |
620 | * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | ||
603 | */ | 621 | */ |
604 | if (N_HIGH_MEMORY == N_NORMAL_MEMORY) | 622 | if (N_MEMORY == N_NORMAL_MEMORY) |
605 | zone_last = ZONE_MOVABLE; | 623 | zone_last = ZONE_MOVABLE; |
606 | 624 | ||
607 | /* | 625 | /* |
@@ -615,12 +633,34 @@ static void node_states_check_changes_online(unsigned long nr_pages, | |||
615 | else | 633 | else |
616 | arg->status_change_nid_normal = -1; | 634 | arg->status_change_nid_normal = -1; |
617 | 635 | ||
636 | #ifdef CONFIG_HIGHMEM | ||
637 | /* | ||
638 | * If we have movable node, node_states[N_HIGH_MEMORY] | ||
639 | * contains nodes which have zones of 0...ZONE_HIGHMEM, | ||
640 | * set zone_last to ZONE_HIGHMEM. | ||
641 | * | ||
642 | * If we don't have movable node, node_states[N_NORMAL_MEMORY] | ||
643 | * contains nodes which have zones of 0...ZONE_MOVABLE, | ||
644 | * set zone_last to ZONE_MOVABLE. | ||
645 | */ | ||
646 | zone_last = ZONE_HIGHMEM; | ||
647 | if (N_MEMORY == N_HIGH_MEMORY) | ||
648 | zone_last = ZONE_MOVABLE; | ||
649 | |||
650 | if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) | ||
651 | arg->status_change_nid_high = nid; | ||
652 | else | ||
653 | arg->status_change_nid_high = -1; | ||
654 | #else | ||
655 | arg->status_change_nid_high = arg->status_change_nid_normal; | ||
656 | #endif | ||
657 | |||
618 | /* | 658 | /* |
619 | * if the node doesn't have memory before it is onlined, we will need to | 659 | * if the node doesn't have memory before it is onlined, we will need to |
620 | * set the node to node_states[N_HIGH_MEMORY] after the memory | 660 | * set the node to node_states[N_MEMORY] after the memory |
621 | * is online. | 661 | * is online. |
622 | */ | 662 | */ |
623 | if (!node_state(nid, N_HIGH_MEMORY)) | 663 | if (!node_state(nid, N_MEMORY)) |
624 | arg->status_change_nid = nid; | 664 | arg->status_change_nid = nid; |
625 | else | 665 | else |
626 | arg->status_change_nid = -1; | 666 | arg->status_change_nid = -1; |
@@ -631,7 +671,10 @@ static void node_states_set_node(int node, struct memory_notify *arg) | |||
631 | if (arg->status_change_nid_normal >= 0) | 671 | if (arg->status_change_nid_normal >= 0) |
632 | node_set_state(node, N_NORMAL_MEMORY); | 672 | node_set_state(node, N_NORMAL_MEMORY); |
633 | 673 | ||
634 | node_set_state(node, N_HIGH_MEMORY); | 674 | if (arg->status_change_nid_high >= 0) |
675 | node_set_state(node, N_HIGH_MEMORY); | ||
676 | |||
677 | node_set_state(node, N_MEMORY); | ||
635 | } | 678 | } |
636 | 679 | ||
637 | 680 | ||
@@ -713,6 +756,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
713 | return ret; | 756 | return ret; |
714 | } | 757 | } |
715 | 758 | ||
759 | zone->managed_pages += onlined_pages; | ||
716 | zone->present_pages += onlined_pages; | 760 | zone->present_pages += onlined_pages; |
717 | zone->zone_pgdat->node_present_pages += onlined_pages; | 761 | zone->zone_pgdat->node_present_pages += onlined_pages; |
718 | if (onlined_pages) { | 762 | if (onlined_pages) { |
@@ -1066,6 +1110,13 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
1066 | return offlined; | 1110 | return offlined; |
1067 | } | 1111 | } |
1068 | 1112 | ||
1113 | #ifdef CONFIG_MOVABLE_NODE | ||
1114 | /* when CONFIG_MOVABLE_NODE is set, we allow offlining all of a node's normal memory */ | ||
1115 | static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | ||
1116 | { | ||
1117 | return true; | ||
1118 | } | ||
1119 | #else /* #ifdef CONFIG_MOVABLE_NODE */ | ||
1069 | /* ensure the node has NORMAL memory if it is still online */ | 1120 | /* ensure the node has NORMAL memory if it is still online */ |
1070 | static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | 1121 | static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) |
1071 | { | 1122 | { |
@@ -1089,6 +1140,7 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | |||
1089 | */ | 1140 | */ |
1090 | return present_pages == 0; | 1141 | return present_pages == 0; |
1091 | } | 1142 | } |
1143 | #endif /* #ifdef CONFIG_MOVABLE_NODE */ | ||
1092 | 1144 | ||
1093 | /* check which state of node_states will be changed when offline memory */ | 1145 | /* check which state of node_states will be changed when offline memory */ |
1094 | static void node_states_check_changes_offline(unsigned long nr_pages, | 1146 | static void node_states_check_changes_offline(unsigned long nr_pages, |
@@ -1099,13 +1151,15 @@ static void node_states_check_changes_offline(unsigned long nr_pages, | |||
1099 | enum zone_type zt, zone_last = ZONE_NORMAL; | 1151 | enum zone_type zt, zone_last = ZONE_NORMAL; |
1100 | 1152 | ||
1101 | /* | 1153 | /* |
1102 | * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | 1154 | * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] |
1103 | * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. | 1155 | * contains nodes which have zones of 0...ZONE_NORMAL, |
1156 | * set zone_last to ZONE_NORMAL. | ||
1104 | * | 1157 | * |
1105 | * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | 1158 | * If we don't have HIGHMEM nor movable node, |
1106 | * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | 1159 | * node_states[N_NORMAL_MEMORY] contains nodes which have zones of |
1160 | * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | ||
1107 | */ | 1161 | */ |
1108 | if (N_HIGH_MEMORY == N_NORMAL_MEMORY) | 1162 | if (N_MEMORY == N_NORMAL_MEMORY) |
1109 | zone_last = ZONE_MOVABLE; | 1163 | zone_last = ZONE_MOVABLE; |
1110 | 1164 | ||
1111 | /* | 1165 | /* |
@@ -1122,6 +1176,30 @@ static void node_states_check_changes_offline(unsigned long nr_pages, | |||
1122 | else | 1176 | else |
1123 | arg->status_change_nid_normal = -1; | 1177 | arg->status_change_nid_normal = -1; |
1124 | 1178 | ||
1179 | #ifdef CONFIG_HIGHMEM | ||
1180 | /* | ||
1181 | * If we have movable node, node_states[N_HIGH_MEMORY] | ||
1182 | * contains nodes which have zones of 0...ZONE_HIGHMEM, | ||
1183 | * set zone_last to ZONE_HIGHMEM. | ||
1184 | * | ||
1185 | * If we don't have movable node, node_states[N_NORMAL_MEMORY] | ||
1186 | * contains nodes which have zones of 0...ZONE_MOVABLE, | ||
1187 | * set zone_last to ZONE_MOVABLE. | ||
1188 | */ | ||
1189 | zone_last = ZONE_HIGHMEM; | ||
1190 | if (N_MEMORY == N_HIGH_MEMORY) | ||
1191 | zone_last = ZONE_MOVABLE; | ||
1192 | |||
1193 | for (; zt <= zone_last; zt++) | ||
1194 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1195 | if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) | ||
1196 | arg->status_change_nid_high = zone_to_nid(zone); | ||
1197 | else | ||
1198 | arg->status_change_nid_high = -1; | ||
1199 | #else | ||
1200 | arg->status_change_nid_high = arg->status_change_nid_normal; | ||
1201 | #endif | ||
1202 | |||
1125 | /* | 1203 | /* |
1126 | * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE | 1204 | * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE |
1127 | */ | 1205 | */ |
@@ -1146,9 +1224,13 @@ static void node_states_clear_node(int node, struct memory_notify *arg) | |||
1146 | if (arg->status_change_nid_normal >= 0) | 1224 | if (arg->status_change_nid_normal >= 0) |
1147 | node_clear_state(node, N_NORMAL_MEMORY); | 1225 | node_clear_state(node, N_NORMAL_MEMORY); |
1148 | 1226 | ||
1149 | if ((N_HIGH_MEMORY != N_NORMAL_MEMORY) && | 1227 | if ((N_MEMORY != N_NORMAL_MEMORY) && |
1150 | (arg->status_change_nid >= 0)) | 1228 | (arg->status_change_nid_high >= 0)) |
1151 | node_clear_state(node, N_HIGH_MEMORY); | 1229 | node_clear_state(node, N_HIGH_MEMORY); |
1230 | |||
1231 | if ((N_MEMORY != N_HIGH_MEMORY) && | ||
1232 | (arg->status_change_nid >= 0)) | ||
1233 | node_clear_state(node, N_MEMORY); | ||
1152 | } | 1234 | } |
1153 | 1235 | ||
1154 | static int __ref __offline_pages(unsigned long start_pfn, | 1236 | static int __ref __offline_pages(unsigned long start_pfn, |
@@ -1248,6 +1330,7 @@ repeat: | |||
1248 | /* reset pagetype flags and makes migrate type to be MOVABLE */ | 1330 | /* reset pagetype flags and makes migrate type to be MOVABLE */ |
1249 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | 1331 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
1250 | /* removal success */ | 1332 | /* removal success */ |
1333 | zone->managed_pages -= offlined_pages; | ||
1251 | zone->present_pages -= offlined_pages; | 1334 | zone->present_pages -= offlined_pages; |
1252 | zone->zone_pgdat->node_present_pages -= offlined_pages; | 1335 | zone->zone_pgdat->node_present_pages -= offlined_pages; |
1253 | totalram_pages -= offlined_pages; | 1336 | totalram_pages -= offlined_pages; |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 05b28361a39b..aaf54566cb6b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -212,9 +212,9 @@ static int mpol_set_nodemask(struct mempolicy *pol, | |||
212 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ | 212 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ |
213 | if (pol == NULL) | 213 | if (pol == NULL) |
214 | return 0; | 214 | return 0; |
215 | /* Check N_HIGH_MEMORY */ | 215 | /* Check N_MEMORY */ |
216 | nodes_and(nsc->mask1, | 216 | nodes_and(nsc->mask1, |
217 | cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); | 217 | cpuset_current_mems_allowed, node_states[N_MEMORY]); |
218 | 218 | ||
219 | VM_BUG_ON(!nodes); | 219 | VM_BUG_ON(!nodes); |
220 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) | 220 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) |
@@ -511,7 +511,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
511 | pmd = pmd_offset(pud, addr); | 511 | pmd = pmd_offset(pud, addr); |
512 | do { | 512 | do { |
513 | next = pmd_addr_end(addr, end); | 513 | next = pmd_addr_end(addr, end); |
514 | split_huge_page_pmd(vma->vm_mm, pmd); | 514 | split_huge_page_pmd(vma, addr, pmd); |
515 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 515 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
516 | continue; | 516 | continue; |
517 | if (check_pte_range(vma, pmd, addr, next, nodes, | 517 | if (check_pte_range(vma, pmd, addr, next, nodes, |
@@ -1388,7 +1388,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1388 | goto out_put; | 1388 | goto out_put; |
1389 | } | 1389 | } |
1390 | 1390 | ||
1391 | if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { | 1391 | if (!nodes_subset(*new, node_states[N_MEMORY])) { |
1392 | err = -EINVAL; | 1392 | err = -EINVAL; |
1393 | goto out_put; | 1393 | goto out_put; |
1394 | } | 1394 | } |
@@ -2326,7 +2326,7 @@ void __init numa_policy_init(void) | |||
2326 | * fall back to the largest node if they're all smaller. | 2326 | * fall back to the largest node if they're all smaller. |
2327 | */ | 2327 | */ |
2328 | nodes_clear(interleave_nodes); | 2328 | nodes_clear(interleave_nodes); |
2329 | for_each_node_state(nid, N_HIGH_MEMORY) { | 2329 | for_each_node_state(nid, N_MEMORY) { |
2330 | unsigned long total_pages = node_present_pages(nid); | 2330 | unsigned long total_pages = node_present_pages(nid); |
2331 | 2331 | ||
2332 | /* Preserve the largest node */ | 2332 | /* Preserve the largest node */ |
@@ -2407,7 +2407,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2407 | *nodelist++ = '\0'; | 2407 | *nodelist++ = '\0'; |
2408 | if (nodelist_parse(nodelist, nodes)) | 2408 | if (nodelist_parse(nodelist, nodes)) |
2409 | goto out; | 2409 | goto out; |
2410 | if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) | 2410 | if (!nodes_subset(nodes, node_states[N_MEMORY])) |
2411 | goto out; | 2411 | goto out; |
2412 | } else | 2412 | } else |
2413 | nodes_clear(nodes); | 2413 | nodes_clear(nodes); |
@@ -2441,7 +2441,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2441 | * Default to online nodes with memory if no nodelist | 2441 | * Default to online nodes with memory if no nodelist |
2442 | */ | 2442 | */ |
2443 | if (!nodelist) | 2443 | if (!nodelist) |
2444 | nodes = node_states[N_HIGH_MEMORY]; | 2444 | nodes = node_states[N_MEMORY]; |
2445 | break; | 2445 | break; |
2446 | case MPOL_LOCAL: | 2446 | case MPOL_LOCAL: |
2447 | /* | 2447 | /* |
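
The mempolicy hunks above all enforce the same user-visible rule: a requested nodemask is intersected with, or must be a subset of, node_states[N_MEMORY], so nodes without memory are rejected with EINVAL. A rough userspace probe of that behaviour (illustrative only; assumes libnuma's <numaif.h> wrappers are installed, build with cc probe.c -lnuma):

/* Illustrative probe: does set_mempolicy() accept a given node id? */
#include <errno.h>
#include <numaif.h>     /* set_mempolicy(), MPOL_BIND, MPOL_DEFAULT */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char **argv)
{
    int node = argc > 1 ? atoi(argv[1]) : 0;    /* node id to test */
    unsigned long mask[16];                     /* room for 1024 node bits */

    if (node < 0 || node >= 1024) {
        fprintf(stderr, "node id out of range for this sketch\n");
        return 1;
    }
    memset(mask, 0, sizeof(mask));
    mask[node / (8 * sizeof(long))] |= 1UL << (node % (8 * sizeof(long)));

    /*
     * mpol_set_nodemask() intersects this mask with the nodes that have
     * memory (node_states[N_MEMORY] after this series), so a nonexistent
     * or memoryless node ends up rejected with EINVAL.
     */
    if (set_mempolicy(MPOL_BIND, mask, 8 * sizeof(mask)) != 0)
        perror("set_mempolicy");
    else
        printf("node %d accepted: it is in N_MEMORY\n", node);

    set_mempolicy(MPOL_DEFAULT, NULL, 0);       /* restore default policy */
    return 0;
}

The same subset check governs the migrate_pages() system call and the tmpfs mpol= mount option parsed by mpol_parse_str(), both visible in the hunks above.
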
diff --git a/mm/migrate.c b/mm/migrate.c index 3f675ca08279..cae02711181d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -1238,7 +1238,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, | |||
1238 | if (node < 0 || node >= MAX_NUMNODES) | 1238 | if (node < 0 || node >= MAX_NUMNODES) |
1239 | goto out_pm; | 1239 | goto out_pm; |
1240 | 1240 | ||
1241 | if (!node_state(node, N_HIGH_MEMORY)) | 1241 | if (!node_state(node, N_MEMORY)) |
1242 | goto out_pm; | 1242 | goto out_pm; |
1243 | 1243 | ||
1244 | err = -EACCES; | 1244 | err = -EACCES; |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -1488,7 +1488,11 @@ munmap_back: | |||
1488 | * | 1488 | * |
1489 | * Answer: Yes, several device drivers can do it in their | 1489 | * Answer: Yes, several device drivers can do it in their |
1490 | * f_op->mmap method. -DaveM | 1490 | * f_op->mmap method. -DaveM |
1491 | * Bug: If addr is changed, prev, rb_link, rb_parent should | ||
1492 | * be updated for vma_link() | ||
1491 | */ | 1493 | */ |
1494 | WARN_ON_ONCE(addr != vma->vm_start); | ||
1495 | |||
1492 | addr = vma->vm_start; | 1496 | addr = vma->vm_start; |
1493 | pgoff = vma->vm_pgoff; | 1497 | pgoff = vma->vm_pgoff; |
1494 | vm_flags = vma->vm_flags; | 1498 | vm_flags = vma->vm_flags; |
@@ -2065,6 +2069,18 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
2065 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { | 2069 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { |
2066 | error = acct_stack_growth(vma, size, grow); | 2070 | error = acct_stack_growth(vma, size, grow); |
2067 | if (!error) { | 2071 | if (!error) { |
2072 | /* | ||
2073 | * vma_gap_update() doesn't support concurrent | ||
2074 | * updates, but we only hold a shared mmap_sem | ||
2075 | * lock here, so we need to protect against | ||
2076 | * concurrent vma expansions. | ||
2077 | * vma_lock_anon_vma() doesn't help here, as | ||
2078 | * we don't guarantee that all growable vmas | ||
2079 | * in a mm share the same root anon vma. | ||
2080 | * So, we reuse mm->page_table_lock to guard | ||
2081 | * against concurrent vma expansions. | ||
2082 | */ | ||
2083 | spin_lock(&vma->vm_mm->page_table_lock); | ||
2068 | anon_vma_interval_tree_pre_update_vma(vma); | 2084 | anon_vma_interval_tree_pre_update_vma(vma); |
2069 | vma->vm_end = address; | 2085 | vma->vm_end = address; |
2070 | anon_vma_interval_tree_post_update_vma(vma); | 2086 | anon_vma_interval_tree_post_update_vma(vma); |
@@ -2072,6 +2088,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
2072 | vma_gap_update(vma->vm_next); | 2088 | vma_gap_update(vma->vm_next); |
2073 | else | 2089 | else |
2074 | vma->vm_mm->highest_vm_end = address; | 2090 | vma->vm_mm->highest_vm_end = address; |
2091 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
2092 | |||
2075 | perf_event_mmap(vma); | 2093 | perf_event_mmap(vma); |
2076 | } | 2094 | } |
2077 | } | 2095 | } |
@@ -2122,11 +2140,25 @@ int expand_downwards(struct vm_area_struct *vma, | |||
2122 | if (grow <= vma->vm_pgoff) { | 2140 | if (grow <= vma->vm_pgoff) { |
2123 | error = acct_stack_growth(vma, size, grow); | 2141 | error = acct_stack_growth(vma, size, grow); |
2124 | if (!error) { | 2142 | if (!error) { |
2143 | /* | ||
2144 | * vma_gap_update() doesn't support concurrent | ||
2145 | * updates, but we only hold a shared mmap_sem | ||
2146 | * lock here, so we need to protect against | ||
2147 | * concurrent vma expansions. | ||
2148 | * vma_lock_anon_vma() doesn't help here, as | ||
2149 | * we don't guarantee that all growable vmas | ||
2150 | * in a mm share the same root anon vma. | ||
2151 | * So, we reuse mm->page_table_lock to guard | ||
2152 | * against concurrent vma expansions. | ||
2153 | */ | ||
2154 | spin_lock(&vma->vm_mm->page_table_lock); | ||
2125 | anon_vma_interval_tree_pre_update_vma(vma); | 2155 | anon_vma_interval_tree_pre_update_vma(vma); |
2126 | vma->vm_start = address; | 2156 | vma->vm_start = address; |
2127 | vma->vm_pgoff -= grow; | 2157 | vma->vm_pgoff -= grow; |
2128 | anon_vma_interval_tree_post_update_vma(vma); | 2158 | anon_vma_interval_tree_post_update_vma(vma); |
2129 | vma_gap_update(vma); | 2159 | vma_gap_update(vma); |
2160 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
2161 | |||
2130 | perf_event_mmap(vma); | 2162 | perf_event_mmap(vma); |
2131 | } | 2163 | } |
2132 | } | 2164 | } |
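
The comment block added to both expand_upwards() and expand_downwards() carries the whole argument: mmap_sem is only held for reading during a fault, so two stacks in the same mm can grow concurrently, and the vma_gap_update() bookkeeping therefore needs its own writer lock, for which mm->page_table_lock is reused. A userspace analogy of that pattern (pthreads, purely illustrative, build with -pthread):

/*
 * Illustrative analogy only: a shared (read) lock admits concurrent
 * entry, so the metadata update inside still needs its own lock,
 * which is the same reason expand_*() takes mm->page_table_lock.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER; /* ~ mmap_sem */
static pthread_mutex_t meta_lock = PTHREAD_MUTEX_INITIALIZER;  /* ~ page_table_lock */
static long vm_end = 1000;   /* stands in for vma->vm_end / gap bookkeeping */

static void *grow(void *arg)
{
    (void)arg;
    for (int i = 0; i < 100000; i++) {
        pthread_rwlock_rdlock(&map_lock);   /* shared: many "faults" at once */
        pthread_mutex_lock(&meta_lock);     /* serialize the actual update */
        vm_end += 1;                        /* not atomic without meta_lock */
        pthread_mutex_unlock(&meta_lock);
        pthread_rwlock_unlock(&map_lock);
    }
    return NULL;
}

int main(void)
{
    pthread_t t[4];

    for (int i = 0; i < 4; i++)
        pthread_create(&t[i], NULL, grow, NULL);
    for (int i = 0; i < 4; i++)
        pthread_join(t[i], NULL);

    printf("vm_end = %ld (expected %d)\n", vm_end, 1000 + 4 * 100000);
    return 0;
}

Without meta_lock the increments race even though every thread holds the shared lock, which is exactly the situation the added spin_lock() guards against.
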
diff --git a/mm/mprotect.c b/mm/mprotect.c index a40992610ab6..e8c3938db6fa 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -90,7 +90,7 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
90 | next = pmd_addr_end(addr, end); | 90 | next = pmd_addr_end(addr, end); |
91 | if (pmd_trans_huge(*pmd)) { | 91 | if (pmd_trans_huge(*pmd)) { |
92 | if (next - addr != HPAGE_PMD_SIZE) | 92 | if (next - addr != HPAGE_PMD_SIZE) |
93 | split_huge_page_pmd(vma->vm_mm, pmd); | 93 | split_huge_page_pmd(vma, addr, pmd); |
94 | else if (change_huge_pmd(vma, pmd, addr, newprot)) | 94 | else if (change_huge_pmd(vma, pmd, addr, newprot)) |
95 | continue; | 95 | continue; |
96 | /* fall through */ | 96 | /* fall through */ |
diff --git a/mm/mremap.c b/mm/mremap.c index 1b61c2d3307a..eabb24da6c9e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -182,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
182 | need_flush = true; | 182 | need_flush = true; |
183 | continue; | 183 | continue; |
184 | } else if (!err) { | 184 | } else if (!err) { |
185 | split_huge_page_pmd(vma->vm_mm, old_pmd); | 185 | split_huge_page_pmd(vma, old_addr, old_pmd); |
186 | } | 186 | } |
187 | VM_BUG_ON(pmd_trans_huge(*old_pmd)); | 187 | VM_BUG_ON(pmd_trans_huge(*old_pmd)); |
188 | } | 188 | } |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bd82f6b31411..b8294fc03df8 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -137,6 +137,22 @@ unsigned long __init free_low_memory_core_early(int nodeid) | |||
137 | return count; | 137 | return count; |
138 | } | 138 | } |
139 | 139 | ||
140 | static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) | ||
141 | { | ||
142 | struct zone *z; | ||
143 | |||
144 | /* | ||
145 | * In free_area_init_core(), highmem zone's managed_pages is set to | ||
146 | * present_pages, and the bootmem allocator doesn't allocate from highmem | ||
147 | * zones. So there's no need to recalculate managed_pages because all | ||
148 | * highmem pages will be managed by the buddy system. Here the highmem | ||
149 | * zones also include the highmem movable zone. | ||
150 | */ | ||
151 | for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) | ||
152 | if (!is_highmem(z)) | ||
153 | z->managed_pages = 0; | ||
154 | } | ||
155 | |||
140 | /** | 156 | /** |
141 | * free_all_bootmem_node - release a node's free pages to the buddy allocator | 157 | * free_all_bootmem_node - release a node's free pages to the buddy allocator |
142 | * @pgdat: node to be released | 158 | * @pgdat: node to be released |
@@ -146,6 +162,7 @@ unsigned long __init free_low_memory_core_early(int nodeid) | |||
146 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | 162 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) |
147 | { | 163 | { |
148 | register_page_bootmem_info_node(pgdat); | 164 | register_page_bootmem_info_node(pgdat); |
165 | reset_node_lowmem_managed_pages(pgdat); | ||
149 | 166 | ||
150 | /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ | 167 | /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ |
151 | return 0; | 168 | return 0; |
@@ -158,6 +175,11 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | |||
158 | */ | 175 | */ |
159 | unsigned long __init free_all_bootmem(void) | 176 | unsigned long __init free_all_bootmem(void) |
160 | { | 177 | { |
178 | struct pglist_data *pgdat; | ||
179 | |||
180 | for_each_online_pgdat(pgdat) | ||
181 | reset_node_lowmem_managed_pages(pgdat); | ||
182 | |||
161 | /* | 183 | /* |
162 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | 184 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id |
163 | * because in some case like Node0 doesn't have RAM installed | 185 | * because in some case like Node0 doesn't have RAM installed |
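
reset_node_lowmem_managed_pages() and the managed_pages increment added to __free_pages_bootmem() (in the mm/page_alloc.c hunks below) form one accounting loop: lowmem zones restart from zero and re-accumulate their count as the bootmem allocator releases chunks to the buddy system, while highmem zones keep their early value, as the comment above explains. A toy model of that flow (the structure and every number here are made up for illustration):

/* Toy model of the managed_pages bookkeeping; all numbers are made up. */
#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
    const char *name;
    bool highmem;
    unsigned long present_pages;
    unsigned long managed_pages;
};

int main(void)
{
    struct toy_zone zones[] = {
        { "Normal",  false, 262144, 258000 },  /* estimate from early init */
        { "HighMem", true,  131072, 131072 },  /* highmem: managed == present */
    };
    unsigned long chunks = 64000;   /* pretend bootmem frees 64000 order-2 chunks */

    /* reset_node_lowmem_managed_pages(): lowmem restarts from zero */
    for (int i = 0; i < 2; i++)
        if (!zones[i].highmem)
            zones[i].managed_pages = 0;

    /* __free_pages_bootmem(): each chunk handed to the buddy is counted */
    for (unsigned long n = 0; n < chunks; n++)
        zones[0].managed_pages += 1 << 2;       /* order-2 chunk = 4 pages */

    for (int i = 0; i < 2; i++)
        printf("%-7s present=%lu managed=%lu\n", zones[i].name,
               zones[i].present_pages, zones[i].managed_pages);
    return 0;
}

The lowmem zone ends up with managed < present, the gap being pages the bootmem allocator never returned (memmap, early allocations), which is the quantity the new counter is meant to expose.
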
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 18f1ae2b45de..0399f146ae49 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -215,7 +215,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
215 | * the page allocator means a mempolicy is in effect. Cpuset policy | 215 | * the page allocator means a mempolicy is in effect. Cpuset policy |
216 | * is enforced in get_page_from_freelist(). | 216 | * is enforced in get_page_from_freelist(). |
217 | */ | 217 | */ |
218 | if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { | 218 | if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) { |
219 | *totalpages = total_swap_pages; | 219 | *totalpages = total_swap_pages; |
220 | for_each_node_mask(nid, *nodemask) | 220 | for_each_node_mask(nid, *nodemask) |
221 | *totalpages += node_spanned_pages(nid); | 221 | *totalpages += node_spanned_pages(nid); |
@@ -591,43 +591,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
591 | spin_unlock(&zone_scan_lock); | 591 | spin_unlock(&zone_scan_lock); |
592 | } | 592 | } |
593 | 593 | ||
594 | /* | ||
595 | * Try to acquire the oom killer lock for all system zones. Returns zero if a | ||
596 | * parallel oom killing is taking place, otherwise locks all zones and returns | ||
597 | * non-zero. | ||
598 | */ | ||
599 | static int try_set_system_oom(void) | ||
600 | { | ||
601 | struct zone *zone; | ||
602 | int ret = 1; | ||
603 | |||
604 | spin_lock(&zone_scan_lock); | ||
605 | for_each_populated_zone(zone) | ||
606 | if (zone_is_oom_locked(zone)) { | ||
607 | ret = 0; | ||
608 | goto out; | ||
609 | } | ||
610 | for_each_populated_zone(zone) | ||
611 | zone_set_flag(zone, ZONE_OOM_LOCKED); | ||
612 | out: | ||
613 | spin_unlock(&zone_scan_lock); | ||
614 | return ret; | ||
615 | } | ||
616 | |||
617 | /* | ||
618 | * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation | ||
619 | * attempts or page faults may now recall the oom killer, if necessary. | ||
620 | */ | ||
621 | static void clear_system_oom(void) | ||
622 | { | ||
623 | struct zone *zone; | ||
624 | |||
625 | spin_lock(&zone_scan_lock); | ||
626 | for_each_populated_zone(zone) | ||
627 | zone_clear_flag(zone, ZONE_OOM_LOCKED); | ||
628 | spin_unlock(&zone_scan_lock); | ||
629 | } | ||
630 | |||
631 | /** | 594 | /** |
632 | * out_of_memory - kill the "best" process when we run out of memory | 595 | * out_of_memory - kill the "best" process when we run out of memory |
633 | * @zonelist: zonelist pointer | 596 | * @zonelist: zonelist pointer |
@@ -708,15 +671,16 @@ out: | |||
708 | 671 | ||
709 | /* | 672 | /* |
710 | * The pagefault handler calls here because it is out of memory, so kill a | 673 | * The pagefault handler calls here because it is out of memory, so kill a |
711 | * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel | 674 | * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a |
712 | * oom killing is already in progress so do nothing. If a task is found with | 675 | * parallel oom killing is already in progress so do nothing. |
713 | * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit. | ||
714 | */ | 676 | */ |
715 | void pagefault_out_of_memory(void) | 677 | void pagefault_out_of_memory(void) |
716 | { | 678 | { |
717 | if (try_set_system_oom()) { | 679 | struct zonelist *zonelist = node_zonelist(first_online_node, |
680 | GFP_KERNEL); | ||
681 | |||
682 | if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { | ||
718 | out_of_memory(NULL, 0, 0, NULL, false); | 683 | out_of_memory(NULL, 0, 0, NULL, false); |
719 | clear_system_oom(); | 684 | clear_zonelist_oom(zonelist, GFP_KERNEL); |
720 | } | 685 | } |
721 | schedule_timeout_killable(1); | ||
722 | } | 686 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee3efa58c91..83637dfba110 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { | |||
90 | #ifdef CONFIG_HIGHMEM | 90 | #ifdef CONFIG_HIGHMEM |
91 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, | 91 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, |
92 | #endif | 92 | #endif |
93 | #ifdef CONFIG_MOVABLE_NODE | ||
94 | [N_MEMORY] = { { [0] = 1UL } }, | ||
95 | #endif | ||
93 | [N_CPU] = { { [0] = 1UL } }, | 96 | [N_CPU] = { { [0] = 1UL } }, |
94 | #endif /* NUMA */ | 97 | #endif /* NUMA */ |
95 | }; | 98 | }; |
@@ -732,6 +735,13 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
732 | local_irq_restore(flags); | 735 | local_irq_restore(flags); |
733 | } | 736 | } |
734 | 737 | ||
738 | /* | ||
739 | * Read access to zone->managed_pages is safe because it's unsigned long, | ||
740 | * but we still need to serialize writers. Currently all callers of | ||
741 | * __free_pages_bootmem() except put_page_bootmem() are used only | ||
742 | * at boot time. So, to keep boot time short, we shift the burden to | ||
743 | * put_page_bootmem() to serialize writers. | ||
744 | */ | ||
735 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | 745 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) |
736 | { | 746 | { |
737 | unsigned int nr_pages = 1 << order; | 747 | unsigned int nr_pages = 1 << order; |
@@ -747,6 +757,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | |||
747 | set_page_count(p, 0); | 757 | set_page_count(p, 0); |
748 | } | 758 | } |
749 | 759 | ||
760 | page_zone(page)->managed_pages += 1 << order; | ||
750 | set_page_refcounted(page); | 761 | set_page_refcounted(page); |
751 | __free_pages(page, order); | 762 | __free_pages(page, order); |
752 | } | 763 | } |
@@ -1695,7 +1706,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | |||
1695 | * | 1706 | * |
1696 | * If the zonelist cache is present in the passed in zonelist, then | 1707 | * If the zonelist cache is present in the passed in zonelist, then |
1697 | * returns a pointer to the allowed node mask (either the current | 1708 | * returns a pointer to the allowed node mask (either the current |
1698 | * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) | 1709 | * task's mems_allowed, or node_states[N_MEMORY].) |
1699 | * | 1710 | * |
1700 | * If the zonelist cache is not available for this zonelist, does | 1711 | * If the zonelist cache is not available for this zonelist, does |
1701 | * nothing and returns NULL. | 1712 | * nothing and returns NULL. |
@@ -1724,7 +1735,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | |||
1724 | 1735 | ||
1725 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | 1736 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? |
1726 | &cpuset_current_mems_allowed : | 1737 | &cpuset_current_mems_allowed : |
1727 | &node_states[N_HIGH_MEMORY]; | 1738 | &node_states[N_MEMORY]; |
1728 | return allowednodes; | 1739 | return allowednodes; |
1729 | } | 1740 | } |
1730 | 1741 | ||
@@ -2981,6 +2992,7 @@ void show_free_areas(unsigned int filter) | |||
2981 | " isolated(anon):%lukB" | 2992 | " isolated(anon):%lukB" |
2982 | " isolated(file):%lukB" | 2993 | " isolated(file):%lukB" |
2983 | " present:%lukB" | 2994 | " present:%lukB" |
2995 | " managed:%lukB" | ||
2984 | " mlocked:%lukB" | 2996 | " mlocked:%lukB" |
2985 | " dirty:%lukB" | 2997 | " dirty:%lukB" |
2986 | " writeback:%lukB" | 2998 | " writeback:%lukB" |
@@ -3010,6 +3022,7 @@ void show_free_areas(unsigned int filter) | |||
3010 | K(zone_page_state(zone, NR_ISOLATED_ANON)), | 3022 | K(zone_page_state(zone, NR_ISOLATED_ANON)), |
3011 | K(zone_page_state(zone, NR_ISOLATED_FILE)), | 3023 | K(zone_page_state(zone, NR_ISOLATED_FILE)), |
3012 | K(zone->present_pages), | 3024 | K(zone->present_pages), |
3025 | K(zone->managed_pages), | ||
3013 | K(zone_page_state(zone, NR_MLOCK)), | 3026 | K(zone_page_state(zone, NR_MLOCK)), |
3014 | K(zone_page_state(zone, NR_FILE_DIRTY)), | 3027 | K(zone_page_state(zone, NR_FILE_DIRTY)), |
3015 | K(zone_page_state(zone, NR_WRITEBACK)), | 3028 | K(zone_page_state(zone, NR_WRITEBACK)), |
@@ -3238,7 +3251,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
3238 | return node; | 3251 | return node; |
3239 | } | 3252 | } |
3240 | 3253 | ||
3241 | for_each_node_state(n, N_HIGH_MEMORY) { | 3254 | for_each_node_state(n, N_MEMORY) { |
3242 | 3255 | ||
3243 | /* Don't want a node to appear more than once */ | 3256 | /* Don't want a node to appear more than once */ |
3244 | if (node_isset(n, *used_node_mask)) | 3257 | if (node_isset(n, *used_node_mask)) |
@@ -3380,7 +3393,7 @@ static int default_zonelist_order(void) | |||
3380 | * local memory, NODE_ORDER may be suitable. | 3393 | * local memory, NODE_ORDER may be suitable. |
3381 | */ | 3394 | */ |
3382 | average_size = total_size / | 3395 | average_size = total_size / |
3383 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); | 3396 | (nodes_weight(node_states[N_MEMORY]) + 1); |
3384 | for_each_online_node(nid) { | 3397 | for_each_online_node(nid) { |
3385 | low_kmem_size = 0; | 3398 | low_kmem_size = 0; |
3386 | total_size = 0; | 3399 | total_size = 0; |
@@ -4476,6 +4489,26 @@ void __init set_pageblock_order(void) | |||
4476 | 4489 | ||
4477 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | 4490 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ |
4478 | 4491 | ||
4492 | static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, | ||
4493 | unsigned long present_pages) | ||
4494 | { | ||
4495 | unsigned long pages = spanned_pages; | ||
4496 | |||
4497 | /* | ||
4498 | * Provide a more accurate estimation if there are holes within | ||
4499 | * the zone and SPARSEMEM is in use. If there are holes within the | ||
4500 | * zone, each populated memory region may cost us one or two extra | ||
4501 | * memmap pages due to alignment, because the memmap pages for each | ||
4502 | * populated region may not be naturally aligned on a page boundary. | ||
4503 | * So the (present_pages >> 4) heuristic is a tradeoff for that. | ||
4504 | */ | ||
4505 | if (spanned_pages > present_pages + (present_pages >> 4) && | ||
4506 | IS_ENABLED(CONFIG_SPARSEMEM)) | ||
4507 | pages = present_pages; | ||
4508 | |||
4509 | return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; | ||
4510 | } | ||
4511 | |||
4479 | /* | 4512 | /* |
4480 | * Set up the zone data structures: | 4513 | * Set up the zone data structures: |
4481 | * - mark all pages reserved | 4514 | * - mark all pages reserved |
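
calc_memmap_size() only refines an estimate: the memmap is sized from spanned_pages unless SPARSEMEM is enabled and the holes exceed present_pages/16, in which case present_pages is close enough. The same arithmetic as a standalone program, with an assumed 4 KiB page and a hypothetical 64-byte struct page (the real size depends on the kernel configuration):

/*
 * Userspace restatement of the calc_memmap_size() arithmetic above.
 * PAGE_SIZE and the struct page size are illustrative assumptions.
 */
#include <stdio.h>

#define PAGE_SHIFT        12
#define PAGE_SIZE         (1UL << PAGE_SHIFT)
#define STRUCT_PAGE_SIZE  64UL   /* assumed; varies with kernel config */
#define PAGE_ALIGN(x)     (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

static unsigned long calc_memmap_size(unsigned long spanned_pages,
                                      unsigned long present_pages,
                                      int sparsemem)
{
    unsigned long pages = spanned_pages;

    /* Large holes + SPARSEMEM: memmap is only allocated for present memory. */
    if (spanned_pages > present_pages + (present_pages >> 4) && sparsemem)
        pages = present_pages;

    return PAGE_ALIGN(pages * STRUCT_PAGE_SIZE) >> PAGE_SHIFT;
}

int main(void)
{
    /* A zone spanning 4 GiB of PFNs with 1 GiB of holes (page counts). */
    unsigned long spanned = 1048576, present = 786432;

    printf("memmap pages, flatmem estimate : %lu\n",
           calc_memmap_size(spanned, present, 0));
    printf("memmap pages, sparsemem        : %lu\n",
           calc_memmap_size(spanned, present, 1));
    return 0;
}

For this zone the flat estimate is 16384 pages and the SPARSEMEM figure is 12288 pages, i.e. 64 MiB versus 48 MiB of memmap charged against the zone.
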
@@ -4499,48 +4532,56 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4499 | 4532 | ||
4500 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4533 | for (j = 0; j < MAX_NR_ZONES; j++) { |
4501 | struct zone *zone = pgdat->node_zones + j; | 4534 | struct zone *zone = pgdat->node_zones + j; |
4502 | unsigned long size, realsize, memmap_pages; | 4535 | unsigned long size, realsize, freesize, memmap_pages; |
4503 | 4536 | ||
4504 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 4537 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
4505 | realsize = size - zone_absent_pages_in_node(nid, j, | 4538 | realsize = freesize = size - zone_absent_pages_in_node(nid, j, |
4506 | zholes_size); | 4539 | zholes_size); |
4507 | 4540 | ||
4508 | /* | 4541 | /* |
4509 | * Adjust realsize so that it accounts for how much memory | 4542 | * Adjust freesize so that it accounts for how much memory |
4510 | * is used by this zone for memmap. This affects the watermark | 4543 | * is used by this zone for memmap. This affects the watermark |
4511 | * and per-cpu initialisations | 4544 | * and per-cpu initialisations |
4512 | */ | 4545 | */ |
4513 | memmap_pages = | 4546 | memmap_pages = calc_memmap_size(size, realsize); |
4514 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; | 4547 | if (freesize >= memmap_pages) { |
4515 | if (realsize >= memmap_pages) { | 4548 | freesize -= memmap_pages; |
4516 | realsize -= memmap_pages; | ||
4517 | if (memmap_pages) | 4549 | if (memmap_pages) |
4518 | printk(KERN_DEBUG | 4550 | printk(KERN_DEBUG |
4519 | " %s zone: %lu pages used for memmap\n", | 4551 | " %s zone: %lu pages used for memmap\n", |
4520 | zone_names[j], memmap_pages); | 4552 | zone_names[j], memmap_pages); |
4521 | } else | 4553 | } else |
4522 | printk(KERN_WARNING | 4554 | printk(KERN_WARNING |
4523 | " %s zone: %lu pages exceeds realsize %lu\n", | 4555 | " %s zone: %lu pages exceeds freesize %lu\n", |
4524 | zone_names[j], memmap_pages, realsize); | 4556 | zone_names[j], memmap_pages, freesize); |
4525 | 4557 | ||
4526 | /* Account for reserved pages */ | 4558 | /* Account for reserved pages */ |
4527 | if (j == 0 && realsize > dma_reserve) { | 4559 | if (j == 0 && freesize > dma_reserve) { |
4528 | realsize -= dma_reserve; | 4560 | freesize -= dma_reserve; |
4529 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", | 4561 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", |
4530 | zone_names[0], dma_reserve); | 4562 | zone_names[0], dma_reserve); |
4531 | } | 4563 | } |
4532 | 4564 | ||
4533 | if (!is_highmem_idx(j)) | 4565 | if (!is_highmem_idx(j)) |
4534 | nr_kernel_pages += realsize; | 4566 | nr_kernel_pages += freesize; |
4535 | nr_all_pages += realsize; | 4567 | /* Charge for highmem memmap if there are enough kernel pages */ |
4568 | else if (nr_kernel_pages > memmap_pages * 2) | ||
4569 | nr_kernel_pages -= memmap_pages; | ||
4570 | nr_all_pages += freesize; | ||
4536 | 4571 | ||
4537 | zone->spanned_pages = size; | 4572 | zone->spanned_pages = size; |
4538 | zone->present_pages = realsize; | 4573 | zone->present_pages = freesize; |
4574 | /* | ||
4575 | * Set an approximate value for lowmem here; it will be adjusted | ||
4576 | * when the bootmem allocator frees pages into the buddy system. | ||
4577 | * And all highmem pages will be managed by the buddy system. | ||
4578 | */ | ||
4579 | zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; | ||
4539 | #ifdef CONFIG_NUMA | 4580 | #ifdef CONFIG_NUMA |
4540 | zone->node = nid; | 4581 | zone->node = nid; |
4541 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4582 | zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) |
4542 | / 100; | 4583 | / 100; |
4543 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; | 4584 | zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; |
4544 | #endif | 4585 | #endif |
4545 | zone->name = zone_names[j]; | 4586 | zone->name = zone_names[j]; |
4546 | spin_lock_init(&zone->lock); | 4587 | spin_lock_init(&zone->lock); |
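
In the hunk above, realsize and freesize both start as the zone's non-hole pages; freesize is then charged for the memmap and the DMA reserve and, as before, ends up in zone->present_pages, while the new zone->managed_pages gets freesize for lowmem and realsize for highmem (whose memmap is charged to lowmem instead). A numeric walkthrough with hypothetical values, reusing the memmap figure from the previous example:

/* Hypothetical numbers walking through the freesize bookkeeping above. */
#include <stdio.h>

int main(void)
{
    unsigned long size = 1048576;        /* spanned pages (4 GiB at 4 KiB pages) */
    unsigned long realsize = 786432;     /* size minus absent (hole) pages */
    unsigned long memmap_pages = 12288;  /* from the calc_memmap_size() example */
    unsigned long dma_reserve = 4096;    /* assumed; only charged to zone 0 */
    int is_highmem = 0;                  /* treat this as a lowmem zone */
    int is_first_zone = 1;               /* j == 0 in free_area_init_core() */
    unsigned long freesize = realsize;

    if (freesize >= memmap_pages)
        freesize -= memmap_pages;                 /* charge the zone's memmap */
    if (is_first_zone && freesize > dma_reserve)
        freesize -= dma_reserve;                  /* charge reserved low pages */

    /* Mirrors the assignments in the hunk above. */
    unsigned long spanned_pages = size;
    unsigned long present_pages = freesize;
    unsigned long managed_pages = is_highmem ? realsize : freesize;

    printf("realsize=%lu freesize=%lu\n", realsize, freesize);
    printf("spanned=%lu present=%lu managed=%lu\n",
           spanned_pages, present_pages, managed_pages);
    return 0;
}

The lowmem managed value is only an approximation at this point; as noted in the comment above, it is corrected once the bootmem allocator hands its pages to the buddy system.
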
@@ -4731,7 +4772,7 @@ unsigned long __init find_min_pfn_with_active_regions(void) | |||
4731 | /* | 4772 | /* |
4732 | * early_calculate_totalpages() | 4773 | * early_calculate_totalpages() |
4733 | * Sum pages in active regions for movable zone. | 4774 | * Sum pages in active regions for movable zone. |
4734 | * Populate N_HIGH_MEMORY for calculating usable_nodes. | 4775 | * Populate N_MEMORY for calculating usable_nodes. |
4735 | */ | 4776 | */ |
4736 | static unsigned long __init early_calculate_totalpages(void) | 4777 | static unsigned long __init early_calculate_totalpages(void) |
4737 | { | 4778 | { |
@@ -4744,7 +4785,7 @@ static unsigned long __init early_calculate_totalpages(void) | |||
4744 | 4785 | ||
4745 | totalpages += pages; | 4786 | totalpages += pages; |
4746 | if (pages) | 4787 | if (pages) |
4747 | node_set_state(nid, N_HIGH_MEMORY); | 4788 | node_set_state(nid, N_MEMORY); |
4748 | } | 4789 | } |
4749 | return totalpages; | 4790 | return totalpages; |
4750 | } | 4791 | } |
@@ -4761,9 +4802,9 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
4761 | unsigned long usable_startpfn; | 4802 | unsigned long usable_startpfn; |
4762 | unsigned long kernelcore_node, kernelcore_remaining; | 4803 | unsigned long kernelcore_node, kernelcore_remaining; |
4763 | /* save the state before borrow the nodemask */ | 4804 | /* save the state before borrow the nodemask */ |
4764 | nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; | 4805 | nodemask_t saved_node_state = node_states[N_MEMORY]; |
4765 | unsigned long totalpages = early_calculate_totalpages(); | 4806 | unsigned long totalpages = early_calculate_totalpages(); |
4766 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | 4807 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); |
4767 | 4808 | ||
4768 | /* | 4809 | /* |
4769 | * If movablecore was specified, calculate what size of | 4810 | * If movablecore was specified, calculate what size of |
@@ -4798,7 +4839,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
4798 | restart: | 4839 | restart: |
4799 | /* Spread kernelcore memory as evenly as possible throughout nodes */ | 4840 | /* Spread kernelcore memory as evenly as possible throughout nodes */ |
4800 | kernelcore_node = required_kernelcore / usable_nodes; | 4841 | kernelcore_node = required_kernelcore / usable_nodes; |
4801 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4842 | for_each_node_state(nid, N_MEMORY) { |
4802 | unsigned long start_pfn, end_pfn; | 4843 | unsigned long start_pfn, end_pfn; |
4803 | 4844 | ||
4804 | /* | 4845 | /* |
@@ -4890,23 +4931,27 @@ restart: | |||
4890 | 4931 | ||
4891 | out: | 4932 | out: |
4892 | /* restore the node_state */ | 4933 | /* restore the node_state */ |
4893 | node_states[N_HIGH_MEMORY] = saved_node_state; | 4934 | node_states[N_MEMORY] = saved_node_state; |
4894 | } | 4935 | } |
4895 | 4936 | ||
4896 | /* Any regular memory on that node ? */ | 4937 | /* Any regular or high memory on that node ? */ |
4897 | static void __init check_for_regular_memory(pg_data_t *pgdat) | 4938 | static void check_for_memory(pg_data_t *pgdat, int nid) |
4898 | { | 4939 | { |
4899 | #ifdef CONFIG_HIGHMEM | ||
4900 | enum zone_type zone_type; | 4940 | enum zone_type zone_type; |
4901 | 4941 | ||
4902 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { | 4942 | if (N_MEMORY == N_NORMAL_MEMORY) |
4943 | return; | ||
4944 | |||
4945 | for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { | ||
4903 | struct zone *zone = &pgdat->node_zones[zone_type]; | 4946 | struct zone *zone = &pgdat->node_zones[zone_type]; |
4904 | if (zone->present_pages) { | 4947 | if (zone->present_pages) { |
4905 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); | 4948 | node_set_state(nid, N_HIGH_MEMORY); |
4949 | if (N_NORMAL_MEMORY != N_HIGH_MEMORY && | ||
4950 | zone_type <= ZONE_NORMAL) | ||
4951 | node_set_state(nid, N_NORMAL_MEMORY); | ||
4906 | break; | 4952 | break; |
4907 | } | 4953 | } |
4908 | } | 4954 | } |
4909 | #endif | ||
4910 | } | 4955 | } |
4911 | 4956 | ||
4912 | /** | 4957 | /** |
@@ -4989,8 +5034,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4989 | 5034 | ||
4990 | /* Any memory on that node */ | 5035 | /* Any memory on that node */ |
4991 | if (pgdat->node_present_pages) | 5036 | if (pgdat->node_present_pages) |
4992 | node_set_state(nid, N_HIGH_MEMORY); | 5037 | node_set_state(nid, N_MEMORY); |
4993 | check_for_regular_memory(pgdat); | 5038 | check_for_memory(pgdat, nid); |
4994 | } | 5039 | } |
4995 | } | 5040 | } |
4996 | 5041 | ||
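
After this change a node can end up with up to three states: any present memory at all gives N_MEMORY (set by the caller above), present pages in any zone below ZONE_MOVABLE give N_HIGH_MEMORY, and present pages at or below ZONE_NORMAL additionally give N_NORMAL_MEMORY. A reduced sketch of that classification over three made-up nodes, including a movable-only node that keeps N_MEMORY alone:

/* Reduced sketch of the node classification done above; data is made up. */
#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, ZONE_MOVABLE, MAX_NR_ZONES };

int main(void)
{
    /* present pages per zone for three hypothetical nodes */
    unsigned long nodes[3][MAX_NR_ZONES] = {
        { 4096, 200000, 100000, 0     },   /* node 0: normal + highmem */
        { 0,    0,      50000,  0     },   /* node 1: highmem only */
        { 0,    0,      0,      80000 },   /* node 2: movable only */
    };

    for (int nid = 0; nid < 3; nid++) {
        int n_memory = 0, n_high = 0, n_normal = 0;

        for (enum zone_type zt = 0; zt < MAX_NR_ZONES; zt++)
            if (nodes[nid][zt])
                n_memory = 1;                    /* node_present_pages != 0 */

        for (enum zone_type zt = 0; zt <= ZONE_MOVABLE - 1; zt++)
            if (nodes[nid][zt]) {
                n_high = 1;                      /* N_HIGH_MEMORY */
                if (zt <= ZONE_NORMAL)
                    n_normal = 1;                /* N_NORMAL_MEMORY */
                break;
            }

        printf("node %d: N_MEMORY=%d N_HIGH_MEMORY=%d N_NORMAL_MEMORY=%d\n",
               nid, n_memory, n_high, n_normal);
    }
    return 0;
}

Node 2 is the new "movable node" case this series cares about: it holds memory, so it sits in N_MEMORY, but callers that iterate N_HIGH_MEMORY or N_NORMAL_MEMORY never see it.
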
@@ -5727,7 +5772,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
5727 | unsigned int tries = 0; | 5772 | unsigned int tries = 0; |
5728 | int ret = 0; | 5773 | int ret = 0; |
5729 | 5774 | ||
5730 | migrate_prep_local(); | 5775 | migrate_prep(); |
5731 | 5776 | ||
5732 | while (pfn < end || !list_empty(&cc->migratepages)) { | 5777 | while (pfn < end || !list_empty(&cc->migratepages)) { |
5733 | if (fatal_signal_pending(current)) { | 5778 | if (fatal_signal_pending(current)) { |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 44db00e253ed..6d757e3a872a 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -274,7 +274,7 @@ void __init page_cgroup_init(void) | |||
274 | if (mem_cgroup_disabled()) | 274 | if (mem_cgroup_disabled()) |
275 | return; | 275 | return; |
276 | 276 | ||
277 | for_each_node_state(nid, N_HIGH_MEMORY) { | 277 | for_each_node_state(nid, N_MEMORY) { |
278 | unsigned long start_pfn, end_pfn; | 278 | unsigned long start_pfn, end_pfn; |
279 | 279 | ||
280 | start_pfn = node_start_pfn(nid); | 280 | start_pfn = node_start_pfn(nid); |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 6c118d012bb5..35aa294656cd 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -58,7 +58,7 @@ again: | |||
58 | if (!walk->pte_entry) | 58 | if (!walk->pte_entry) |
59 | continue; | 59 | continue; |
60 | 60 | ||
61 | split_huge_page_pmd(walk->mm, pmd); | 61 | split_huge_page_pmd_mm(walk->mm, addr, pmd); |
62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
63 | goto again; | 63 | goto again; |
64 | err = walk_pte_range(pmd, addr, next, walk); | 64 | err = walk_pte_range(pmd, addr, next, walk); |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -1249,12 +1249,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1249 | update_hiwater_rss(mm); | 1249 | update_hiwater_rss(mm); |
1250 | 1250 | ||
1251 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { | 1251 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
1252 | if (PageAnon(page)) | 1252 | if (!PageHuge(page)) { |
1253 | dec_mm_counter(mm, MM_ANONPAGES); | 1253 | if (PageAnon(page)) |
1254 | else | 1254 | dec_mm_counter(mm, MM_ANONPAGES); |
1255 | dec_mm_counter(mm, MM_FILEPAGES); | 1255 | else |
1256 | dec_mm_counter(mm, MM_FILEPAGES); | ||
1257 | } | ||
1256 | set_pte_at(mm, address, pte, | 1258 | set_pte_at(mm, address, pte, |
1257 | swp_entry_to_pte(make_hwpoison_entry(page))); | 1259 | swp_entry_to_pte(make_hwpoison_entry(page))); |
1258 | } else if (PageAnon(page)) { | 1260 | } else if (PageAnon(page)) { |
1259 | swp_entry_t entry = { .val = page_private(page) }; | 1261 | swp_entry_t entry = { .val = page_private(page) }; |
1260 | 1262 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index 50c5b8f3a359..03f9ba8fb8e5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1715,6 +1715,96 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | |||
1715 | return error; | 1715 | return error; |
1716 | } | 1716 | } |
1717 | 1717 | ||
1718 | /* | ||
1719 | * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. | ||
1720 | */ | ||
1721 | static pgoff_t shmem_seek_hole_data(struct address_space *mapping, | ||
1722 | pgoff_t index, pgoff_t end, int origin) | ||
1723 | { | ||
1724 | struct page *page; | ||
1725 | struct pagevec pvec; | ||
1726 | pgoff_t indices[PAGEVEC_SIZE]; | ||
1727 | bool done = false; | ||
1728 | int i; | ||
1729 | |||
1730 | pagevec_init(&pvec, 0); | ||
1731 | pvec.nr = 1; /* start small: we may be there already */ | ||
1732 | while (!done) { | ||
1733 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | ||
1734 | pvec.nr, pvec.pages, indices); | ||
1735 | if (!pvec.nr) { | ||
1736 | if (origin == SEEK_DATA) | ||
1737 | index = end; | ||
1738 | break; | ||
1739 | } | ||
1740 | for (i = 0; i < pvec.nr; i++, index++) { | ||
1741 | if (index < indices[i]) { | ||
1742 | if (origin == SEEK_HOLE) { | ||
1743 | done = true; | ||
1744 | break; | ||
1745 | } | ||
1746 | index = indices[i]; | ||
1747 | } | ||
1748 | page = pvec.pages[i]; | ||
1749 | if (page && !radix_tree_exceptional_entry(page)) { | ||
1750 | if (!PageUptodate(page)) | ||
1751 | page = NULL; | ||
1752 | } | ||
1753 | if (index >= end || | ||
1754 | (page && origin == SEEK_DATA) || | ||
1755 | (!page && origin == SEEK_HOLE)) { | ||
1756 | done = true; | ||
1757 | break; | ||
1758 | } | ||
1759 | } | ||
1760 | shmem_deswap_pagevec(&pvec); | ||
1761 | pagevec_release(&pvec); | ||
1762 | pvec.nr = PAGEVEC_SIZE; | ||
1763 | cond_resched(); | ||
1764 | } | ||
1765 | return index; | ||
1766 | } | ||
1767 | |||
1768 | static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) | ||
1769 | { | ||
1770 | struct address_space *mapping = file->f_mapping; | ||
1771 | struct inode *inode = mapping->host; | ||
1772 | pgoff_t start, end; | ||
1773 | loff_t new_offset; | ||
1774 | |||
1775 | if (origin != SEEK_DATA && origin != SEEK_HOLE) | ||
1776 | return generic_file_llseek_size(file, offset, origin, | ||
1777 | MAX_LFS_FILESIZE, i_size_read(inode)); | ||
1778 | mutex_lock(&inode->i_mutex); | ||
1779 | /* We're holding i_mutex so we can access i_size directly */ | ||
1780 | |||
1781 | if (offset < 0) | ||
1782 | offset = -EINVAL; | ||
1783 | else if (offset >= inode->i_size) | ||
1784 | offset = -ENXIO; | ||
1785 | else { | ||
1786 | start = offset >> PAGE_CACHE_SHIFT; | ||
1787 | end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1788 | new_offset = shmem_seek_hole_data(mapping, start, end, origin); | ||
1789 | new_offset <<= PAGE_CACHE_SHIFT; | ||
1790 | if (new_offset > offset) { | ||
1791 | if (new_offset < inode->i_size) | ||
1792 | offset = new_offset; | ||
1793 | else if (origin == SEEK_DATA) | ||
1794 | offset = -ENXIO; | ||
1795 | else | ||
1796 | offset = inode->i_size; | ||
1797 | } | ||
1798 | } | ||
1799 | |||
1800 | if (offset >= 0 && offset != file->f_pos) { | ||
1801 | file->f_pos = offset; | ||
1802 | file->f_version = 0; | ||
1803 | } | ||
1804 | mutex_unlock(&inode->i_mutex); | ||
1805 | return offset; | ||
1806 | } | ||
1807 | |||
1718 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, | 1808 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, |
1719 | loff_t len) | 1809 | loff_t len) |
1720 | { | 1810 | { |
@@ -2586,7 +2676,7 @@ static const struct address_space_operations shmem_aops = { | |||
2586 | static const struct file_operations shmem_file_operations = { | 2676 | static const struct file_operations shmem_file_operations = { |
2587 | .mmap = shmem_mmap, | 2677 | .mmap = shmem_mmap, |
2588 | #ifdef CONFIG_TMPFS | 2678 | #ifdef CONFIG_TMPFS |
2589 | .llseek = generic_file_llseek, | 2679 | .llseek = shmem_file_llseek, |
2590 | .read = do_sync_read, | 2680 | .read = do_sync_read, |
2591 | .write = do_sync_write, | 2681 | .write = do_sync_write, |
2592 | .aio_read = shmem_file_aio_read, | 2682 | .aio_read = shmem_file_aio_read, |
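
Once shmem_file_llseek() is plugged into shmem_file_operations, lseek() with SEEK_DATA/SEEK_HOLE on tmpfs reports real holes instead of treating the whole file as data. A small check that can be run against a tmpfs file (illustrative; assumes /dev/shm is a tmpfs mount and that glibc exposes SEEK_DATA/SEEK_HOLE with _GNU_SOURCE):

/* Probe SEEK_DATA/SEEK_HOLE on a sparse tmpfs file (illustrative). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    const char *path = argc > 1 ? argv[1] : "/dev/shm/seek_test";
    int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0600);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    /* Write one page of data at offset 1 MiB, leaving a hole before it. */
    char buf[4096] = { 'x' };
    if (pwrite(fd, buf, sizeof(buf), 1024 * 1024) != (ssize_t)sizeof(buf)) {
        perror("pwrite");
        return 1;
    }

    off_t data = lseek(fd, 0, SEEK_DATA);    /* first data at or after 0 */
    off_t hole = lseek(fd, data, SEEK_HOLE); /* first hole after that data */

    printf("first data at %lld, next hole at %lld\n",
           (long long)data, (long long)hole);

    close(fd);
    unlink(path);
    return 0;
}

On a kernel with this support the program reports the first data at 1 MiB and the next hole right after the written page; with the old generic_file_llseek() behaviour SEEK_DATA would simply return the starting offset, since the whole file is treated as data.
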
diff --git a/mm/vmscan.c b/mm/vmscan.c index 157bb116dec8..7f3096137b8a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -3131,7 +3131,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
3131 | int nid; | 3131 | int nid; |
3132 | 3132 | ||
3133 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { | 3133 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { |
3134 | for_each_node_state(nid, N_HIGH_MEMORY) { | 3134 | for_each_node_state(nid, N_MEMORY) { |
3135 | pg_data_t *pgdat = NODE_DATA(nid); | 3135 | pg_data_t *pgdat = NODE_DATA(nid); |
3136 | const struct cpumask *mask; | 3136 | const struct cpumask *mask; |
3137 | 3137 | ||
@@ -3187,7 +3187,7 @@ static int __init kswapd_init(void) | |||
3187 | int nid; | 3187 | int nid; |
3188 | 3188 | ||
3189 | swap_setup(); | 3189 | swap_setup(); |
3190 | for_each_node_state(nid, N_HIGH_MEMORY) | 3190 | for_each_node_state(nid, N_MEMORY) |
3191 | kswapd_run(nid); | 3191 | kswapd_run(nid); |
3192 | hotcpu_notifier(cpu_callback, 0); | 3192 | hotcpu_notifier(cpu_callback, 0); |
3193 | return 0; | 3193 | return 0; |
diff --git a/mm/vmstat.c b/mm/vmstat.c index c7370579111b..df14808f0a36 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -801,6 +801,8 @@ const char * const vmstat_text[] = { | |||
801 | "thp_collapse_alloc", | 801 | "thp_collapse_alloc", |
802 | "thp_collapse_alloc_failed", | 802 | "thp_collapse_alloc_failed", |
803 | "thp_split", | 803 | "thp_split", |
804 | "thp_zero_page_alloc", | ||
805 | "thp_zero_page_alloc_failed", | ||
804 | #endif | 806 | #endif |
805 | 807 | ||
806 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 808 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
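
The two new thp_zero_page_alloc* event counters appear as plain lines in /proc/vmstat on kernels built with CONFIG_TRANSPARENT_HUGEPAGE. A trivial reader that prints only the thp_* counters (illustrative):

/* Print the thp_* event counters from /proc/vmstat (illustrative). */
#include <stdio.h>
#include <string.h>

int main(void)
{
    FILE *fp = fopen("/proc/vmstat", "r");
    char line[256];

    if (!fp) {
        perror("/proc/vmstat");
        return 1;
    }
    while (fgets(line, sizeof(line), fp))
        if (strncmp(line, "thp_", 4) == 0)
            fputs(line, stdout);    /* e.g. "thp_zero_page_alloc 1" */
    fclose(fp);
    return 0;
}
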
@@ -930,7 +932,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg) | |||
930 | pg_data_t *pgdat = (pg_data_t *)arg; | 932 | pg_data_t *pgdat = (pg_data_t *)arg; |
931 | 933 | ||
932 | /* check memoryless node */ | 934 | /* check memoryless node */ |
933 | if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) | 935 | if (!node_state(pgdat->node_id, N_MEMORY)) |
934 | return 0; | 936 | return 0; |
935 | 937 | ||
936 | seq_printf(m, "Page block order: %d\n", pageblock_order); | 938 | seq_printf(m, "Page block order: %d\n", pageblock_order); |
@@ -992,14 +994,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
992 | "\n high %lu" | 994 | "\n high %lu" |
993 | "\n scanned %lu" | 995 | "\n scanned %lu" |
994 | "\n spanned %lu" | 996 | "\n spanned %lu" |
995 | "\n present %lu", | 997 | "\n present %lu" |
998 | "\n managed %lu", | ||
996 | zone_page_state(zone, NR_FREE_PAGES), | 999 | zone_page_state(zone, NR_FREE_PAGES), |
997 | min_wmark_pages(zone), | 1000 | min_wmark_pages(zone), |
998 | low_wmark_pages(zone), | 1001 | low_wmark_pages(zone), |
999 | high_wmark_pages(zone), | 1002 | high_wmark_pages(zone), |
1000 | zone->pages_scanned, | 1003 | zone->pages_scanned, |
1001 | zone->spanned_pages, | 1004 | zone->spanned_pages, |
1002 | zone->present_pages); | 1005 | zone->present_pages, |
1006 | zone->managed_pages); | ||
1003 | 1007 | ||
1004 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 1008 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
1005 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], | 1009 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], |
@@ -1292,7 +1296,7 @@ static int unusable_show(struct seq_file *m, void *arg) | |||
1292 | pg_data_t *pgdat = (pg_data_t *)arg; | 1296 | pg_data_t *pgdat = (pg_data_t *)arg; |
1293 | 1297 | ||
1294 | /* check memoryless node */ | 1298 | /* check memoryless node */ |
1295 | if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) | 1299 | if (!node_state(pgdat->node_id, N_MEMORY)) |
1296 | return 0; | 1300 | return 0; |
1297 | 1301 | ||
1298 | walk_zones_in_node(m, pgdat, unusable_show_print); | 1302 | walk_zones_in_node(m, pgdat, unusable_show_print); |
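
With the zoneinfo_show_print() change, every zone stanza in /proc/zoneinfo now carries a managed line next to present (and show_free_areas() prints managed:...kB). A quick reader that shows, per zone, how many present pages are not managed by the buddy allocator; the field names used here are the ones added above:

/* Compare "present" and "managed" per zone from /proc/zoneinfo (illustrative). */
#include <stdio.h>
#include <string.h>

int main(void)
{
    FILE *fp = fopen("/proc/zoneinfo", "r");
    char line[256], zone[64] = "?";
    unsigned long present = 0, managed = 0;

    if (!fp) {
        perror("/proc/zoneinfo");
        return 1;
    }
    while (fgets(line, sizeof(line), fp)) {
        if (sscanf(line, "Node %*d, zone %63s", zone) == 1)
            continue;
        if (sscanf(line, " present %lu", &present) == 1)
            continue;
        if (sscanf(line, " managed %lu", &managed) == 1)
            printf("%-8s present=%lu managed=%lu unmanaged=%lu\n",
                   zone, present, managed, present - managed);
    }
    fclose(fp);
    return 0;
}

The unmanaged column is the memmap, bootmem leftovers and other pages that exist in the zone but never reach the buddy allocator, which is exactly the gap the new counter was introduced to make visible.
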