author     Linus Torvalds <torvalds@linux-foundation.org>   2013-11-13 01:45:43 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-11-13 01:45:43 -0500
commit     5cbb3d216e2041700231bcfc383ee5f8b7fc8b74 (patch)
tree       a738fa82dbcefa9bd283c08bc67f38827be63937 /mm
parent     9bc9ccd7db1c9f043f75380b5a5b94912046a60e (diff)
parent     4e9b45a19241354daec281d7a785739829b52359 (diff)
Merge branch 'akpm' (patches from Andrew Morton)
Merge first patch-bomb from Andrew Morton:
"Quite a lot of other stuff is banked up awaiting further
next->mainline merging, but this batch contains:
- Lots of random misc patches
- OCFS2
- Most of MM
- backlight updates
- lib/ updates
- printk updates
- checkpatch updates
- epoll tweaking
- rtc updates
- hfs
- hfsplus
- documentation
- procfs
- update gcov to gcc-4.7 format
- IPC"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (269 commits)
ipc, msg: fix message length check for negative values
ipc/util.c: remove unnecessary work pending test
devpts: plug the memory leak in kill_sb
./Makefile: export initial ramdisk compression config option
init/Kconfig: add option to disable kernel compression
drivers: w1: make w1_slave::flags long to avoid memory corruption
drivers/w1/masters/ds1wm.c: use dev_get_platdata()
drivers/memstick/core/ms_block.c: fix unreachable state in h_msb_read_page()
drivers/memstick/core/mspro_block.c: fix attributes array allocation
drivers/pps/clients/pps-gpio.c: remove redundant of_match_ptr
kernel/panic.c: reduce 1 byte usage for print tainted buffer
gcov: reuse kbasename helper
kernel/gcov/fs.c: use pr_warn()
kernel/module.c: use pr_foo()
gcov: compile specific gcov implementation based on gcc version
gcov: add support for gcc 4.7 gcov format
gcov: move gcov structs definitions to a gcc version specific file
kernel/taskstats.c: return -ENOMEM when alloc memory fails in add_del_listener()
kernel/taskstats.c: add nla_nest_cancel() for failure processing between nla_nest_start() and nla_nest_end()
kernel/sysctl_binary.c: use scnprintf() instead of snprintf()
...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             17
-rw-r--r--  mm/bootmem.c            8
-rw-r--r--  mm/compaction.c         7
-rw-r--r--  mm/huge_memory.c       78
-rw-r--r--  mm/kmemleak.c           4
-rw-r--r--  mm/ksm.c                4
-rw-r--r--  mm/memblock.c         124
-rw-r--r--  mm/memcontrol.c        97
-rw-r--r--  mm/memory-failure.c    36
-rw-r--r--  mm/memory.c             2
-rw-r--r--  mm/memory_hotplug.c    65
-rw-r--r--  mm/mempolicy.c         62
-rw-r--r--  mm/mmap.c              16
-rw-r--r--  mm/mprotect.c          10
-rw-r--r--  mm/nobootmem.c         25
-rw-r--r--  mm/nommu.c              3
-rw-r--r--  mm/page_alloc.c        34
-rw-r--r--  mm/readahead.c          8
-rw-r--r--  mm/slab.c               2
-rw-r--r--  mm/slab.h               6
-rw-r--r--  mm/slab_common.c        2
-rw-r--r--  mm/slub.c               2
-rw-r--r--  mm/sparse.c            53
-rw-r--r--  mm/swapfile.c          16
-rw-r--r--  mm/util.c              13
-rw-r--r--  mm/vmalloc.c           48
-rw-r--r--  mm/vmstat.c            22
-rw-r--r--  mm/zswap.c            195
28 files changed, 569 insertions, 390 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 394838f489eb..3f4ffda152bb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,11 +153,18 @@ config MOVABLE_NODE | |||
153 | help | 153 | help |
154 | Allow a node to have only movable memory. Pages used by the kernel, | 154 | Allow a node to have only movable memory. Pages used by the kernel, |
155 | such as direct mapping pages cannot be migrated. So the corresponding | 155 | such as direct mapping pages cannot be migrated. So the corresponding |
156 | memory device cannot be hotplugged. This option allows users to | 156 | memory device cannot be hotplugged. This option allows the following |
157 | online all the memory of a node as movable memory so that the whole | 157 | two things: |
158 | node can be hotplugged. Users who don't use the memory hotplug | 158 | - When the system is booting, node full of hotpluggable memory can |
159 | feature are fine with this option on since they don't online memory | 159 | be arranged to have only movable memory so that the whole node can |
160 | as movable. | 160 | be hot-removed. (need movable_node boot option specified). |
161 | - After the system is up, the option allows users to online all the | ||
162 | memory of a node as movable memory so that the whole node can be | ||
163 | hot-removed. | ||
164 | |||
165 | Users who don't use the memory hotplug feature are fine with this | ||
166 | option on since they don't specify movable_node boot option or they | ||
167 | don't online memory as movable. | ||
161 | 168 | ||
162 | Say Y here if you want to hotplug a whole node. | 169 | Say Y here if you want to hotplug a whole node. |
163 | Say N here if you want kernel to use memory on all nodes evenly. | 170 | Say N here if you want kernel to use memory on all nodes evenly. |
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 6ab7744e692e..90bd3507b413 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -172,11 +172,12 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) | |||
172 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | 172 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) |
173 | { | 173 | { |
174 | struct page *page; | 174 | struct page *page; |
175 | unsigned long start, end, pages, count = 0; | 175 | unsigned long *map, start, end, pages, count = 0; |
176 | 176 | ||
177 | if (!bdata->node_bootmem_map) | 177 | if (!bdata->node_bootmem_map) |
178 | return 0; | 178 | return 0; |
179 | 179 | ||
180 | map = bdata->node_bootmem_map; | ||
180 | start = bdata->node_min_pfn; | 181 | start = bdata->node_min_pfn; |
181 | end = bdata->node_low_pfn; | 182 | end = bdata->node_low_pfn; |
182 | 183 | ||
@@ -184,10 +185,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
184 | bdata - bootmem_node_data, start, end); | 185 | bdata - bootmem_node_data, start, end); |
185 | 186 | ||
186 | while (start < end) { | 187 | while (start < end) { |
187 | unsigned long *map, idx, vec; | 188 | unsigned long idx, vec; |
188 | unsigned shift; | 189 | unsigned shift; |
189 | 190 | ||
190 | map = bdata->node_bootmem_map; | ||
191 | idx = start - bdata->node_min_pfn; | 191 | idx = start - bdata->node_min_pfn; |
192 | shift = idx & (BITS_PER_LONG - 1); | 192 | shift = idx & (BITS_PER_LONG - 1); |
193 | /* | 193 | /* |
@@ -784,7 +784,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | |||
784 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 784 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
785 | 785 | ||
786 | /* update goal according ...MAX_DMA32_PFN */ | 786 | /* update goal according ...MAX_DMA32_PFN */ |
787 | end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; | 787 | end_pfn = pgdat_end_pfn(pgdat); |
788 | 788 | ||
789 | if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && | 789 | if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && |
790 | (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { | 790 | (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { |
diff --git a/mm/compaction.c b/mm/compaction.c
index b5326b141a25..805165bcd3dd 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -235,10 +235,9 @@ static bool suitable_migration_target(struct page *page) | |||
235 | } | 235 | } |
236 | 236 | ||
237 | /* | 237 | /* |
238 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. | 238 | * Isolate free pages onto a private freelist. If @strict is true, will abort |
239 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free | 239 | * returning 0 on any invalid PFNs or non-free pages inside of the pageblock |
240 | * pages inside of the pageblock (even though it may still end up isolating | 240 | * (even though it may still end up isolating some pages). |
241 | * some pages). | ||
242 | */ | 241 | */ |
243 | static unsigned long isolate_freepages_block(struct compact_control *cc, | 242 | static unsigned long isolate_freepages_block(struct compact_control *cc, |
244 | unsigned long blockpfn, | 243 | unsigned long blockpfn, |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2612f60f53ee..0556c6a44959 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -27,11 +27,12 @@ | |||
27 | #include "internal.h" | 27 | #include "internal.h" |
28 | 28 | ||
29 | /* | 29 | /* |
30 | * By default transparent hugepage support is enabled for all mappings | 30 | * By default transparent hugepage support is disabled in order that avoid |
31 | * and khugepaged scans all mappings. Defrag is only invoked by | 31 | * to risk increase the memory footprint of applications without a guaranteed |
32 | * khugepaged hugepage allocations and by page faults inside | 32 | * benefit. When transparent hugepage support is enabled, is for all mappings, |
33 | * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived | 33 | * and khugepaged scans all mappings. |
34 | * allocations. | 34 | * Defrag is invoked by khugepaged hugepage allocations and by page faults |
35 | * for all hugepage allocations. | ||
35 | */ | 36 | */ |
36 | unsigned long transparent_hugepage_flags __read_mostly = | 37 | unsigned long transparent_hugepage_flags __read_mostly = |
37 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS | 38 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS |
@@ -758,14 +759,6 @@ static inline struct page *alloc_hugepage_vma(int defrag, | |||
758 | HPAGE_PMD_ORDER, vma, haddr, nd); | 759 | HPAGE_PMD_ORDER, vma, haddr, nd); |
759 | } | 760 | } |
760 | 761 | ||
761 | #ifndef CONFIG_NUMA | ||
762 | static inline struct page *alloc_hugepage(int defrag) | ||
763 | { | ||
764 | return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), | ||
765 | HPAGE_PMD_ORDER); | ||
766 | } | ||
767 | #endif | ||
768 | |||
769 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | 762 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, |
770 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | 763 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, |
771 | struct page *zero_page) | 764 | struct page *zero_page) |
@@ -2198,7 +2191,34 @@ static void khugepaged_alloc_sleep(void) | |||
2198 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); | 2191 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); |
2199 | } | 2192 | } |
2200 | 2193 | ||
2194 | static int khugepaged_node_load[MAX_NUMNODES]; | ||
2195 | |||
2201 | #ifdef CONFIG_NUMA | 2196 | #ifdef CONFIG_NUMA |
2197 | static int khugepaged_find_target_node(void) | ||
2198 | { | ||
2199 | static int last_khugepaged_target_node = NUMA_NO_NODE; | ||
2200 | int nid, target_node = 0, max_value = 0; | ||
2201 | |||
2202 | /* find first node with max normal pages hit */ | ||
2203 | for (nid = 0; nid < MAX_NUMNODES; nid++) | ||
2204 | if (khugepaged_node_load[nid] > max_value) { | ||
2205 | max_value = khugepaged_node_load[nid]; | ||
2206 | target_node = nid; | ||
2207 | } | ||
2208 | |||
2209 | /* do some balance if several nodes have the same hit record */ | ||
2210 | if (target_node <= last_khugepaged_target_node) | ||
2211 | for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; | ||
2212 | nid++) | ||
2213 | if (max_value == khugepaged_node_load[nid]) { | ||
2214 | target_node = nid; | ||
2215 | break; | ||
2216 | } | ||
2217 | |||
2218 | last_khugepaged_target_node = target_node; | ||
2219 | return target_node; | ||
2220 | } | ||
2221 | |||
2202 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | 2222 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) |
2203 | { | 2223 | { |
2204 | if (IS_ERR(*hpage)) { | 2224 | if (IS_ERR(*hpage)) { |
@@ -2232,9 +2252,8 @@ static struct page | |||
2232 | * mmap_sem in read mode is good idea also to allow greater | 2252 | * mmap_sem in read mode is good idea also to allow greater |
2233 | * scalability. | 2253 | * scalability. |
2234 | */ | 2254 | */ |
2235 | *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, | 2255 | *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( |
2236 | node, __GFP_OTHER_NODE); | 2256 | khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); |
2237 | |||
2238 | /* | 2257 | /* |
2239 | * After allocating the hugepage, release the mmap_sem read lock in | 2258 | * After allocating the hugepage, release the mmap_sem read lock in |
2240 | * preparation for taking it in write mode. | 2259 | * preparation for taking it in write mode. |
@@ -2250,6 +2269,17 @@ static struct page | |||
2250 | return *hpage; | 2269 | return *hpage; |
2251 | } | 2270 | } |
2252 | #else | 2271 | #else |
2272 | static int khugepaged_find_target_node(void) | ||
2273 | { | ||
2274 | return 0; | ||
2275 | } | ||
2276 | |||
2277 | static inline struct page *alloc_hugepage(int defrag) | ||
2278 | { | ||
2279 | return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), | ||
2280 | HPAGE_PMD_ORDER); | ||
2281 | } | ||
2282 | |||
2253 | static struct page *khugepaged_alloc_hugepage(bool *wait) | 2283 | static struct page *khugepaged_alloc_hugepage(bool *wait) |
2254 | { | 2284 | { |
2255 | struct page *hpage; | 2285 | struct page *hpage; |
@@ -2456,6 +2486,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2456 | if (pmd_trans_huge(*pmd)) | 2486 | if (pmd_trans_huge(*pmd)) |
2457 | goto out; | 2487 | goto out; |
2458 | 2488 | ||
2489 | memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); | ||
2459 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2490 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
2460 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; | 2491 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; |
2461 | _pte++, _address += PAGE_SIZE) { | 2492 | _pte++, _address += PAGE_SIZE) { |
@@ -2472,12 +2503,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2472 | if (unlikely(!page)) | 2503 | if (unlikely(!page)) |
2473 | goto out_unmap; | 2504 | goto out_unmap; |
2474 | /* | 2505 | /* |
2475 | * Chose the node of the first page. This could | 2506 | * Record which node the original page is from and save this |
2476 | * be more sophisticated and look at more pages, | 2507 | * information to khugepaged_node_load[]. |
2477 | * but isn't for now. | 2508 | * Khupaged will allocate hugepage from the node has the max |
2509 | * hit record. | ||
2478 | */ | 2510 | */ |
2479 | if (node == NUMA_NO_NODE) | 2511 | node = page_to_nid(page); |
2480 | node = page_to_nid(page); | 2512 | khugepaged_node_load[node]++; |
2481 | VM_BUG_ON(PageCompound(page)); | 2513 | VM_BUG_ON(PageCompound(page)); |
2482 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | 2514 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) |
2483 | goto out_unmap; | 2515 | goto out_unmap; |
@@ -2492,9 +2524,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2492 | ret = 1; | 2524 | ret = 1; |
2493 | out_unmap: | 2525 | out_unmap: |
2494 | pte_unmap_unlock(pte, ptl); | 2526 | pte_unmap_unlock(pte, ptl); |
2495 | if (ret) | 2527 | if (ret) { |
2528 | node = khugepaged_find_target_node(); | ||
2496 | /* collapse_huge_page will return with the mmap_sem released */ | 2529 | /* collapse_huge_page will return with the mmap_sem released */ |
2497 | collapse_huge_page(mm, address, hpage, vma, node); | 2530 | collapse_huge_page(mm, address, hpage, vma, node); |
2531 | } | ||
2498 | out: | 2532 | out: |
2499 | return ret; | 2533 | return ret; |
2500 | } | 2534 | } |
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e126b0ef9ad2..31f01c5011e5 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -753,7 +753,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) | |||
753 | } | 753 | } |
754 | 754 | ||
755 | spin_lock_irqsave(&object->lock, flags); | 755 | spin_lock_irqsave(&object->lock, flags); |
756 | if (ptr + size > object->pointer + object->size) { | 756 | if (size == SIZE_MAX) { |
757 | size = object->pointer + object->size - ptr; | ||
758 | } else if (ptr + size > object->pointer + object->size) { | ||
757 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); | 759 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); |
758 | dump_object_info(object); | 760 | dump_object_info(object); |
759 | kmem_cache_free(scan_area_cache, area); | 761 | kmem_cache_free(scan_area_cache, area); |
diff --git a/mm/ksm.c b/mm/ksm.c
@@ -2309,8 +2309,8 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj, | |||
2309 | * Allocate stable and unstable together: | 2309 | * Allocate stable and unstable together: |
2310 | * MAXSMP NODES_SHIFT 10 will use 16kB. | 2310 | * MAXSMP NODES_SHIFT 10 will use 16kB. |
2311 | */ | 2311 | */ |
2312 | buf = kcalloc(nr_node_ids + nr_node_ids, | 2312 | buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf), |
2313 | sizeof(*buf), GFP_KERNEL | __GFP_ZERO); | 2313 | GFP_KERNEL); |
2314 | /* Let us assume that RB_ROOT is NULL is zero */ | 2314 | /* Let us assume that RB_ROOT is NULL is zero */ |
2315 | if (!buf) | 2315 | if (!buf) |
2316 | err = -ENOMEM; | 2316 | err = -ENOMEM; |
diff --git a/mm/memblock.c b/mm/memblock.c
index 0ac412a0a7ee..53e477bb5558 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,6 +20,8 @@ | |||
20 | #include <linux/seq_file.h> | 20 | #include <linux/seq_file.h> |
21 | #include <linux/memblock.h> | 21 | #include <linux/memblock.h> |
22 | 22 | ||
23 | #include <asm-generic/sections.h> | ||
24 | |||
23 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 25 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
24 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 26 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
25 | 27 | ||
@@ -32,6 +34,7 @@ struct memblock memblock __initdata_memblock = { | |||
32 | .reserved.cnt = 1, /* empty dummy entry */ | 34 | .reserved.cnt = 1, /* empty dummy entry */ |
33 | .reserved.max = INIT_MEMBLOCK_REGIONS, | 35 | .reserved.max = INIT_MEMBLOCK_REGIONS, |
34 | 36 | ||
37 | .bottom_up = false, | ||
35 | .current_limit = MEMBLOCK_ALLOC_ANYWHERE, | 38 | .current_limit = MEMBLOCK_ALLOC_ANYWHERE, |
36 | }; | 39 | }; |
37 | 40 | ||
@@ -82,6 +85,73 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, | |||
82 | return (i < type->cnt) ? i : -1; | 85 | return (i < type->cnt) ? i : -1; |
83 | } | 86 | } |
84 | 87 | ||
88 | /* | ||
89 | * __memblock_find_range_bottom_up - find free area utility in bottom-up | ||
90 | * @start: start of candidate range | ||
91 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | ||
92 | * @size: size of free area to find | ||
93 | * @align: alignment of free area to find | ||
94 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | ||
95 | * | ||
96 | * Utility called from memblock_find_in_range_node(), find free area bottom-up. | ||
97 | * | ||
98 | * RETURNS: | ||
99 | * Found address on success, 0 on failure. | ||
100 | */ | ||
101 | static phys_addr_t __init_memblock | ||
102 | __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, | ||
103 | phys_addr_t size, phys_addr_t align, int nid) | ||
104 | { | ||
105 | phys_addr_t this_start, this_end, cand; | ||
106 | u64 i; | ||
107 | |||
108 | for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) { | ||
109 | this_start = clamp(this_start, start, end); | ||
110 | this_end = clamp(this_end, start, end); | ||
111 | |||
112 | cand = round_up(this_start, align); | ||
113 | if (cand < this_end && this_end - cand >= size) | ||
114 | return cand; | ||
115 | } | ||
116 | |||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | /** | ||
121 | * __memblock_find_range_top_down - find free area utility, in top-down | ||
122 | * @start: start of candidate range | ||
123 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | ||
124 | * @size: size of free area to find | ||
125 | * @align: alignment of free area to find | ||
126 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | ||
127 | * | ||
128 | * Utility called from memblock_find_in_range_node(), find free area top-down. | ||
129 | * | ||
130 | * RETURNS: | ||
131 | * Found address on success, 0 on failure. | ||
132 | */ | ||
133 | static phys_addr_t __init_memblock | ||
134 | __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, | ||
135 | phys_addr_t size, phys_addr_t align, int nid) | ||
136 | { | ||
137 | phys_addr_t this_start, this_end, cand; | ||
138 | u64 i; | ||
139 | |||
140 | for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { | ||
141 | this_start = clamp(this_start, start, end); | ||
142 | this_end = clamp(this_end, start, end); | ||
143 | |||
144 | if (this_end < size) | ||
145 | continue; | ||
146 | |||
147 | cand = round_down(this_end - size, align); | ||
148 | if (cand >= this_start) | ||
149 | return cand; | ||
150 | } | ||
151 | |||
152 | return 0; | ||
153 | } | ||
154 | |||
85 | /** | 155 | /** |
86 | * memblock_find_in_range_node - find free area in given range and node | 156 | * memblock_find_in_range_node - find free area in given range and node |
87 | * @start: start of candidate range | 157 | * @start: start of candidate range |
@@ -92,15 +162,23 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, | |||
92 | * | 162 | * |
93 | * Find @size free area aligned to @align in the specified range and node. | 163 | * Find @size free area aligned to @align in the specified range and node. |
94 | * | 164 | * |
165 | * When allocation direction is bottom-up, the @start should be greater | ||
166 | * than the end of the kernel image. Otherwise, it will be trimmed. The | ||
167 | * reason is that we want the bottom-up allocation just near the kernel | ||
168 | * image so it is highly likely that the allocated memory and the kernel | ||
169 | * will reside in the same node. | ||
170 | * | ||
171 | * If bottom-up allocation failed, will try to allocate memory top-down. | ||
172 | * | ||
95 | * RETURNS: | 173 | * RETURNS: |
96 | * Found address on success, %0 on failure. | 174 | * Found address on success, 0 on failure. |
97 | */ | 175 | */ |
98 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | 176 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, |
99 | phys_addr_t end, phys_addr_t size, | 177 | phys_addr_t end, phys_addr_t size, |
100 | phys_addr_t align, int nid) | 178 | phys_addr_t align, int nid) |
101 | { | 179 | { |
102 | phys_addr_t this_start, this_end, cand; | 180 | int ret; |
103 | u64 i; | 181 | phys_addr_t kernel_end; |
104 | 182 | ||
105 | /* pump up @end */ | 183 | /* pump up @end */ |
106 | if (end == MEMBLOCK_ALLOC_ACCESSIBLE) | 184 | if (end == MEMBLOCK_ALLOC_ACCESSIBLE) |
@@ -109,19 +187,39 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | |||
109 | /* avoid allocating the first page */ | 187 | /* avoid allocating the first page */ |
110 | start = max_t(phys_addr_t, start, PAGE_SIZE); | 188 | start = max_t(phys_addr_t, start, PAGE_SIZE); |
111 | end = max(start, end); | 189 | end = max(start, end); |
190 | kernel_end = __pa_symbol(_end); | ||
112 | 191 | ||
113 | for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { | 192 | /* |
114 | this_start = clamp(this_start, start, end); | 193 | * try bottom-up allocation only when bottom-up mode |
115 | this_end = clamp(this_end, start, end); | 194 | * is set and @end is above the kernel image. |
195 | */ | ||
196 | if (memblock_bottom_up() && end > kernel_end) { | ||
197 | phys_addr_t bottom_up_start; | ||
116 | 198 | ||
117 | if (this_end < size) | 199 | /* make sure we will allocate above the kernel */ |
118 | continue; | 200 | bottom_up_start = max(start, kernel_end); |
119 | 201 | ||
120 | cand = round_down(this_end - size, align); | 202 | /* ok, try bottom-up allocation first */ |
121 | if (cand >= this_start) | 203 | ret = __memblock_find_range_bottom_up(bottom_up_start, end, |
122 | return cand; | 204 | size, align, nid); |
205 | if (ret) | ||
206 | return ret; | ||
207 | |||
208 | /* | ||
209 | * we always limit bottom-up allocation above the kernel, | ||
210 | * but top-down allocation doesn't have the limit, so | ||
211 | * retrying top-down allocation may succeed when bottom-up | ||
212 | * allocation failed. | ||
213 | * | ||
214 | * bottom-up allocation is expected to be fail very rarely, | ||
215 | * so we use WARN_ONCE() here to see the stack trace if | ||
216 | * fail happens. | ||
217 | */ | ||
218 | WARN_ONCE(1, "memblock: bottom-up allocation failed, " | ||
219 | "memory hotunplug may be affected\n"); | ||
123 | } | 220 | } |
124 | return 0; | 221 | |
222 | return __memblock_find_range_top_down(start, end, size, align, nid); | ||
125 | } | 223 | } |
126 | 224 | ||
127 | /** | 225 | /** |
@@ -134,7 +232,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | |||
134 | * Find @size free area aligned to @align in the specified range. | 232 | * Find @size free area aligned to @align in the specified range. |
135 | * | 233 | * |
136 | * RETURNS: | 234 | * RETURNS: |
137 | * Found address on success, %0 on failure. | 235 | * Found address on success, 0 on failure. |
138 | */ | 236 | */ |
139 | phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, | 237 | phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, |
140 | phys_addr_t end, phys_addr_t size, | 238 | phys_addr_t end, phys_addr_t size, |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 796820925de0..f20a57b7faf2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,6 +59,7 @@ | |||
59 | #include <net/sock.h> | 59 | #include <net/sock.h> |
60 | #include <net/ip.h> | 60 | #include <net/ip.h> |
61 | #include <net/tcp_memcontrol.h> | 61 | #include <net/tcp_memcontrol.h> |
62 | #include "slab.h" | ||
62 | 63 | ||
63 | #include <asm/uaccess.h> | 64 | #include <asm/uaccess.h> |
64 | 65 | ||
@@ -2968,7 +2969,7 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) | |||
2968 | 2969 | ||
2969 | VM_BUG_ON(p->is_root_cache); | 2970 | VM_BUG_ON(p->is_root_cache); |
2970 | cachep = p->root_cache; | 2971 | cachep = p->root_cache; |
2971 | return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; | 2972 | return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); |
2972 | } | 2973 | } |
2973 | 2974 | ||
2974 | #ifdef CONFIG_SLABINFO | 2975 | #ifdef CONFIG_SLABINFO |
@@ -2997,21 +2998,14 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) | |||
2997 | struct res_counter *fail_res; | 2998 | struct res_counter *fail_res; |
2998 | struct mem_cgroup *_memcg; | 2999 | struct mem_cgroup *_memcg; |
2999 | int ret = 0; | 3000 | int ret = 0; |
3000 | bool may_oom; | ||
3001 | 3001 | ||
3002 | ret = res_counter_charge(&memcg->kmem, size, &fail_res); | 3002 | ret = res_counter_charge(&memcg->kmem, size, &fail_res); |
3003 | if (ret) | 3003 | if (ret) |
3004 | return ret; | 3004 | return ret; |
3005 | 3005 | ||
3006 | /* | ||
3007 | * Conditions under which we can wait for the oom_killer. Those are | ||
3008 | * the same conditions tested by the core page allocator | ||
3009 | */ | ||
3010 | may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); | ||
3011 | |||
3012 | _memcg = memcg; | 3006 | _memcg = memcg; |
3013 | ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, | 3007 | ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, |
3014 | &_memcg, may_oom); | 3008 | &_memcg, oom_gfp_allowed(gfp)); |
3015 | 3009 | ||
3016 | if (ret == -EINTR) { | 3010 | if (ret == -EINTR) { |
3017 | /* | 3011 | /* |
@@ -3151,7 +3145,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | |||
3151 | { | 3145 | { |
3152 | struct memcg_cache_params *cur_params = s->memcg_params; | 3146 | struct memcg_cache_params *cur_params = s->memcg_params; |
3153 | 3147 | ||
3154 | VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); | 3148 | VM_BUG_ON(!is_root_cache(s)); |
3155 | 3149 | ||
3156 | if (num_groups > memcg_limited_groups_array_size) { | 3150 | if (num_groups > memcg_limited_groups_array_size) { |
3157 | int i; | 3151 | int i; |
@@ -3412,7 +3406,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | |||
3412 | idx = memcg_cache_id(memcg); | 3406 | idx = memcg_cache_id(memcg); |
3413 | 3407 | ||
3414 | mutex_lock(&memcg_cache_mutex); | 3408 | mutex_lock(&memcg_cache_mutex); |
3415 | new_cachep = cachep->memcg_params->memcg_caches[idx]; | 3409 | new_cachep = cache_from_memcg_idx(cachep, idx); |
3416 | if (new_cachep) { | 3410 | if (new_cachep) { |
3417 | css_put(&memcg->css); | 3411 | css_put(&memcg->css); |
3418 | goto out; | 3412 | goto out; |
@@ -3458,8 +3452,8 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) | |||
3458 | * we'll take the set_limit_mutex to protect ourselves against this. | 3452 | * we'll take the set_limit_mutex to protect ourselves against this. |
3459 | */ | 3453 | */ |
3460 | mutex_lock(&set_limit_mutex); | 3454 | mutex_lock(&set_limit_mutex); |
3461 | for (i = 0; i < memcg_limited_groups_array_size; i++) { | 3455 | for_each_memcg_cache_index(i) { |
3462 | c = s->memcg_params->memcg_caches[i]; | 3456 | c = cache_from_memcg_idx(s, i); |
3463 | if (!c) | 3457 | if (!c) |
3464 | continue; | 3458 | continue; |
3465 | 3459 | ||
@@ -3592,8 +3586,8 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
3592 | * code updating memcg_caches will issue a write barrier to match this. | 3586 | * code updating memcg_caches will issue a write barrier to match this. |
3593 | */ | 3587 | */ |
3594 | read_barrier_depends(); | 3588 | read_barrier_depends(); |
3595 | if (likely(cachep->memcg_params->memcg_caches[idx])) { | 3589 | if (likely(cache_from_memcg_idx(cachep, idx))) { |
3596 | cachep = cachep->memcg_params->memcg_caches[idx]; | 3590 | cachep = cache_from_memcg_idx(cachep, idx); |
3597 | goto out; | 3591 | goto out; |
3598 | } | 3592 | } |
3599 | 3593 | ||
@@ -5389,45 +5383,50 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, | |||
5389 | static int memcg_numa_stat_show(struct cgroup_subsys_state *css, | 5383 | static int memcg_numa_stat_show(struct cgroup_subsys_state *css, |
5390 | struct cftype *cft, struct seq_file *m) | 5384 | struct cftype *cft, struct seq_file *m) |
5391 | { | 5385 | { |
5386 | struct numa_stat { | ||
5387 | const char *name; | ||
5388 | unsigned int lru_mask; | ||
5389 | }; | ||
5390 | |||
5391 | static const struct numa_stat stats[] = { | ||
5392 | { "total", LRU_ALL }, | ||
5393 | { "file", LRU_ALL_FILE }, | ||
5394 | { "anon", LRU_ALL_ANON }, | ||
5395 | { "unevictable", BIT(LRU_UNEVICTABLE) }, | ||
5396 | }; | ||
5397 | const struct numa_stat *stat; | ||
5392 | int nid; | 5398 | int nid; |
5393 | unsigned long total_nr, file_nr, anon_nr, unevictable_nr; | 5399 | unsigned long nr; |
5394 | unsigned long node_nr; | ||
5395 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5400 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5396 | 5401 | ||
5397 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); | 5402 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { |
5398 | seq_printf(m, "total=%lu", total_nr); | 5403 | nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); |
5399 | for_each_node_state(nid, N_MEMORY) { | 5404 | seq_printf(m, "%s=%lu", stat->name, nr); |
5400 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); | 5405 | for_each_node_state(nid, N_MEMORY) { |
5401 | seq_printf(m, " N%d=%lu", nid, node_nr); | 5406 | nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
5402 | } | 5407 | stat->lru_mask); |
5403 | seq_putc(m, '\n'); | 5408 | seq_printf(m, " N%d=%lu", nid, nr); |
5404 | 5409 | } | |
5405 | file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); | 5410 | seq_putc(m, '\n'); |
5406 | seq_printf(m, "file=%lu", file_nr); | 5411 | } |
5407 | for_each_node_state(nid, N_MEMORY) { | 5412 | |
5408 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 5413 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { |
5409 | LRU_ALL_FILE); | 5414 | struct mem_cgroup *iter; |
5410 | seq_printf(m, " N%d=%lu", nid, node_nr); | 5415 | |
5411 | } | 5416 | nr = 0; |
5412 | seq_putc(m, '\n'); | 5417 | for_each_mem_cgroup_tree(iter, memcg) |
5413 | 5418 | nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); | |
5414 | anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); | 5419 | seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); |
5415 | seq_printf(m, "anon=%lu", anon_nr); | 5420 | for_each_node_state(nid, N_MEMORY) { |
5416 | for_each_node_state(nid, N_MEMORY) { | 5421 | nr = 0; |
5417 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 5422 | for_each_mem_cgroup_tree(iter, memcg) |
5418 | LRU_ALL_ANON); | 5423 | nr += mem_cgroup_node_nr_lru_pages( |
5419 | seq_printf(m, " N%d=%lu", nid, node_nr); | 5424 | iter, nid, stat->lru_mask); |
5425 | seq_printf(m, " N%d=%lu", nid, nr); | ||
5426 | } | ||
5427 | seq_putc(m, '\n'); | ||
5420 | } | 5428 | } |
5421 | seq_putc(m, '\n'); | ||
5422 | 5429 | ||
5423 | unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); | ||
5424 | seq_printf(m, "unevictable=%lu", unevictable_nr); | ||
5425 | for_each_node_state(nid, N_MEMORY) { | ||
5426 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | ||
5427 | BIT(LRU_UNEVICTABLE)); | ||
5428 | seq_printf(m, " N%d=%lu", nid, node_nr); | ||
5429 | } | ||
5430 | seq_putc(m, '\n'); | ||
5431 | return 0; | 5430 | return 0; |
5432 | } | 5431 | } |
5433 | #endif /* CONFIG_NUMA */ | 5432 | #endif /* CONFIG_NUMA */ |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index bf3351b5115e..f9d78ec7831f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1423,19 +1423,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1423 | return 1; | 1423 | return 1; |
1424 | 1424 | ||
1425 | /* | 1425 | /* |
1426 | * The lock_memory_hotplug prevents a race with memory hotplug. | ||
1427 | * This is a big hammer, a better would be nicer. | ||
1428 | */ | ||
1429 | lock_memory_hotplug(); | ||
1430 | |||
1431 | /* | ||
1432 | * Isolate the page, so that it doesn't get reallocated if it | ||
1433 | * was free. This flag should be kept set until the source page | ||
1434 | * is freed and PG_hwpoison on it is set. | ||
1435 | */ | ||
1436 | if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE) | ||
1437 | set_migratetype_isolate(p, true); | ||
1438 | /* | ||
1439 | * When the target page is a free hugepage, just remove it | 1426 | * When the target page is a free hugepage, just remove it |
1440 | * from free hugepage list. | 1427 | * from free hugepage list. |
1441 | */ | 1428 | */ |
@@ -1455,7 +1442,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1455 | /* Not a free page */ | 1442 | /* Not a free page */ |
1456 | ret = 1; | 1443 | ret = 1; |
1457 | } | 1444 | } |
1458 | unlock_memory_hotplug(); | ||
1459 | return ret; | 1445 | return ret; |
1460 | } | 1446 | } |
1461 | 1447 | ||
@@ -1654,15 +1640,28 @@ int soft_offline_page(struct page *page, int flags) | |||
1654 | } | 1640 | } |
1655 | } | 1641 | } |
1656 | 1642 | ||
1643 | /* | ||
1644 | * The lock_memory_hotplug prevents a race with memory hotplug. | ||
1645 | * This is a big hammer, a better would be nicer. | ||
1646 | */ | ||
1647 | lock_memory_hotplug(); | ||
1648 | |||
1649 | /* | ||
1650 | * Isolate the page, so that it doesn't get reallocated if it | ||
1651 | * was free. This flag should be kept set until the source page | ||
1652 | * is freed and PG_hwpoison on it is set. | ||
1653 | */ | ||
1654 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
1655 | set_migratetype_isolate(page, true); | ||
1656 | |||
1657 | ret = get_any_page(page, pfn, flags); | 1657 | ret = get_any_page(page, pfn, flags); |
1658 | if (ret < 0) | 1658 | unlock_memory_hotplug(); |
1659 | goto unset; | 1659 | if (ret > 0) { /* for in-use pages */ |
1660 | if (ret) { /* for in-use pages */ | ||
1661 | if (PageHuge(page)) | 1660 | if (PageHuge(page)) |
1662 | ret = soft_offline_huge_page(page, flags); | 1661 | ret = soft_offline_huge_page(page, flags); |
1663 | else | 1662 | else |
1664 | ret = __soft_offline_page(page, flags); | 1663 | ret = __soft_offline_page(page, flags); |
1665 | } else { /* for free pages */ | 1664 | } else if (ret == 0) { /* for free pages */ |
1666 | if (PageHuge(page)) { | 1665 | if (PageHuge(page)) { |
1667 | set_page_hwpoison_huge_page(hpage); | 1666 | set_page_hwpoison_huge_page(hpage); |
1668 | dequeue_hwpoisoned_huge_page(hpage); | 1667 | dequeue_hwpoisoned_huge_page(hpage); |
@@ -1673,7 +1672,6 @@ int soft_offline_page(struct page *page, int flags) | |||
1673 | atomic_long_inc(&num_poisoned_pages); | 1672 | atomic_long_inc(&num_poisoned_pages); |
1674 | } | 1673 | } |
1675 | } | 1674 | } |
1676 | unset: | ||
1677 | unset_migratetype_isolate(page, MIGRATE_MOVABLE); | 1675 | unset_migratetype_isolate(page, MIGRATE_MOVABLE); |
1678 | return ret; | 1676 | return ret; |
1679 | } | 1677 | } |
diff --git a/mm/memory.c b/mm/memory.c
index 33a3dbec3cc8..bf8665849a5f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -453,8 +453,6 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | |||
453 | 453 | ||
454 | /* | 454 | /* |
455 | * This function frees user-level page tables of a process. | 455 | * This function frees user-level page tables of a process. |
456 | * | ||
457 | * Must be called with pagetable lock held. | ||
458 | */ | 456 | */ |
459 | void free_pgd_range(struct mmu_gather *tlb, | 457 | void free_pgd_range(struct mmu_gather *tlb, |
460 | unsigned long addr, unsigned long end, | 458 | unsigned long addr, unsigned long end, |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ed85fe3870e2..489f235502db 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/firmware-map.h> | 31 | #include <linux/firmware-map.h> |
32 | #include <linux/stop_machine.h> | 32 | #include <linux/stop_machine.h> |
33 | #include <linux/hugetlb.h> | 33 | #include <linux/hugetlb.h> |
34 | #include <linux/memblock.h> | ||
34 | 35 | ||
35 | #include <asm/tlbflush.h> | 36 | #include <asm/tlbflush.h> |
36 | 37 | ||
@@ -365,8 +366,7 @@ out_fail: | |||
365 | static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, | 366 | static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, |
366 | unsigned long end_pfn) | 367 | unsigned long end_pfn) |
367 | { | 368 | { |
368 | unsigned long old_pgdat_end_pfn = | 369 | unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); |
369 | pgdat->node_start_pfn + pgdat->node_spanned_pages; | ||
370 | 370 | ||
371 | if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) | 371 | if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) |
372 | pgdat->node_start_pfn = start_pfn; | 372 | pgdat->node_start_pfn = start_pfn; |
@@ -402,13 +402,12 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
402 | static int __meminit __add_section(int nid, struct zone *zone, | 402 | static int __meminit __add_section(int nid, struct zone *zone, |
403 | unsigned long phys_start_pfn) | 403 | unsigned long phys_start_pfn) |
404 | { | 404 | { |
405 | int nr_pages = PAGES_PER_SECTION; | ||
406 | int ret; | 405 | int ret; |
407 | 406 | ||
408 | if (pfn_valid(phys_start_pfn)) | 407 | if (pfn_valid(phys_start_pfn)) |
409 | return -EEXIST; | 408 | return -EEXIST; |
410 | 409 | ||
411 | ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); | 410 | ret = sparse_add_one_section(zone, phys_start_pfn); |
412 | 411 | ||
413 | if (ret < 0) | 412 | if (ret < 0) |
414 | return ret; | 413 | return ret; |
@@ -579,9 +578,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, | |||
579 | static void shrink_pgdat_span(struct pglist_data *pgdat, | 578 | static void shrink_pgdat_span(struct pglist_data *pgdat, |
580 | unsigned long start_pfn, unsigned long end_pfn) | 579 | unsigned long start_pfn, unsigned long end_pfn) |
581 | { | 580 | { |
582 | unsigned long pgdat_start_pfn = pgdat->node_start_pfn; | 581 | unsigned long pgdat_start_pfn = pgdat->node_start_pfn; |
583 | unsigned long pgdat_end_pfn = | 582 | unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ |
584 | pgdat->node_start_pfn + pgdat->node_spanned_pages; | 583 | unsigned long pgdat_end_pfn = p; |
585 | unsigned long pfn; | 584 | unsigned long pfn; |
586 | struct mem_section *ms; | 585 | struct mem_section *ms; |
587 | int nid = pgdat->node_id; | 586 | int nid = pgdat->node_id; |
@@ -935,7 +934,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
935 | arg.nr_pages = nr_pages; | 934 | arg.nr_pages = nr_pages; |
936 | node_states_check_changes_online(nr_pages, zone, &arg); | 935 | node_states_check_changes_online(nr_pages, zone, &arg); |
937 | 936 | ||
938 | nid = page_to_nid(pfn_to_page(pfn)); | 937 | nid = pfn_to_nid(pfn); |
939 | 938 | ||
940 | ret = memory_notify(MEM_GOING_ONLINE, &arg); | 939 | ret = memory_notify(MEM_GOING_ONLINE, &arg); |
941 | ret = notifier_to_errno(ret); | 940 | ret = notifier_to_errno(ret); |
@@ -1044,17 +1043,23 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) | |||
1044 | } | 1043 | } |
1045 | 1044 | ||
1046 | 1045 | ||
1047 | /* | 1046 | /** |
1047 | * try_online_node - online a node if offlined | ||
1048 | * | ||
1048 | * called by cpu_up() to online a node without onlined memory. | 1049 | * called by cpu_up() to online a node without onlined memory. |
1049 | */ | 1050 | */ |
1050 | int mem_online_node(int nid) | 1051 | int try_online_node(int nid) |
1051 | { | 1052 | { |
1052 | pg_data_t *pgdat; | 1053 | pg_data_t *pgdat; |
1053 | int ret; | 1054 | int ret; |
1054 | 1055 | ||
1056 | if (node_online(nid)) | ||
1057 | return 0; | ||
1058 | |||
1055 | lock_memory_hotplug(); | 1059 | lock_memory_hotplug(); |
1056 | pgdat = hotadd_new_pgdat(nid, 0); | 1060 | pgdat = hotadd_new_pgdat(nid, 0); |
1057 | if (!pgdat) { | 1061 | if (!pgdat) { |
1062 | pr_err("Cannot online node %d due to NULL pgdat\n", nid); | ||
1058 | ret = -ENOMEM; | 1063 | ret = -ENOMEM; |
1059 | goto out; | 1064 | goto out; |
1060 | } | 1065 | } |
@@ -1062,6 +1067,12 @@ int mem_online_node(int nid) | |||
1062 | ret = register_one_node(nid); | 1067 | ret = register_one_node(nid); |
1063 | BUG_ON(ret); | 1068 | BUG_ON(ret); |
1064 | 1069 | ||
1070 | if (pgdat->node_zonelists->_zonerefs->zone == NULL) { | ||
1071 | mutex_lock(&zonelists_mutex); | ||
1072 | build_all_zonelists(NULL, NULL); | ||
1073 | mutex_unlock(&zonelists_mutex); | ||
1074 | } | ||
1075 | |||
1065 | out: | 1076 | out: |
1066 | unlock_memory_hotplug(); | 1077 | unlock_memory_hotplug(); |
1067 | return ret; | 1078 | return ret; |
@@ -1412,6 +1423,36 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | |||
1412 | } | 1423 | } |
1413 | #endif /* CONFIG_MOVABLE_NODE */ | 1424 | #endif /* CONFIG_MOVABLE_NODE */ |
1414 | 1425 | ||
1426 | static int __init cmdline_parse_movable_node(char *p) | ||
1427 | { | ||
1428 | #ifdef CONFIG_MOVABLE_NODE | ||
1429 | /* | ||
1430 | * Memory used by the kernel cannot be hot-removed because Linux | ||
1431 | * cannot migrate the kernel pages. When memory hotplug is | ||
1432 | * enabled, we should prevent memblock from allocating memory | ||
1433 | * for the kernel. | ||
1434 | * | ||
1435 | * ACPI SRAT records all hotpluggable memory ranges. But before | ||
1436 | * SRAT is parsed, we don't know about it. | ||
1437 | * | ||
1438 | * The kernel image is loaded into memory at very early time. We | ||
1439 | * cannot prevent this anyway. So on NUMA system, we set any | ||
1440 | * node the kernel resides in as un-hotpluggable. | ||
1441 | * | ||
1442 | * Since on modern servers, one node could have double-digit | ||
1443 | * gigabytes memory, we can assume the memory around the kernel | ||
1444 | * image is also un-hotpluggable. So before SRAT is parsed, just | ||
1445 | * allocate memory near the kernel image to try the best to keep | ||
1446 | * the kernel away from hotpluggable memory. | ||
1447 | */ | ||
1448 | memblock_set_bottom_up(true); | ||
1449 | #else | ||
1450 | pr_warn("movable_node option not supported\n"); | ||
1451 | #endif | ||
1452 | return 0; | ||
1453 | } | ||
1454 | early_param("movable_node", cmdline_parse_movable_node); | ||
1455 | |||
1415 | /* check which state of node_states will be changed when offline memory */ | 1456 | /* check which state of node_states will be changed when offline memory */ |
1416 | static void node_states_check_changes_offline(unsigned long nr_pages, | 1457 | static void node_states_check_changes_offline(unsigned long nr_pages, |
1417 | struct zone *zone, struct memory_notify *arg) | 1458 | struct zone *zone, struct memory_notify *arg) |
@@ -1702,7 +1743,7 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, | |||
1702 | } | 1743 | } |
1703 | 1744 | ||
1704 | #ifdef CONFIG_MEMORY_HOTREMOVE | 1745 | #ifdef CONFIG_MEMORY_HOTREMOVE |
1705 | static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) | 1746 | static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) |
1706 | { | 1747 | { |
1707 | int ret = !is_memblock_offlined(mem); | 1748 | int ret = !is_memblock_offlined(mem); |
1708 | 1749 | ||
@@ -1854,7 +1895,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) | |||
1854 | * if this is not the case. | 1895 | * if this is not the case. |
1855 | */ | 1896 | */ |
1856 | ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, | 1897 | ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, |
1857 | is_memblock_offlined_cb); | 1898 | check_memblock_offlined_cb); |
1858 | if (ret) { | 1899 | if (ret) { |
1859 | unlock_memory_hotplug(); | 1900 | unlock_memory_hotplug(); |
1860 | BUG(); | 1901 | BUG(); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 71cb253368cb..4cc19f6ab6c6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1125,7 +1125,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, | |||
1125 | tmp = *from; | 1125 | tmp = *from; |
1126 | while (!nodes_empty(tmp)) { | 1126 | while (!nodes_empty(tmp)) { |
1127 | int s,d; | 1127 | int s,d; |
1128 | int source = -1; | 1128 | int source = NUMA_NO_NODE; |
1129 | int dest = 0; | 1129 | int dest = 0; |
1130 | 1130 | ||
1131 | for_each_node_mask(s, tmp) { | 1131 | for_each_node_mask(s, tmp) { |
@@ -1160,7 +1160,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, | |||
1160 | if (!node_isset(dest, tmp)) | 1160 | if (!node_isset(dest, tmp)) |
1161 | break; | 1161 | break; |
1162 | } | 1162 | } |
1163 | if (source == -1) | 1163 | if (source == NUMA_NO_NODE) |
1164 | break; | 1164 | break; |
1165 | 1165 | ||
1166 | node_clear(source, tmp); | 1166 | node_clear(source, tmp); |
@@ -1835,7 +1835,7 @@ static unsigned offset_il_node(struct mempolicy *pol, | |||
1835 | unsigned nnodes = nodes_weight(pol->v.nodes); | 1835 | unsigned nnodes = nodes_weight(pol->v.nodes); |
1836 | unsigned target; | 1836 | unsigned target; |
1837 | int c; | 1837 | int c; |
1838 | int nid = -1; | 1838 | int nid = NUMA_NO_NODE; |
1839 | 1839 | ||
1840 | if (!nnodes) | 1840 | if (!nnodes) |
1841 | return numa_node_id(); | 1841 | return numa_node_id(); |
@@ -1872,11 +1872,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol, | |||
1872 | 1872 | ||
1873 | /* | 1873 | /* |
1874 | * Return the bit number of a random bit set in the nodemask. | 1874 | * Return the bit number of a random bit set in the nodemask. |
1875 | * (returns -1 if nodemask is empty) | 1875 | * (returns NUMA_NO_NODE if nodemask is empty) |
1876 | */ | 1876 | */ |
1877 | int node_random(const nodemask_t *maskp) | 1877 | int node_random(const nodemask_t *maskp) |
1878 | { | 1878 | { |
1879 | int w, bit = -1; | 1879 | int w, bit = NUMA_NO_NODE; |
1880 | 1880 | ||
1881 | w = nodes_weight(*maskp); | 1881 | w = nodes_weight(*maskp); |
1882 | if (w) | 1882 | if (w) |
@@ -2914,62 +2914,45 @@ out: | |||
2914 | * @maxlen: length of @buffer | 2914 | * @maxlen: length of @buffer |
2915 | * @pol: pointer to mempolicy to be formatted | 2915 | * @pol: pointer to mempolicy to be formatted |
2916 | * | 2916 | * |
2917 | * Convert a mempolicy into a string. | 2917 | * Convert @pol into a string. If @buffer is too short, truncate the string. |
2918 | * Returns the number of characters in buffer (if positive) | 2918 | * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the |
2919 | * or an error (negative) | 2919 | * longest flag, "relative", and to display at least a few node ids. |
2920 | */ | 2920 | */ |
2921 | int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | 2921 | void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) |
2922 | { | 2922 | { |
2923 | char *p = buffer; | 2923 | char *p = buffer; |
2924 | int l; | 2924 | nodemask_t nodes = NODE_MASK_NONE; |
2925 | nodemask_t nodes; | 2925 | unsigned short mode = MPOL_DEFAULT; |
2926 | unsigned short mode; | 2926 | unsigned short flags = 0; |
2927 | unsigned short flags = pol ? pol->flags : 0; | ||
2928 | |||
2929 | /* | ||
2930 | * Sanity check: room for longest mode, flag and some nodes | ||
2931 | */ | ||
2932 | VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); | ||
2933 | 2927 | ||
2934 | if (!pol || pol == &default_policy) | 2928 | if (pol && pol != &default_policy) { |
2935 | mode = MPOL_DEFAULT; | ||
2936 | else | ||
2937 | mode = pol->mode; | 2929 | mode = pol->mode; |
2930 | flags = pol->flags; | ||
2931 | } | ||
2938 | 2932 | ||
2939 | switch (mode) { | 2933 | switch (mode) { |
2940 | case MPOL_DEFAULT: | 2934 | case MPOL_DEFAULT: |
2941 | nodes_clear(nodes); | ||
2942 | break; | 2935 | break; |
2943 | |||
2944 | case MPOL_PREFERRED: | 2936 | case MPOL_PREFERRED: |
2945 | nodes_clear(nodes); | ||
2946 | if (flags & MPOL_F_LOCAL) | 2937 | if (flags & MPOL_F_LOCAL) |
2947 | mode = MPOL_LOCAL; | 2938 | mode = MPOL_LOCAL; |
2948 | else | 2939 | else |
2949 | node_set(pol->v.preferred_node, nodes); | 2940 | node_set(pol->v.preferred_node, nodes); |
2950 | break; | 2941 | break; |
2951 | |||
2952 | case MPOL_BIND: | 2942 | case MPOL_BIND: |
2953 | /* Fall through */ | ||
2954 | case MPOL_INTERLEAVE: | 2943 | case MPOL_INTERLEAVE: |
2955 | nodes = pol->v.nodes; | 2944 | nodes = pol->v.nodes; |
2956 | break; | 2945 | break; |
2957 | |||
2958 | default: | 2946 | default: |
2959 | return -EINVAL; | 2947 | WARN_ON_ONCE(1); |
2948 | snprintf(p, maxlen, "unknown"); | ||
2949 | return; | ||
2960 | } | 2950 | } |
2961 | 2951 | ||
2962 | l = strlen(policy_modes[mode]); | 2952 | p += snprintf(p, maxlen, policy_modes[mode]); |
2963 | if (buffer + maxlen < p + l + 1) | ||
2964 | return -ENOSPC; | ||
2965 | |||
2966 | strcpy(p, policy_modes[mode]); | ||
2967 | p += l; | ||
2968 | 2953 | ||
2969 | if (flags & MPOL_MODE_FLAGS) { | 2954 | if (flags & MPOL_MODE_FLAGS) { |
2970 | if (buffer + maxlen < p + 2) | 2955 | p += snprintf(p, buffer + maxlen - p, "="); |
2971 | return -ENOSPC; | ||
2972 | *p++ = '='; | ||
2973 | 2956 | ||
2974 | /* | 2957 | /* |
2975 | * Currently, the only defined flags are mutually exclusive | 2958 | * Currently, the only defined flags are mutually exclusive |
@@ -2981,10 +2964,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | |||
2981 | } | 2964 | } |
2982 | 2965 | ||
2983 | if (!nodes_empty(nodes)) { | 2966 | if (!nodes_empty(nodes)) { |
2984 | if (buffer + maxlen < p + 2) | 2967 | p += snprintf(p, buffer + maxlen - p, ":"); |
2985 | return -ENOSPC; | ||
2986 | *p++ = ':'; | ||
2987 | p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); | 2968 | p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); |
2988 | } | 2969 | } |
2989 | return p - buffer; | ||
2990 | } | 2970 | } |
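With mpol_to_str() now returning void and truncating on overflow rather than failing with -ENOSPC, callers no longer have a result to check. A hedged caller sketch, assuming a seq_file *m and a struct mempolicy *pol from the surrounding context (not part of this patch):

	char buffer[64];	/* comfortably above the recommended 32 bytes */

	mpol_to_str(buffer, sizeof(buffer), pol);	/* cannot fail, may truncate */
	seq_printf(m, "%s\n", buffer);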
diff --git a/mm/mmap.c b/mm/mmap.c
@@ -179,14 +179,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
179 | goto error; | 179 | goto error; |
180 | } | 180 | } |
181 | 181 | ||
182 | allowed = (totalram_pages - hugetlb_total_pages()) | 182 | allowed = vm_commit_limit(); |
183 | * sysctl_overcommit_ratio / 100; | ||
184 | /* | 183 | /* |
185 | * Reserve some for root | 184 | * Reserve some for root |
186 | */ | 185 | */ |
187 | if (!cap_sys_admin) | 186 | if (!cap_sys_admin) |
188 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); | 187 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
189 | allowed += total_swap_pages; | ||
190 | 188 | ||
191 | /* | 189 | /* |
192 | * Don't let a single process grow so big a user can't recover | 190 | * Don't let a single process grow so big a user can't recover |
@@ -1856,7 +1854,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
1856 | struct vm_area_struct *vma; | 1854 | struct vm_area_struct *vma; |
1857 | struct vm_unmapped_area_info info; | 1855 | struct vm_unmapped_area_info info; |
1858 | 1856 | ||
1859 | if (len > TASK_SIZE) | 1857 | if (len > TASK_SIZE - mmap_min_addr) |
1860 | return -ENOMEM; | 1858 | return -ENOMEM; |
1861 | 1859 | ||
1862 | if (flags & MAP_FIXED) | 1860 | if (flags & MAP_FIXED) |
@@ -1865,14 +1863,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
1865 | if (addr) { | 1863 | if (addr) { |
1866 | addr = PAGE_ALIGN(addr); | 1864 | addr = PAGE_ALIGN(addr); |
1867 | vma = find_vma(mm, addr); | 1865 | vma = find_vma(mm, addr); |
1868 | if (TASK_SIZE - len >= addr && | 1866 | if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && |
1869 | (!vma || addr + len <= vma->vm_start)) | 1867 | (!vma || addr + len <= vma->vm_start)) |
1870 | return addr; | 1868 | return addr; |
1871 | } | 1869 | } |
1872 | 1870 | ||
1873 | info.flags = 0; | 1871 | info.flags = 0; |
1874 | info.length = len; | 1872 | info.length = len; |
1875 | info.low_limit = TASK_UNMAPPED_BASE; | 1873 | info.low_limit = mm->mmap_base; |
1876 | info.high_limit = TASK_SIZE; | 1874 | info.high_limit = TASK_SIZE; |
1877 | info.align_mask = 0; | 1875 | info.align_mask = 0; |
1878 | return vm_unmapped_area(&info); | 1876 | return vm_unmapped_area(&info); |
@@ -1895,7 +1893,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1895 | struct vm_unmapped_area_info info; | 1893 | struct vm_unmapped_area_info info; |
1896 | 1894 | ||
1897 | /* requested length too big for entire address space */ | 1895 | /* requested length too big for entire address space */ |
1898 | if (len > TASK_SIZE) | 1896 | if (len > TASK_SIZE - mmap_min_addr) |
1899 | return -ENOMEM; | 1897 | return -ENOMEM; |
1900 | 1898 | ||
1901 | if (flags & MAP_FIXED) | 1899 | if (flags & MAP_FIXED) |
@@ -1905,14 +1903,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1905 | if (addr) { | 1903 | if (addr) { |
1906 | addr = PAGE_ALIGN(addr); | 1904 | addr = PAGE_ALIGN(addr); |
1907 | vma = find_vma(mm, addr); | 1905 | vma = find_vma(mm, addr); |
1908 | if (TASK_SIZE - len >= addr && | 1906 | if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && |
1909 | (!vma || addr + len <= vma->vm_start)) | 1907 | (!vma || addr + len <= vma->vm_start)) |
1910 | return addr; | 1908 | return addr; |
1911 | } | 1909 | } |
1912 | 1910 | ||
1913 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; | 1911 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
1914 | info.length = len; | 1912 | info.length = len; |
1915 | info.low_limit = PAGE_SIZE; | 1913 | info.low_limit = max(PAGE_SIZE, mmap_min_addr); |
1916 | info.high_limit = mm->mmap_base; | 1914 | info.high_limit = mm->mmap_base; |
1917 | info.align_mask = 0; | 1915 | info.align_mask = 0; |
1918 | addr = vm_unmapped_area(&info); | 1916 | addr = vm_unmapped_area(&info); |
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a597f2ffcd6f..26667971c824 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -112,6 +112,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | |||
112 | pmd_t *pmd; | 112 | pmd_t *pmd; |
113 | unsigned long next; | 113 | unsigned long next; |
114 | unsigned long pages = 0; | 114 | unsigned long pages = 0; |
115 | unsigned long nr_huge_updates = 0; | ||
115 | 116 | ||
116 | pmd = pmd_offset(pud, addr); | 117 | pmd = pmd_offset(pud, addr); |
117 | do { | 118 | do { |
@@ -126,9 +127,10 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | |||
126 | newprot, prot_numa); | 127 | newprot, prot_numa); |
127 | 128 | ||
128 | if (nr_ptes) { | 129 | if (nr_ptes) { |
129 | if (nr_ptes == HPAGE_PMD_NR) | 130 | if (nr_ptes == HPAGE_PMD_NR) { |
130 | pages++; | 131 | pages += HPAGE_PMD_NR; |
131 | 132 | nr_huge_updates++; | |
133 | } | ||
132 | continue; | 134 | continue; |
133 | } | 135 | } |
134 | } | 136 | } |
@@ -141,6 +143,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | |||
141 | pages += this_pages; | 143 | pages += this_pages; |
142 | } while (pmd++, addr = next, addr != end); | 144 | } while (pmd++, addr = next, addr != end); |
143 | 145 | ||
146 | if (nr_huge_updates) | ||
147 | count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); | ||
144 | return pages; | 148 | return pages; |
145 | } | 149 | } |
146 | 150 | ||
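The change_pmd_range() hunk now credits a successfully protected transparent huge page as HPAGE_PMD_NR page updates and counts it in nr_huge_updates, which is flushed into the new NUMA_HUGE_PTE_UPDATES vmstat counter once at the end of the walk. A toy version of that accumulation follows; HPAGE_PMD_NR is assumed to be 512 (2 MB huge pages with 4 KB base pages), the branch structure is simplified, and the vmstat call is replaced by a printf.

#include <stdio.h>

#define HPAGE_PMD_NR 512   /* assumed: 2 MB THP with 4 KB base pages */

int main(void)
{
    /* pretend results of the huge-PMD path for five PMDs: HPAGE_PMD_NR
     * means the whole huge page was protected in one go */
    int nr_ptes_results[] = { 512, 0, 512, 3, 512 };
    unsigned long pages = 0, nr_huge_updates = 0;

    for (unsigned int i = 0; i < sizeof(nr_ptes_results) / sizeof(*nr_ptes_results); i++) {
        int nr_ptes = nr_ptes_results[i];

        if (nr_ptes == HPAGE_PMD_NR) {
            pages += HPAGE_PMD_NR;   /* count every base page, not just one */
            nr_huge_updates++;
        } else {
            pages += nr_ptes;        /* pte-level updates (simplified) */
        }
    }

    if (nr_huge_updates)
        printf("NUMA_HUGE_PTE_UPDATES += %lu, pages = %lu\n",
               nr_huge_updates, pages);
    return 0;
}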
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 61107cf55bb3..2c254d374655 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -82,27 +82,18 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) | |||
82 | 82 | ||
83 | static void __init __free_pages_memory(unsigned long start, unsigned long end) | 83 | static void __init __free_pages_memory(unsigned long start, unsigned long end) |
84 | { | 84 | { |
85 | unsigned long i, start_aligned, end_aligned; | 85 | int order; |
86 | int order = ilog2(BITS_PER_LONG); | ||
87 | 86 | ||
88 | start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); | 87 | while (start < end) { |
89 | end_aligned = end & ~(BITS_PER_LONG - 1); | 88 | order = min(MAX_ORDER - 1UL, __ffs(start)); |
90 | 89 | ||
91 | if (end_aligned <= start_aligned) { | 90 | while (start + (1UL << order) > end) |
92 | for (i = start; i < end; i++) | 91 | order--; |
93 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
94 | 92 | ||
95 | return; | 93 | __free_pages_bootmem(pfn_to_page(start), order); |
96 | } | ||
97 | |||
98 | for (i = start; i < start_aligned; i++) | ||
99 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
100 | 94 | ||
101 | for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) | 95 | start += (1UL << order); |
102 | __free_pages_bootmem(pfn_to_page(i), order); | 96 | } |
103 | |||
104 | for (i = end_aligned; i < end; i++) | ||
105 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
106 | } | 97 | } |
107 | 98 | ||
108 | static unsigned long __init __free_memory_core(phys_addr_t start, | 99 | static unsigned long __init __free_memory_core(phys_addr_t start, |
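The rewritten __free_pages_memory() walks the PFN range and at each step frees the largest buddy block that is both naturally aligned at the current PFN (taken from __ffs(start)) and still fits before end, replacing the old BITS_PER_LONG head/body/tail split. The sketch below reproduces just the chunking loop in userspace, using __builtin_ctzl in place of the kernel's __ffs, printing (pfn, order) pairs instead of calling __free_pages_bootmem(), and assuming MAX_ORDER is 11.

#include <stdio.h>

#define MAX_ORDER 11UL   /* assumed default; the real value is per-config */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

int main(void)
{
    unsigned long start = 5, end = 1029;   /* arbitrary pfn range, start != 0 */

    while (start < end) {
        /* largest order allowed by the alignment of 'start' ... */
        unsigned long order = min_ul(MAX_ORDER - 1,
                                     (unsigned long)__builtin_ctzl(start));

        /* ... shrunk until the block also fits before 'end' */
        while (start + (1UL << order) > end)
            order--;

        printf("free pfn %lu order %lu (%lu pages)\n", start, order, 1UL << order);
        start += 1UL << order;
    }
    return 0;
}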
diff --git a/mm/nommu.c b/mm/nommu.c index 9e6cb02cba64..fec093adad9c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1948,13 +1948,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1948 | goto error; | 1948 | goto error; |
1949 | } | 1949 | } |
1950 | 1950 | ||
1951 | allowed = totalram_pages * sysctl_overcommit_ratio / 100; | 1951 | allowed = vm_commit_limit(); |
1952 | /* | 1952 | /* |
1953 | * Reserve some 3% for root | 1953 | * Reserve some 3% for root |
1954 | */ | 1954 | */ |
1955 | if (!cap_sys_admin) | 1955 | if (!cap_sys_admin) |
1956 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); | 1956 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
1957 | allowed += total_swap_pages; | ||
1958 | 1957 | ||
1959 | /* | 1958 | /* |
1960 | * Don't let a single process grow so big a user can't recover | 1959 | * Don't let a single process grow so big a user can't recover |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 73d812f16dde..580a5f075ed0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -234,8 +234,8 @@ int page_group_by_mobility_disabled __read_mostly; | |||
234 | 234 | ||
235 | void set_pageblock_migratetype(struct page *page, int migratetype) | 235 | void set_pageblock_migratetype(struct page *page, int migratetype) |
236 | { | 236 | { |
237 | 237 | if (unlikely(page_group_by_mobility_disabled && | |
238 | if (unlikely(page_group_by_mobility_disabled)) | 238 | migratetype < MIGRATE_PCPTYPES)) |
239 | migratetype = MIGRATE_UNMOVABLE; | 239 | migratetype = MIGRATE_UNMOVABLE; |
240 | 240 | ||
241 | set_pageblock_flags_group(page, (unsigned long)migratetype, | 241 | set_pageblock_flags_group(page, (unsigned long)migratetype, |
@@ -1027,6 +1027,10 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
1027 | { | 1027 | { |
1028 | int current_order = page_order(page); | 1028 | int current_order = page_order(page); |
1029 | 1029 | ||
1030 | /* | ||
1031 | * When borrowing from MIGRATE_CMA, we need to release the excess | ||
1032 | * buddy pages to CMA itself. | ||
1033 | */ | ||
1030 | if (is_migrate_cma(fallback_type)) | 1034 | if (is_migrate_cma(fallback_type)) |
1031 | return fallback_type; | 1035 | return fallback_type; |
1032 | 1036 | ||
@@ -1091,21 +1095,11 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
1091 | list_del(&page->lru); | 1095 | list_del(&page->lru); |
1092 | rmv_page_order(page); | 1096 | rmv_page_order(page); |
1093 | 1097 | ||
1094 | /* | ||
1095 | * Borrow the excess buddy pages as well, irrespective | ||
1096 | * of whether we stole freepages, or took ownership of | ||
1097 | * the pageblock or not. | ||
1098 | * | ||
1099 | * Exception: When borrowing from MIGRATE_CMA, release | ||
1100 | * the excess buddy pages to CMA itself. | ||
1101 | */ | ||
1102 | expand(zone, page, order, current_order, area, | 1098 | expand(zone, page, order, current_order, area, |
1103 | is_migrate_cma(migratetype) | 1099 | new_type); |
1104 | ? migratetype : start_migratetype); | ||
1105 | 1100 | ||
1106 | trace_mm_page_alloc_extfrag(page, order, | 1101 | trace_mm_page_alloc_extfrag(page, order, current_order, |
1107 | current_order, start_migratetype, migratetype, | 1102 | start_migratetype, migratetype, new_type); |
1108 | new_type == start_migratetype); | ||
1109 | 1103 | ||
1110 | return page; | 1104 | return page; |
1111 | } | 1105 | } |
@@ -1711,7 +1705,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | |||
1711 | * comments in mmzone.h. Reduces cache footprint of zonelist scans | 1705 | * comments in mmzone.h. Reduces cache footprint of zonelist scans |
1712 | * that have to skip over a lot of full or unallowed zones. | 1706 | * that have to skip over a lot of full or unallowed zones. |
1713 | * | 1707 | * |
1714 | * If the zonelist cache is present in the passed in zonelist, then | 1708 | * If the zonelist cache is present in the passed zonelist, then |
1715 | * returns a pointer to the allowed node mask (either the current | 1709 | * returns a pointer to the allowed node mask (either the current |
1716 | * tasks mems_allowed, or node_states[N_MEMORY].) | 1710 | * tasks mems_allowed, or node_states[N_MEMORY].) |
1717 | * | 1711 | * |
@@ -2593,7 +2587,7 @@ rebalance: | |||
2593 | * running out of options and have to consider going OOM | 2587 | * running out of options and have to consider going OOM |
2594 | */ | 2588 | */ |
2595 | if (!did_some_progress) { | 2589 | if (!did_some_progress) { |
2596 | if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | 2590 | if (oom_gfp_allowed(gfp_mask)) { |
2597 | if (oom_killer_disabled) | 2591 | if (oom_killer_disabled) |
2598 | goto nopage; | 2592 | goto nopage; |
2599 | /* Coredumps can quickly deplete all memory reserves */ | 2593 | /* Coredumps can quickly deplete all memory reserves */ |
@@ -3881,8 +3875,6 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
3881 | return ffz(~size); | 3875 | return ffz(~size); |
3882 | } | 3876 | } |
3883 | 3877 | ||
3884 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | ||
3885 | |||
3886 | /* | 3878 | /* |
3887 | * Check if a pageblock contains reserved pages | 3879 | * Check if a pageblock contains reserved pages |
3888 | */ | 3880 | */ |
@@ -4266,7 +4258,7 @@ static __meminit void zone_pcp_init(struct zone *zone) | |||
4266 | */ | 4258 | */ |
4267 | zone->pageset = &boot_pageset; | 4259 | zone->pageset = &boot_pageset; |
4268 | 4260 | ||
4269 | if (zone->present_pages) | 4261 | if (populated_zone(zone)) |
4270 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", | 4262 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", |
4271 | zone->name, zone->present_pages, | 4263 | zone->name, zone->present_pages, |
4272 | zone_batchsize(zone)); | 4264 | zone_batchsize(zone)); |
@@ -5160,7 +5152,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) | |||
5160 | 5152 | ||
5161 | for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { | 5153 | for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { |
5162 | struct zone *zone = &pgdat->node_zones[zone_type]; | 5154 | struct zone *zone = &pgdat->node_zones[zone_type]; |
5163 | if (zone->present_pages) { | 5155 | if (populated_zone(zone)) { |
5164 | node_set_state(nid, N_HIGH_MEMORY); | 5156 | node_set_state(nid, N_HIGH_MEMORY); |
5165 | if (N_NORMAL_MEMORY != N_HIGH_MEMORY && | 5157 | if (N_NORMAL_MEMORY != N_HIGH_MEMORY && |
5166 | zone_type <= ZONE_NORMAL) | 5158 | zone_type <= ZONE_NORMAL) |
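The slow-path hunk replaces the open-coded (gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY) test with the new oom_gfp_allowed() helper. A tiny sketch of the same predicate, with the two GFP bits given illustrative values rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

/* illustrative bit values only */
#define GFP_FS      (1u << 0)
#define GFP_NORETRY (1u << 1)

/* The OOM killer is only considered for allocations that may do fs
 * reclaim and that have not asked to bail out early. */
static bool oom_gfp_allowed(unsigned int gfp_mask)
{
    return (gfp_mask & GFP_FS) && !(gfp_mask & GFP_NORETRY);
}

int main(void)
{
    printf("%d\n", oom_gfp_allowed(GFP_FS));               /* 1 */
    printf("%d\n", oom_gfp_allowed(GFP_FS | GFP_NORETRY)); /* 0 */
    printf("%d\n", oom_gfp_allowed(0));                    /* 0 */
    return 0;
}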
diff --git a/mm/readahead.c b/mm/readahead.c index e4ed04149785..7cdbb44aa90b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -401,6 +401,7 @@ ondemand_readahead(struct address_space *mapping, | |||
401 | unsigned long req_size) | 401 | unsigned long req_size) |
402 | { | 402 | { |
403 | unsigned long max = max_sane_readahead(ra->ra_pages); | 403 | unsigned long max = max_sane_readahead(ra->ra_pages); |
404 | pgoff_t prev_offset; | ||
404 | 405 | ||
405 | /* | 406 | /* |
406 | * start of file | 407 | * start of file |
@@ -452,8 +453,11 @@ ondemand_readahead(struct address_space *mapping, | |||
452 | 453 | ||
453 | /* | 454 | /* |
454 | * sequential cache miss | 455 | * sequential cache miss |
456 | * trivial case: (offset - prev_offset) == 1 | ||
457 | * unaligned reads: (offset - prev_offset) == 0 | ||
455 | */ | 458 | */ |
456 | if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) | 459 | prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT; |
460 | if (offset - prev_offset <= 1UL) | ||
457 | goto initial_readahead; | 461 | goto initial_readahead; |
458 | 462 | ||
459 | /* | 463 | /* |
@@ -569,7 +573,7 @@ static ssize_t | |||
569 | do_readahead(struct address_space *mapping, struct file *filp, | 573 | do_readahead(struct address_space *mapping, struct file *filp, |
570 | pgoff_t index, unsigned long nr) | 574 | pgoff_t index, unsigned long nr) |
571 | { | 575 | { |
572 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | 576 | if (!mapping || !mapping->a_ops) |
573 | return -EINVAL; | 577 | return -EINVAL; |
574 | 578 | ||
575 | force_page_cache_readahead(mapping, filp, index, nr); | 579 | force_page_cache_readahead(mapping, filp, index, nr); |
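The sequential cache-miss hunk converts ra->prev_pos (a byte position) into a page index up front and compares page indexes directly, with the added comment spelling out that a distance of 1 is the trivial next-page case and a distance of 0 covers unaligned reads within the same page. A small sketch of that test, assuming PAGE_CACHE_SHIFT is 12 (4 KB pages):

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12   /* assumed 4 KB pages */

/* A new read lands at most one page index past the previous read
 * position: 0 covers unaligned reads inside the same page, 1 the
 * trivial next-page case. */
static int looks_sequential(unsigned long offset, long long prev_pos)
{
    unsigned long prev_offset = (unsigned long long)prev_pos >> PAGE_CACHE_SHIFT;

    return offset - prev_offset <= 1UL;
}

int main(void)
{
    printf("%d\n", looks_sequential(10, 10LL << PAGE_CACHE_SHIFT)); /* same page: 1 */
    printf("%d\n", looks_sequential(11, 10LL << PAGE_CACHE_SHIFT)); /* next page: 1 */
    printf("%d\n", looks_sequential(50, 10LL << PAGE_CACHE_SHIFT)); /* far jump: 0 */
    return 0;
}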
@@ -3982,7 +3982,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
3982 | 3982 | ||
3983 | VM_BUG_ON(!mutex_is_locked(&slab_mutex)); | 3983 | VM_BUG_ON(!mutex_is_locked(&slab_mutex)); |
3984 | for_each_memcg_cache_index(i) { | 3984 | for_each_memcg_cache_index(i) { |
3985 | c = cache_from_memcg(cachep, i); | 3985 | c = cache_from_memcg_idx(cachep, i); |
3986 | if (c) | 3986 | if (c) |
3987 | /* return value determined by the parent cache only */ | 3987 | /* return value determined by the parent cache only */ |
3988 | __do_tune_cpucache(c, limit, batchcount, shared, gfp); | 3988 | __do_tune_cpucache(c, limit, batchcount, shared, gfp); |
@@ -160,7 +160,8 @@ static inline const char *cache_name(struct kmem_cache *s) | |||
160 | return s->name; | 160 | return s->name; |
161 | } | 161 | } |
162 | 162 | ||
163 | static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) | 163 | static inline struct kmem_cache * |
164 | cache_from_memcg_idx(struct kmem_cache *s, int idx) | ||
164 | { | 165 | { |
165 | if (!s->memcg_params) | 166 | if (!s->memcg_params) |
166 | return NULL; | 167 | return NULL; |
@@ -204,7 +205,8 @@ static inline const char *cache_name(struct kmem_cache *s) | |||
204 | return s->name; | 205 | return s->name; |
205 | } | 206 | } |
206 | 207 | ||
207 | static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) | 208 | static inline struct kmem_cache * |
209 | cache_from_memcg_idx(struct kmem_cache *s, int idx) | ||
208 | { | 210 | { |
209 | return NULL; | 211 | return NULL; |
210 | } | 212 | } |
diff --git a/mm/slab_common.c b/mm/slab_common.c index e2e98af703ea..0b7bb399b0e4 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -571,7 +571,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) | |||
571 | return; | 571 | return; |
572 | 572 | ||
573 | for_each_memcg_cache_index(i) { | 573 | for_each_memcg_cache_index(i) { |
574 | c = cache_from_memcg(s, i); | 574 | c = cache_from_memcg_idx(s, i); |
575 | if (!c) | 575 | if (!c) |
576 | continue; | 576 | continue; |
577 | 577 | ||
@@ -4983,7 +4983,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, | |||
4983 | * through the descendants with best-effort propagation. | 4983 | * through the descendants with best-effort propagation. |
4984 | */ | 4984 | */ |
4985 | for_each_memcg_cache_index(i) { | 4985 | for_each_memcg_cache_index(i) { |
4986 | struct kmem_cache *c = cache_from_memcg(s, i); | 4986 | struct kmem_cache *c = cache_from_memcg_idx(s, i); |
4987 | if (c) | 4987 | if (c) |
4988 | attribute->store(c, buf, len); | 4988 | attribute->store(c, buf, len); |
4989 | } | 4989 | } |
diff --git a/mm/sparse.c b/mm/sparse.c index 4ac1d7ef548f..8cc7be0e9590 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -590,33 +590,32 @@ void __init sparse_init(void) | |||
590 | 590 | ||
591 | #ifdef CONFIG_MEMORY_HOTPLUG | 591 | #ifdef CONFIG_MEMORY_HOTPLUG |
592 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | 592 | #ifdef CONFIG_SPARSEMEM_VMEMMAP |
593 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | 593 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) |
594 | unsigned long nr_pages) | ||
595 | { | 594 | { |
596 | /* This will make the necessary allocations eventually. */ | 595 | /* This will make the necessary allocations eventually. */ |
597 | return sparse_mem_map_populate(pnum, nid); | 596 | return sparse_mem_map_populate(pnum, nid); |
598 | } | 597 | } |
599 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | 598 | static void __kfree_section_memmap(struct page *memmap) |
600 | { | 599 | { |
601 | unsigned long start = (unsigned long)memmap; | 600 | unsigned long start = (unsigned long)memmap; |
602 | unsigned long end = (unsigned long)(memmap + nr_pages); | 601 | unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); |
603 | 602 | ||
604 | vmemmap_free(start, end); | 603 | vmemmap_free(start, end); |
605 | } | 604 | } |
606 | #ifdef CONFIG_MEMORY_HOTREMOVE | 605 | #ifdef CONFIG_MEMORY_HOTREMOVE |
607 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) | 606 | static void free_map_bootmem(struct page *memmap) |
608 | { | 607 | { |
609 | unsigned long start = (unsigned long)memmap; | 608 | unsigned long start = (unsigned long)memmap; |
610 | unsigned long end = (unsigned long)(memmap + nr_pages); | 609 | unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); |
611 | 610 | ||
612 | vmemmap_free(start, end); | 611 | vmemmap_free(start, end); |
613 | } | 612 | } |
614 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 613 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
615 | #else | 614 | #else |
616 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | 615 | static struct page *__kmalloc_section_memmap(void) |
617 | { | 616 | { |
618 | struct page *page, *ret; | 617 | struct page *page, *ret; |
619 | unsigned long memmap_size = sizeof(struct page) * nr_pages; | 618 | unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION; |
620 | 619 | ||
621 | page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); | 620 | page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); |
622 | if (page) | 621 | if (page) |
@@ -634,28 +633,30 @@ got_map_ptr: | |||
634 | return ret; | 633 | return ret; |
635 | } | 634 | } |
636 | 635 | ||
637 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | 636 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) |
638 | unsigned long nr_pages) | ||
639 | { | 637 | { |
640 | return __kmalloc_section_memmap(nr_pages); | 638 | return __kmalloc_section_memmap(); |
641 | } | 639 | } |
642 | 640 | ||
643 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | 641 | static void __kfree_section_memmap(struct page *memmap) |
644 | { | 642 | { |
645 | if (is_vmalloc_addr(memmap)) | 643 | if (is_vmalloc_addr(memmap)) |
646 | vfree(memmap); | 644 | vfree(memmap); |
647 | else | 645 | else |
648 | free_pages((unsigned long)memmap, | 646 | free_pages((unsigned long)memmap, |
649 | get_order(sizeof(struct page) * nr_pages)); | 647 | get_order(sizeof(struct page) * PAGES_PER_SECTION)); |
650 | } | 648 | } |
651 | 649 | ||
652 | #ifdef CONFIG_MEMORY_HOTREMOVE | 650 | #ifdef CONFIG_MEMORY_HOTREMOVE |
653 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) | 651 | static void free_map_bootmem(struct page *memmap) |
654 | { | 652 | { |
655 | unsigned long maps_section_nr, removing_section_nr, i; | 653 | unsigned long maps_section_nr, removing_section_nr, i; |
656 | unsigned long magic; | 654 | unsigned long magic, nr_pages; |
657 | struct page *page = virt_to_page(memmap); | 655 | struct page *page = virt_to_page(memmap); |
658 | 656 | ||
657 | nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) | ||
658 | >> PAGE_SHIFT; | ||
659 | |||
659 | for (i = 0; i < nr_pages; i++, page++) { | 660 | for (i = 0; i < nr_pages; i++, page++) { |
660 | magic = (unsigned long) page->lru.next; | 661 | magic = (unsigned long) page->lru.next; |
661 | 662 | ||
@@ -684,8 +685,7 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) | |||
684 | * set. If this is <=0, then that means that the passed-in | 685 | * set. If this is <=0, then that means that the passed-in |
685 | * map was not consumed and must be freed. | 686 | * map was not consumed and must be freed. |
686 | */ | 687 | */ |
687 | int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | 688 | int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) |
688 | int nr_pages) | ||
689 | { | 689 | { |
690 | unsigned long section_nr = pfn_to_section_nr(start_pfn); | 690 | unsigned long section_nr = pfn_to_section_nr(start_pfn); |
691 | struct pglist_data *pgdat = zone->zone_pgdat; | 691 | struct pglist_data *pgdat = zone->zone_pgdat; |
@@ -702,12 +702,12 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
702 | ret = sparse_index_init(section_nr, pgdat->node_id); | 702 | ret = sparse_index_init(section_nr, pgdat->node_id); |
703 | if (ret < 0 && ret != -EEXIST) | 703 | if (ret < 0 && ret != -EEXIST) |
704 | return ret; | 704 | return ret; |
705 | memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); | 705 | memmap = kmalloc_section_memmap(section_nr, pgdat->node_id); |
706 | if (!memmap) | 706 | if (!memmap) |
707 | return -ENOMEM; | 707 | return -ENOMEM; |
708 | usemap = __kmalloc_section_usemap(); | 708 | usemap = __kmalloc_section_usemap(); |
709 | if (!usemap) { | 709 | if (!usemap) { |
710 | __kfree_section_memmap(memmap, nr_pages); | 710 | __kfree_section_memmap(memmap); |
711 | return -ENOMEM; | 711 | return -ENOMEM; |
712 | } | 712 | } |
713 | 713 | ||
@@ -719,7 +719,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
719 | goto out; | 719 | goto out; |
720 | } | 720 | } |
721 | 721 | ||
722 | memset(memmap, 0, sizeof(struct page) * nr_pages); | 722 | memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); |
723 | 723 | ||
724 | ms->section_mem_map |= SECTION_MARKED_PRESENT; | 724 | ms->section_mem_map |= SECTION_MARKED_PRESENT; |
725 | 725 | ||
@@ -729,7 +729,7 @@ out: | |||
729 | pgdat_resize_unlock(pgdat, &flags); | 729 | pgdat_resize_unlock(pgdat, &flags); |
730 | if (ret <= 0) { | 730 | if (ret <= 0) { |
731 | kfree(usemap); | 731 | kfree(usemap); |
732 | __kfree_section_memmap(memmap, nr_pages); | 732 | __kfree_section_memmap(memmap); |
733 | } | 733 | } |
734 | return ret; | 734 | return ret; |
735 | } | 735 | } |
@@ -759,7 +759,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | |||
759 | static void free_section_usemap(struct page *memmap, unsigned long *usemap) | 759 | static void free_section_usemap(struct page *memmap, unsigned long *usemap) |
760 | { | 760 | { |
761 | struct page *usemap_page; | 761 | struct page *usemap_page; |
762 | unsigned long nr_pages; | ||
763 | 762 | ||
764 | if (!usemap) | 763 | if (!usemap) |
765 | return; | 764 | return; |
@@ -771,7 +770,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) | |||
771 | if (PageSlab(usemap_page) || PageCompound(usemap_page)) { | 770 | if (PageSlab(usemap_page) || PageCompound(usemap_page)) { |
772 | kfree(usemap); | 771 | kfree(usemap); |
773 | if (memmap) | 772 | if (memmap) |
774 | __kfree_section_memmap(memmap, PAGES_PER_SECTION); | 773 | __kfree_section_memmap(memmap); |
775 | return; | 774 | return; |
776 | } | 775 | } |
777 | 776 | ||
@@ -780,12 +779,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) | |||
780 | * on the section which has pgdat at boot time. Just keep it as is now. | 779 | * on the section which has pgdat at boot time. Just keep it as is now. |
781 | */ | 780 | */ |
782 | 781 | ||
783 | if (memmap) { | 782 | if (memmap) |
784 | nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) | 783 | free_map_bootmem(memmap); |
785 | >> PAGE_SHIFT; | ||
786 | |||
787 | free_map_bootmem(memmap, nr_pages); | ||
788 | } | ||
789 | } | 784 | } |
790 | 785 | ||
791 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | 786 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) |
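Every sparse.c hunk drops a nr_pages argument because memory is hot-added one section at a time, so the sizes can be derived from PAGES_PER_SECTION; free_map_bootmem() now recomputes the number of memmap pages itself as PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) >> PAGE_SHIFT. The sketch below runs that arithmetic with illustrative values (the real constants and sizeof(struct page) depend on the architecture and config):

#include <stdio.h>

#define PAGE_SHIFT        12
#define PAGE_SIZE         (1UL << PAGE_SHIFT)
#define PAGES_PER_SECTION (1UL << 15)   /* assumed: 128 MB sections, 4 KB pages */
#define PAGE_ALIGN(x)     (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
    unsigned long struct_page_size = 64;   /* assumed sizeof(struct page) */
    unsigned long memmap_bytes = PAGES_PER_SECTION * struct_page_size;

    /* pages backing one section's memmap, as recomputed in free_map_bootmem() */
    unsigned long nr_pages = PAGE_ALIGN(memmap_bytes) >> PAGE_SHIFT;

    printf("memmap for one section: %lu bytes = %lu pages\n", memmap_bytes, nr_pages);
    return 0;
}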
diff --git a/mm/swapfile.c b/mm/swapfile.c index de7c904e52e5..612a7c9795f6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -707,7 +707,7 @@ noswap: | |||
707 | return (swp_entry_t) {0}; | 707 | return (swp_entry_t) {0}; |
708 | } | 708 | } |
709 | 709 | ||
710 | /* The only caller of this function is now susupend routine */ | 710 | /* The only caller of this function is now suspend routine */ |
711 | swp_entry_t get_swap_page_of_type(int type) | 711 | swp_entry_t get_swap_page_of_type(int type) |
712 | { | 712 | { |
713 | struct swap_info_struct *si; | 713 | struct swap_info_struct *si; |
@@ -845,7 +845,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
845 | } | 845 | } |
846 | 846 | ||
847 | /* | 847 | /* |
848 | * Caller has made sure that the swapdevice corresponding to entry | 848 | * Caller has made sure that the swap device corresponding to entry |
849 | * is still around or has not been recycled. | 849 | * is still around or has not been recycled. |
850 | */ | 850 | */ |
851 | void swap_free(swp_entry_t entry) | 851 | void swap_free(swp_entry_t entry) |
@@ -947,7 +947,7 @@ int try_to_free_swap(struct page *page) | |||
947 | * original page might be freed under memory pressure, then | 947 | * original page might be freed under memory pressure, then |
948 | * later read back in from swap, now with the wrong data. | 948 | * later read back in from swap, now with the wrong data. |
949 | * | 949 | * |
950 | * Hibration suspends storage while it is writing the image | 950 | * Hibernation suspends storage while it is writing the image |
951 | * to disk so check that here. | 951 | * to disk so check that here. |
952 | */ | 952 | */ |
953 | if (pm_suspended_storage()) | 953 | if (pm_suspended_storage()) |
@@ -1179,7 +1179,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
1179 | * some architectures (e.g. x86_32 with PAE) we might catch a glimpse | 1179 | * some architectures (e.g. x86_32 with PAE) we might catch a glimpse |
1180 | * of unmatched parts which look like swp_pte, so unuse_pte must | 1180 | * of unmatched parts which look like swp_pte, so unuse_pte must |
1181 | * recheck under pte lock. Scanning without pte lock lets it be | 1181 | * recheck under pte lock. Scanning without pte lock lets it be |
1182 | * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. | 1182 | * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. |
1183 | */ | 1183 | */ |
1184 | pte = pte_offset_map(pmd, addr); | 1184 | pte = pte_offset_map(pmd, addr); |
1185 | do { | 1185 | do { |
@@ -1924,17 +1924,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1924 | p->cluster_info = NULL; | 1924 | p->cluster_info = NULL; |
1925 | p->flags = 0; | 1925 | p->flags = 0; |
1926 | frontswap_map = frontswap_map_get(p); | 1926 | frontswap_map = frontswap_map_get(p); |
1927 | frontswap_map_set(p, NULL); | ||
1928 | spin_unlock(&p->lock); | 1927 | spin_unlock(&p->lock); |
1929 | spin_unlock(&swap_lock); | 1928 | spin_unlock(&swap_lock); |
1930 | frontswap_invalidate_area(type); | 1929 | frontswap_invalidate_area(type); |
1930 | frontswap_map_set(p, NULL); | ||
1931 | mutex_unlock(&swapon_mutex); | 1931 | mutex_unlock(&swapon_mutex); |
1932 | free_percpu(p->percpu_cluster); | 1932 | free_percpu(p->percpu_cluster); |
1933 | p->percpu_cluster = NULL; | 1933 | p->percpu_cluster = NULL; |
1934 | vfree(swap_map); | 1934 | vfree(swap_map); |
1935 | vfree(cluster_info); | 1935 | vfree(cluster_info); |
1936 | vfree(frontswap_map); | 1936 | vfree(frontswap_map); |
1937 | /* Destroy swap account informatin */ | 1937 | /* Destroy swap account information */ |
1938 | swap_cgroup_swapoff(type); | 1938 | swap_cgroup_swapoff(type); |
1939 | 1939 | ||
1940 | inode = mapping->host; | 1940 | inode = mapping->host; |
@@ -2786,8 +2786,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) | |||
2786 | 2786 | ||
2787 | /* | 2787 | /* |
2788 | * We are fortunate that although vmalloc_to_page uses pte_offset_map, | 2788 | * We are fortunate that although vmalloc_to_page uses pte_offset_map, |
2789 | * no architecture is using highmem pages for kernel pagetables: so it | 2789 | * no architecture is using highmem pages for kernel page tables: so it |
2790 | * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. | 2790 | * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. |
2791 | */ | 2791 | */ |
2792 | head = vmalloc_to_page(si->swap_map + offset); | 2792 | head = vmalloc_to_page(si->swap_map + offset); |
2793 | offset &= ~PAGE_MASK; | 2793 | offset &= ~PAGE_MASK; |
@@ -7,6 +7,9 @@ | |||
7 | #include <linux/security.h> | 7 | #include <linux/security.h> |
8 | #include <linux/swap.h> | 8 | #include <linux/swap.h> |
9 | #include <linux/swapops.h> | 9 | #include <linux/swapops.h> |
10 | #include <linux/mman.h> | ||
11 | #include <linux/hugetlb.h> | ||
12 | |||
10 | #include <asm/uaccess.h> | 13 | #include <asm/uaccess.h> |
11 | 14 | ||
12 | #include "internal.h" | 15 | #include "internal.h" |
@@ -398,6 +401,16 @@ struct address_space *page_mapping(struct page *page) | |||
398 | return mapping; | 401 | return mapping; |
399 | } | 402 | } |
400 | 403 | ||
404 | /* | ||
405 | * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used | ||
406 | */ | ||
407 | unsigned long vm_commit_limit(void) | ||
408 | { | ||
409 | return ((totalram_pages - hugetlb_total_pages()) | ||
410 | * sysctl_overcommit_ratio / 100) + total_swap_pages; | ||
411 | } | ||
412 | |||
413 | |||
401 | /* Tracepoints definitions. */ | 414 | /* Tracepoints definitions. */ |
402 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); | 415 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); |
403 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); | 416 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); |
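The new vm_commit_limit() helper centralizes the OVERCOMMIT_NEVER limit as (totalram_pages - hugetlb_total_pages) * sysctl_overcommit_ratio / 100 + total_swap_pages; the nommu hunk earlier switches to it and drops its separate allowed += total_swap_pages line, since the helper already includes swap. A userspace sketch of the same arithmetic with made-up page counts:

#include <stdio.h>

int main(void)
{
    /* illustrative numbers, all in 4 KB pages */
    unsigned long totalram_pages          = 4UL << 18;  /* 4 GiB of RAM */
    unsigned long hugetlb_total_pages     = 512;        /* one 2 MB huge page reserved */
    unsigned long total_swap_pages        = 1UL << 18;  /* 1 GiB of swap */
    unsigned long sysctl_overcommit_ratio = 50;         /* default ratio */

    unsigned long limit = (totalram_pages - hugetlb_total_pages)
                          * sysctl_overcommit_ratio / 100
                          + total_swap_pages;

    printf("commit limit: %lu pages (%lu MiB)\n", limit, limit >> 8);
    return 0;
}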
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 107454312d5e..0fdf96803c5b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -359,6 +359,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, | |||
359 | if (unlikely(!va)) | 359 | if (unlikely(!va)) |
360 | return ERR_PTR(-ENOMEM); | 360 | return ERR_PTR(-ENOMEM); |
361 | 361 | ||
362 | /* | ||
363 | * Only scan the relevant parts containing pointers to other objects | ||
364 | * to avoid false negatives. | ||
365 | */ | ||
366 | kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK); | ||
367 | |||
362 | retry: | 368 | retry: |
363 | spin_lock(&vmap_area_lock); | 369 | spin_lock(&vmap_area_lock); |
364 | /* | 370 | /* |
@@ -1546,7 +1552,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
1546 | gfp_t gfp_mask, pgprot_t prot, | 1552 | gfp_t gfp_mask, pgprot_t prot, |
1547 | int node, const void *caller); | 1553 | int node, const void *caller); |
1548 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1554 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
1549 | pgprot_t prot, int node, const void *caller) | 1555 | pgprot_t prot, int node) |
1550 | { | 1556 | { |
1551 | const int order = 0; | 1557 | const int order = 0; |
1552 | struct page **pages; | 1558 | struct page **pages; |
@@ -1560,13 +1566,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1560 | /* Please note that the recursion is strictly bounded. */ | 1566 | /* Please note that the recursion is strictly bounded. */ |
1561 | if (array_size > PAGE_SIZE) { | 1567 | if (array_size > PAGE_SIZE) { |
1562 | pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, | 1568 | pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, |
1563 | PAGE_KERNEL, node, caller); | 1569 | PAGE_KERNEL, node, area->caller); |
1564 | area->flags |= VM_VPAGES; | 1570 | area->flags |= VM_VPAGES; |
1565 | } else { | 1571 | } else { |
1566 | pages = kmalloc_node(array_size, nested_gfp, node); | 1572 | pages = kmalloc_node(array_size, nested_gfp, node); |
1567 | } | 1573 | } |
1568 | area->pages = pages; | 1574 | area->pages = pages; |
1569 | area->caller = caller; | ||
1570 | if (!area->pages) { | 1575 | if (!area->pages) { |
1571 | remove_vm_area(area->addr); | 1576 | remove_vm_area(area->addr); |
1572 | kfree(area); | 1577 | kfree(area); |
@@ -1577,7 +1582,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1577 | struct page *page; | 1582 | struct page *page; |
1578 | gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; | 1583 | gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; |
1579 | 1584 | ||
1580 | if (node < 0) | 1585 | if (node == NUMA_NO_NODE) |
1581 | page = alloc_page(tmp_mask); | 1586 | page = alloc_page(tmp_mask); |
1582 | else | 1587 | else |
1583 | page = alloc_pages_node(node, tmp_mask, order); | 1588 | page = alloc_pages_node(node, tmp_mask, order); |
@@ -1634,9 +1639,9 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
1634 | if (!area) | 1639 | if (!area) |
1635 | goto fail; | 1640 | goto fail; |
1636 | 1641 | ||
1637 | addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); | 1642 | addr = __vmalloc_area_node(area, gfp_mask, prot, node); |
1638 | if (!addr) | 1643 | if (!addr) |
1639 | goto fail; | 1644 | return NULL; |
1640 | 1645 | ||
1641 | /* | 1646 | /* |
1642 | * In this function, newly allocated vm_struct has VM_UNINITIALIZED | 1647 | * In this function, newly allocated vm_struct has VM_UNINITIALIZED |
@@ -1646,11 +1651,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
1646 | clear_vm_uninitialized_flag(area); | 1651 | clear_vm_uninitialized_flag(area); |
1647 | 1652 | ||
1648 | /* | 1653 | /* |
1649 | * A ref_count = 3 is needed because the vm_struct and vmap_area | 1654 | * A ref_count = 2 is needed because vm_struct allocated in |
1650 | * structures allocated in the __get_vm_area_node() function contain | 1655 | * __get_vm_area_node() contains a reference to the virtual address of |
1651 | * references to the virtual address of the vmalloc'ed block. | 1656 | * the vmalloc'ed block. |
1652 | */ | 1657 | */ |
1653 | kmemleak_alloc(addr, real_size, 3, gfp_mask); | 1658 | kmemleak_alloc(addr, real_size, 2, gfp_mask); |
1654 | 1659 | ||
1655 | return addr; | 1660 | return addr; |
1656 | 1661 | ||
@@ -2563,6 +2568,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) | |||
2563 | if (!counters) | 2568 | if (!counters) |
2564 | return; | 2569 | return; |
2565 | 2570 | ||
2571 | /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ | ||
2572 | smp_rmb(); | ||
2573 | if (v->flags & VM_UNINITIALIZED) | ||
2574 | return; | ||
2575 | |||
2566 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); | 2576 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); |
2567 | 2577 | ||
2568 | for (nr = 0; nr < v->nr_pages; nr++) | 2578 | for (nr = 0; nr < v->nr_pages; nr++) |
@@ -2579,23 +2589,15 @@ static int s_show(struct seq_file *m, void *p) | |||
2579 | struct vmap_area *va = p; | 2589 | struct vmap_area *va = p; |
2580 | struct vm_struct *v; | 2590 | struct vm_struct *v; |
2581 | 2591 | ||
2582 | if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING)) | 2592 | /* |
2593 | * s_show can race with remove_vm_area(): !VM_VM_AREA means the vmap | ||
2594 | * area is being torn down or belongs to a vm_map_ram allocation. | ||
2595 | */ | ||
2596 | if (!(va->flags & VM_VM_AREA)) | ||
2583 | return 0; | 2597 | return 0; |
2584 | 2598 | ||
2585 | if (!(va->flags & VM_VM_AREA)) { | ||
2586 | seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", | ||
2587 | (void *)va->va_start, (void *)va->va_end, | ||
2588 | va->va_end - va->va_start); | ||
2589 | return 0; | ||
2590 | } | ||
2591 | |||
2592 | v = va->vm; | 2599 | v = va->vm; |
2593 | 2600 | ||
2594 | /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ | ||
2595 | smp_rmb(); | ||
2596 | if (v->flags & VM_UNINITIALIZED) | ||
2597 | return 0; | ||
2598 | |||
2599 | seq_printf(m, "0x%pK-0x%pK %7ld", | 2601 | seq_printf(m, "0x%pK-0x%pK %7ld", |
2600 | v->addr, v->addr + v->size, v->size); | 2602 | v->addr, v->addr + v->size, v->size); |
2601 | 2603 | ||
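The /proc/vmallocinfo hunks make s_show() skip any vmap_area without VM_VM_AREA and move the VM_UNINITIALIZED check, together with the smp_rmb() that pairs with the smp_wmb() in clear_vm_uninitialized_flag(), into show_numa_info(). That barrier pairing is a generic publish-then-observe pattern; the sketch below illustrates it with C11 release/acquire atomics standing in for the kernel's smp_wmb()/smp_rmb(), and is only a loose analogy, not the kernel code.

#include <stdatomic.h>
#include <stdio.h>

struct area {
    int nr_pages;              /* payload written before publishing */
    atomic_int uninitialized;  /* stands in for the VM_UNINITIALIZED flag */
};

/* writer: fill in the payload, then clear the flag with release ordering
 * (the kernel issues smp_wmb() before clearing VM_UNINITIALIZED) */
static void publish(struct area *a, int nr_pages)
{
    a->nr_pages = nr_pages;
    atomic_store_explicit(&a->uninitialized, 0, memory_order_release);
}

/* reader: check the flag with acquire ordering before trusting the payload
 * (the kernel issues smp_rmb() and returns early while the flag is set) */
static void show(struct area *a)
{
    if (atomic_load_explicit(&a->uninitialized, memory_order_acquire))
        return;                /* still being set up, skip it */
    printf("pages=%d\n", a->nr_pages);
}

int main(void)
{
    struct area a = { .nr_pages = 0, .uninitialized = 1 };

    show(&a);                  /* skipped: not yet published */
    publish(&a, 8);
    show(&a);                  /* prints pages=8 */
    return 0;
}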
diff --git a/mm/vmstat.c b/mm/vmstat.c index 9bb314577911..72496140ac08 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -812,6 +812,7 @@ const char * const vmstat_text[] = { | |||
812 | 812 | ||
813 | #ifdef CONFIG_NUMA_BALANCING | 813 | #ifdef CONFIG_NUMA_BALANCING |
814 | "numa_pte_updates", | 814 | "numa_pte_updates", |
815 | "numa_huge_pte_updates", | ||
815 | "numa_hint_faults", | 816 | "numa_hint_faults", |
816 | "numa_hint_faults_local", | 817 | "numa_hint_faults_local", |
817 | "numa_pages_migrated", | 818 | "numa_pages_migrated", |
@@ -1229,6 +1230,20 @@ static void start_cpu_timer(int cpu) | |||
1229 | schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); | 1230 | schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); |
1230 | } | 1231 | } |
1231 | 1232 | ||
1233 | static void vmstat_cpu_dead(int node) | ||
1234 | { | ||
1235 | int cpu; | ||
1236 | |||
1237 | get_online_cpus(); | ||
1238 | for_each_online_cpu(cpu) | ||
1239 | if (cpu_to_node(cpu) == node) | ||
1240 | goto end; | ||
1241 | |||
1242 | node_clear_state(node, N_CPU); | ||
1243 | end: | ||
1244 | put_online_cpus(); | ||
1245 | } | ||
1246 | |||
1232 | /* | 1247 | /* |
1233 | * Use the cpu notifier to insure that the thresholds are recalculated | 1248 | * Use the cpu notifier to insure that the thresholds are recalculated |
1234 | * when necessary. | 1249 | * when necessary. |
@@ -1258,6 +1273,7 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb, | |||
1258 | case CPU_DEAD: | 1273 | case CPU_DEAD: |
1259 | case CPU_DEAD_FROZEN: | 1274 | case CPU_DEAD_FROZEN: |
1260 | refresh_zone_stat_thresholds(); | 1275 | refresh_zone_stat_thresholds(); |
1276 | vmstat_cpu_dead(cpu_to_node(cpu)); | ||
1261 | break; | 1277 | break; |
1262 | default: | 1278 | default: |
1263 | break; | 1279 | break; |
@@ -1276,8 +1292,12 @@ static int __init setup_vmstat(void) | |||
1276 | 1292 | ||
1277 | register_cpu_notifier(&vmstat_notifier); | 1293 | register_cpu_notifier(&vmstat_notifier); |
1278 | 1294 | ||
1279 | for_each_online_cpu(cpu) | 1295 | get_online_cpus(); |
1296 | for_each_online_cpu(cpu) { | ||
1280 | start_cpu_timer(cpu); | 1297 | start_cpu_timer(cpu); |
1298 | node_set_state(cpu_to_node(cpu), N_CPU); | ||
1299 | } | ||
1300 | put_online_cpus(); | ||
1281 | #endif | 1301 | #endif |
1282 | #ifdef CONFIG_PROC_FS | 1302 | #ifdef CONFIG_PROC_FS |
1283 | proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); | 1303 | proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); |
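The vmstat hunks keep node_states[N_CPU] consistent with CPU hotplug: setup_vmstat() now marks every node that has an online CPU, and the new vmstat_cpu_dead() clears the state only if no online CPU is left on the dead CPU's node. A toy version of that scan, with a fixed cpu-to-node table standing in for cpu_to_node() and a bitmask standing in for node_states[] (all names here are local stand-ins):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

static const int cpu_to_node[NR_CPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };
static bool cpu_online[NR_CPUS]       = { 1, 1, 1, 1, 1, 1, 1, 1 };
static unsigned int node_has_cpu      = 0x3;   /* one bit per node, both set */

/* mirror of vmstat_cpu_dead(): clear the node's N_CPU bit only when no
 * online CPU maps to that node any more */
static void cpu_dead(int cpu)
{
    int node = cpu_to_node[cpu];

    cpu_online[cpu] = false;
    for (int i = 0; i < NR_CPUS; i++)
        if (cpu_online[i] && cpu_to_node[i] == node)
            return;                            /* node still has a CPU */
    node_has_cpu &= ~(1u << node);
}

int main(void)
{
    for (int cpu = 4; cpu < 8; cpu++)
        cpu_dead(cpu);
    printf("node_has_cpu mask: 0x%x\n", node_has_cpu);   /* 0x1: node 1 cleared */
    return 0;
}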
diff --git a/mm/zswap.c b/mm/zswap.c index d93510c6aa2d..5a63f78a5601 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
@@ -217,6 +217,7 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) | |||
217 | if (!entry) | 217 | if (!entry) |
218 | return NULL; | 218 | return NULL; |
219 | entry->refcount = 1; | 219 | entry->refcount = 1; |
220 | RB_CLEAR_NODE(&entry->rbnode); | ||
220 | return entry; | 221 | return entry; |
221 | } | 222 | } |
222 | 223 | ||
@@ -225,19 +226,6 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) | |||
225 | kmem_cache_free(zswap_entry_cache, entry); | 226 | kmem_cache_free(zswap_entry_cache, entry); |
226 | } | 227 | } |
227 | 228 | ||
228 | /* caller must hold the tree lock */ | ||
229 | static void zswap_entry_get(struct zswap_entry *entry) | ||
230 | { | ||
231 | entry->refcount++; | ||
232 | } | ||
233 | |||
234 | /* caller must hold the tree lock */ | ||
235 | static int zswap_entry_put(struct zswap_entry *entry) | ||
236 | { | ||
237 | entry->refcount--; | ||
238 | return entry->refcount; | ||
239 | } | ||
240 | |||
241 | /********************************* | 229 | /********************************* |
242 | * rbtree functions | 230 | * rbtree functions |
243 | **********************************/ | 231 | **********************************/ |
@@ -285,6 +273,61 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, | |||
285 | return 0; | 273 | return 0; |
286 | } | 274 | } |
287 | 275 | ||
276 | static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) | ||
277 | { | ||
278 | if (!RB_EMPTY_NODE(&entry->rbnode)) { | ||
279 | rb_erase(&entry->rbnode, root); | ||
280 | RB_CLEAR_NODE(&entry->rbnode); | ||
281 | } | ||
282 | } | ||
283 | |||
284 | /* | ||
285 | * Carries out the common pattern of freeing an entry's zsmalloc allocation, | ||
286 | * freeing the entry itself, and decrementing the number of stored pages. | ||
287 | */ | ||
288 | static void zswap_free_entry(struct zswap_tree *tree, | ||
289 | struct zswap_entry *entry) | ||
290 | { | ||
291 | zbud_free(tree->pool, entry->handle); | ||
292 | zswap_entry_cache_free(entry); | ||
293 | atomic_dec(&zswap_stored_pages); | ||
294 | zswap_pool_pages = zbud_get_pool_size(tree->pool); | ||
295 | } | ||
296 | |||
297 | /* caller must hold the tree lock */ | ||
298 | static void zswap_entry_get(struct zswap_entry *entry) | ||
299 | { | ||
300 | entry->refcount++; | ||
301 | } | ||
302 | |||
303 | /* caller must hold the tree lock | ||
304 | * remove it from the tree and free it if nobody references the entry | ||
305 | */ | ||
306 | static void zswap_entry_put(struct zswap_tree *tree, | ||
307 | struct zswap_entry *entry) | ||
308 | { | ||
309 | int refcount = --entry->refcount; | ||
310 | |||
311 | BUG_ON(refcount < 0); | ||
312 | if (refcount == 0) { | ||
313 | zswap_rb_erase(&tree->rbroot, entry); | ||
314 | zswap_free_entry(tree, entry); | ||
315 | } | ||
316 | } | ||
317 | |||
318 | /* caller must hold the tree lock */ | ||
319 | static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, | ||
320 | pgoff_t offset) | ||
321 | { | ||
322 | struct zswap_entry *entry = NULL; | ||
323 | |||
324 | entry = zswap_rb_search(root, offset); | ||
325 | if (entry) | ||
326 | zswap_entry_get(entry); | ||
327 | |||
328 | return entry; | ||
329 | } | ||
330 | |||
288 | /********************************* | 331 | /********************************* |
289 | * per-cpu code | 332 | * per-cpu code |
290 | **********************************/ | 333 | **********************************/ |
@@ -368,18 +411,6 @@ static bool zswap_is_full(void) | |||
368 | zswap_pool_pages); | 411 | zswap_pool_pages); |
369 | } | 412 | } |
370 | 413 | ||
371 | /* | ||
372 | * Carries out the common pattern of freeing and entry's zsmalloc allocation, | ||
373 | * freeing the entry itself, and decrementing the number of stored pages. | ||
374 | */ | ||
375 | static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry) | ||
376 | { | ||
377 | zbud_free(tree->pool, entry->handle); | ||
378 | zswap_entry_cache_free(entry); | ||
379 | atomic_dec(&zswap_stored_pages); | ||
380 | zswap_pool_pages = zbud_get_pool_size(tree->pool); | ||
381 | } | ||
382 | |||
383 | /********************************* | 414 | /********************************* |
384 | * writeback code | 415 | * writeback code |
385 | **********************************/ | 416 | **********************************/ |
@@ -387,7 +418,7 @@ static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry) | |||
387 | enum zswap_get_swap_ret { | 418 | enum zswap_get_swap_ret { |
388 | ZSWAP_SWAPCACHE_NEW, | 419 | ZSWAP_SWAPCACHE_NEW, |
389 | ZSWAP_SWAPCACHE_EXIST, | 420 | ZSWAP_SWAPCACHE_EXIST, |
390 | ZSWAP_SWAPCACHE_NOMEM | 421 | ZSWAP_SWAPCACHE_FAIL, |
391 | }; | 422 | }; |
392 | 423 | ||
393 | /* | 424 | /* |
@@ -401,9 +432,10 @@ enum zswap_get_swap_ret { | |||
401 | * added to the swap cache, and returned in retpage. | 432 | * added to the swap cache, and returned in retpage. |
402 | * | 433 | * |
403 | * If success, the swap cache page is returned in retpage | 434 | * If success, the swap cache page is returned in retpage |
404 | * Returns 0 if page was already in the swap cache, page is not locked | 435 | * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache |
405 | * Returns 1 if the new page needs to be populated, page is locked | 436 | * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, |
406 | * Returns <0 on error | 437 | * the new page is added to swapcache and locked |
438 | * Returns ZSWAP_SWAPCACHE_FAIL on error | ||
407 | */ | 439 | */ |
408 | static int zswap_get_swap_cache_page(swp_entry_t entry, | 440 | static int zswap_get_swap_cache_page(swp_entry_t entry, |
409 | struct page **retpage) | 441 | struct page **retpage) |
@@ -475,7 +507,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, | |||
475 | if (new_page) | 507 | if (new_page) |
476 | page_cache_release(new_page); | 508 | page_cache_release(new_page); |
477 | if (!found_page) | 509 | if (!found_page) |
478 | return ZSWAP_SWAPCACHE_NOMEM; | 510 | return ZSWAP_SWAPCACHE_FAIL; |
479 | *retpage = found_page; | 511 | *retpage = found_page; |
480 | return ZSWAP_SWAPCACHE_EXIST; | 512 | return ZSWAP_SWAPCACHE_EXIST; |
481 | } | 513 | } |
@@ -502,7 +534,7 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) | |||
502 | struct page *page; | 534 | struct page *page; |
503 | u8 *src, *dst; | 535 | u8 *src, *dst; |
504 | unsigned int dlen; | 536 | unsigned int dlen; |
505 | int ret, refcount; | 537 | int ret; |
506 | struct writeback_control wbc = { | 538 | struct writeback_control wbc = { |
507 | .sync_mode = WB_SYNC_NONE, | 539 | .sync_mode = WB_SYNC_NONE, |
508 | }; | 540 | }; |
@@ -517,23 +549,22 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) | |||
517 | 549 | ||
518 | /* find and ref zswap entry */ | 550 | /* find and ref zswap entry */ |
519 | spin_lock(&tree->lock); | 551 | spin_lock(&tree->lock); |
520 | entry = zswap_rb_search(&tree->rbroot, offset); | 552 | entry = zswap_entry_find_get(&tree->rbroot, offset); |
521 | if (!entry) { | 553 | if (!entry) { |
522 | /* entry was invalidated */ | 554 | /* entry was invalidated */ |
523 | spin_unlock(&tree->lock); | 555 | spin_unlock(&tree->lock); |
524 | return 0; | 556 | return 0; |
525 | } | 557 | } |
526 | zswap_entry_get(entry); | ||
527 | spin_unlock(&tree->lock); | 558 | spin_unlock(&tree->lock); |
528 | BUG_ON(offset != entry->offset); | 559 | BUG_ON(offset != entry->offset); |
529 | 560 | ||
530 | /* try to allocate swap cache page */ | 561 | /* try to allocate swap cache page */ |
531 | switch (zswap_get_swap_cache_page(swpentry, &page)) { | 562 | switch (zswap_get_swap_cache_page(swpentry, &page)) { |
532 | case ZSWAP_SWAPCACHE_NOMEM: /* no memory */ | 563 | case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ |
533 | ret = -ENOMEM; | 564 | ret = -ENOMEM; |
534 | goto fail; | 565 | goto fail; |
535 | 566 | ||
536 | case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */ | 567 | case ZSWAP_SWAPCACHE_EXIST: |
537 | /* page is already in the swap cache, ignore for now */ | 568 | /* page is already in the swap cache, ignore for now */ |
538 | page_cache_release(page); | 569 | page_cache_release(page); |
539 | ret = -EEXIST; | 570 | ret = -EEXIST; |
@@ -556,43 +587,44 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) | |||
556 | SetPageUptodate(page); | 587 | SetPageUptodate(page); |
557 | } | 588 | } |
558 | 589 | ||
590 | /* move it to the tail of the inactive list after end_writeback */ | ||
591 | SetPageReclaim(page); | ||
592 | |||
559 | /* start writeback */ | 593 | /* start writeback */ |
560 | __swap_writepage(page, &wbc, end_swap_bio_write); | 594 | __swap_writepage(page, &wbc, end_swap_bio_write); |
561 | page_cache_release(page); | 595 | page_cache_release(page); |
562 | zswap_written_back_pages++; | 596 | zswap_written_back_pages++; |
563 | 597 | ||
564 | spin_lock(&tree->lock); | 598 | spin_lock(&tree->lock); |
565 | |||
566 | /* drop local reference */ | 599 | /* drop local reference */ |
567 | zswap_entry_put(entry); | 600 | zswap_entry_put(tree, entry); |
568 | /* drop the initial reference from entry creation */ | ||
569 | refcount = zswap_entry_put(entry); | ||
570 | 601 | ||
571 | /* | 602 | /* |
572 | * There are three possible values for refcount here: | 603 | * There are two possible situations for entry here: |
573 | * (1) refcount is 1, load is in progress, unlink from rbtree, | 604 | * (1) refcount is 1 (normal case), entry is valid and on the tree
574 | * load will free | 605 | * (2) refcount is 0, entry is freed and not on the tree |
575 | * (2) refcount is 0, (normal case) entry is valid, | 606 | * because invalidate happened during writeback |
576 | * remove from rbtree and free entry | 607 | * search the tree and free the entry if we find it
577 | * (3) refcount is -1, invalidate happened during writeback, | 608 | */ |
578 | * free entry | 609 | if (entry == zswap_rb_search(&tree->rbroot, offset)) |
579 | */ | 610 | zswap_entry_put(tree, entry); |
580 | if (refcount >= 0) { | ||
581 | /* no invalidate yet, remove from rbtree */ | ||
582 | rb_erase(&entry->rbnode, &tree->rbroot); | ||
583 | } | ||
584 | spin_unlock(&tree->lock); | 611 | spin_unlock(&tree->lock); |
585 | if (refcount <= 0) { | ||
586 | /* free the entry */ | ||
587 | zswap_free_entry(tree, entry); | ||
588 | return 0; | ||
589 | } | ||
590 | return -EAGAIN; | ||
591 | 612 | ||
613 | goto end; | ||
614 | |||
615 | /* | ||
616 | * if we get here due to ZSWAP_SWAPCACHE_EXIST, | ||
617 | * a load may be happening concurrently; | ||
618 | * it is safe and okay to not free the entry, | ||
619 | * and it is also okay to return !0 if we | ||
620 | * free the entry in the following put | ||
621 | */ | ||
592 | fail: | 622 | fail: |
593 | spin_lock(&tree->lock); | 623 | spin_lock(&tree->lock); |
594 | zswap_entry_put(entry); | 624 | zswap_entry_put(tree, entry); |
595 | spin_unlock(&tree->lock); | 625 | spin_unlock(&tree->lock); |
626 | |||
627 | end: | ||
596 | return ret; | 628 | return ret; |
597 | } | 629 | } |
598 | 630 | ||
@@ -676,11 +708,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
676 | if (ret == -EEXIST) { | 708 | if (ret == -EEXIST) { |
677 | zswap_duplicate_entry++; | 709 | zswap_duplicate_entry++; |
678 | /* remove from rbtree */ | 710 | /* remove from rbtree */ |
679 | rb_erase(&dupentry->rbnode, &tree->rbroot); | 711 | zswap_rb_erase(&tree->rbroot, dupentry); |
680 | if (!zswap_entry_put(dupentry)) { | 712 | zswap_entry_put(tree, dupentry); |
681 | /* free */ | ||
682 | zswap_free_entry(tree, dupentry); | ||
683 | } | ||
684 | } | 713 | } |
685 | } while (ret == -EEXIST); | 714 | } while (ret == -EEXIST); |
686 | spin_unlock(&tree->lock); | 715 | spin_unlock(&tree->lock); |
@@ -709,17 +738,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, | |||
709 | struct zswap_entry *entry; | 738 | struct zswap_entry *entry; |
710 | u8 *src, *dst; | 739 | u8 *src, *dst; |
711 | unsigned int dlen; | 740 | unsigned int dlen; |
712 | int refcount, ret; | 741 | int ret; |
713 | 742 | ||
714 | /* find */ | 743 | /* find */ |
715 | spin_lock(&tree->lock); | 744 | spin_lock(&tree->lock); |
716 | entry = zswap_rb_search(&tree->rbroot, offset); | 745 | entry = zswap_entry_find_get(&tree->rbroot, offset); |
717 | if (!entry) { | 746 | if (!entry) { |
718 | /* entry was written back */ | 747 | /* entry was written back */ |
719 | spin_unlock(&tree->lock); | 748 | spin_unlock(&tree->lock); |
720 | return -1; | 749 | return -1; |
721 | } | 750 | } |
722 | zswap_entry_get(entry); | ||
723 | spin_unlock(&tree->lock); | 751 | spin_unlock(&tree->lock); |
724 | 752 | ||
725 | /* decompress */ | 753 | /* decompress */ |
@@ -734,22 +762,9 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, | |||
734 | BUG_ON(ret); | 762 | BUG_ON(ret); |
735 | 763 | ||
736 | spin_lock(&tree->lock); | 764 | spin_lock(&tree->lock); |
737 | refcount = zswap_entry_put(entry); | 765 | zswap_entry_put(tree, entry); |
738 | if (likely(refcount)) { | ||
739 | spin_unlock(&tree->lock); | ||
740 | return 0; | ||
741 | } | ||
742 | spin_unlock(&tree->lock); | 766 | spin_unlock(&tree->lock); |
743 | 767 | ||
744 | /* | ||
745 | * We don't have to unlink from the rbtree because | ||
746 | * zswap_writeback_entry() or zswap_frontswap_invalidate page() | ||
747 | * has already done this for us if we are the last reference. | ||
748 | */ | ||
749 | /* free */ | ||
750 | |||
751 | zswap_free_entry(tree, entry); | ||
752 | |||
753 | return 0; | 768 | return 0; |
754 | } | 769 | } |
755 | 770 | ||
@@ -758,7 +773,6 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) | |||
758 | { | 773 | { |
759 | struct zswap_tree *tree = zswap_trees[type]; | 774 | struct zswap_tree *tree = zswap_trees[type]; |
760 | struct zswap_entry *entry; | 775 | struct zswap_entry *entry; |
761 | int refcount; | ||
762 | 776 | ||
763 | /* find */ | 777 | /* find */ |
764 | spin_lock(&tree->lock); | 778 | spin_lock(&tree->lock); |
@@ -770,20 +784,12 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) | |||
770 | } | 784 | } |
771 | 785 | ||
772 | /* remove from rbtree */ | 786 | /* remove from rbtree */ |
773 | rb_erase(&entry->rbnode, &tree->rbroot); | 787 | zswap_rb_erase(&tree->rbroot, entry); |
774 | 788 | ||
775 | /* drop the initial reference from entry creation */ | 789 | /* drop the initial reference from entry creation */ |
776 | refcount = zswap_entry_put(entry); | 790 | zswap_entry_put(tree, entry); |
777 | 791 | ||
778 | spin_unlock(&tree->lock); | 792 | spin_unlock(&tree->lock); |
779 | |||
780 | if (refcount) { | ||
781 | /* writeback in progress, writeback will free */ | ||
782 | return; | ||
783 | } | ||
784 | |||
785 | /* free */ | ||
786 | zswap_free_entry(tree, entry); | ||
787 | } | 793 | } |
788 | 794 | ||
789 | /* frees all zswap entries for the given swap type */ | 795 | /* frees all zswap entries for the given swap type */ |
@@ -797,11 +803,8 @@ static void zswap_frontswap_invalidate_area(unsigned type) | |||
797 | 803 | ||
798 | /* walk the tree and free everything */ | 804 | /* walk the tree and free everything */ |
799 | spin_lock(&tree->lock); | 805 | spin_lock(&tree->lock); |
800 | rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) { | 806 | rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) |
801 | zbud_free(tree->pool, entry->handle); | 807 | zswap_free_entry(tree, entry); |
802 | zswap_entry_cache_free(entry); | ||
803 | atomic_dec(&zswap_stored_pages); | ||
804 | } | ||
805 | tree->rbroot = RB_ROOT; | 808 | tree->rbroot = RB_ROOT; |
806 | spin_unlock(&tree->lock); | 809 | spin_unlock(&tree->lock); |
807 | 810 | ||
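The zswap hunks concentrate the entry lifetime rules in a few helpers: zswap_entry_find_get() looks up and references an entry in one step, zswap_entry_put() drops a reference and, on the last put, erases the entry from the rbtree and frees it, and zswap_rb_erase() tolerates being called on an already-removed node thanks to RB_CLEAR_NODE. The sketch below mimics that get/put discipline over a trivial linked list instead of an rbtree, with the tree lock omitted; it is an illustration of the pattern, not zswap's code.

#include <stdio.h>
#include <stdlib.h>

struct entry {
    unsigned long offset;
    int refcount;            /* starts at 1: the reference held by the tree */
    struct entry *next;
};

static struct entry *tree;   /* stand-in for the per-type rbtree root */

/* zswap_entry_find_get(): look up and take a reference in one step */
static struct entry *find_get(unsigned long offset)
{
    for (struct entry *e = tree; e; e = e->next)
        if (e->offset == offset) {
            e->refcount++;
            return e;
        }
    return NULL;
}

static void erase(struct entry *victim)
{
    for (struct entry **p = &tree; *p; p = &(*p)->next)
        if (*p == victim) {
            *p = victim->next;
            return;
        }
}

/* zswap_entry_put(): the last reference erases from the tree and frees */
static void put(struct entry *e)
{
    if (--e->refcount == 0) {
        erase(e);
        free(e);
    }
}

int main(void)
{
    struct entry *e = calloc(1, sizeof(*e));

    e->offset = 42;
    e->refcount = 1;         /* initial reference from entry creation */
    e->next = tree;
    tree = e;

    struct entry *found = find_get(42);   /* refcount 2: tree + local user */
    put(found);                           /* drop the local reference */
    put(e);                               /* drop the tree's reference: freed */
    printf("tree empty: %d\n", tree == NULL);
    return 0;
}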