Diffstat (limited to 'mm')

 -rw-r--r--  mm/Kconfig             17
 -rw-r--r--  mm/Makefile             1
 -rw-r--r--  mm/compaction.c       605
 -rw-r--r--  mm/filemap.c           14
 -rw-r--r--  mm/highmem.c            2
 -rw-r--r--  mm/hugetlb.c           12
 -rw-r--r--  mm/ksm.c                4
 -rw-r--r--  mm/memory.c            13
 -rw-r--r--  mm/memory_hotplug.c    36
 -rw-r--r--  mm/mempolicy.c        226
 -rw-r--r--  mm/migrate.c           72
 -rw-r--r--  mm/mincore.c          263
 -rw-r--r--  mm/page_alloc.c       267
 -rw-r--r--  mm/readahead.c          2
 -rw-r--r--  mm/rmap.c              40
 -rw-r--r--  mm/shmem.c              2
 -rw-r--r--  mm/slab.c               4
 -rw-r--r--  mm/slub.c               6
 -rw-r--r--  mm/sparse.c             9
 -rw-r--r--  mm/vmscan.c           213
 -rw-r--r--  mm/vmstat.c           253

 21 files changed, 1684 insertions, 377 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 9c61158308dc..527136b22384 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -172,6 +172,15 @@ config SPLIT_PTLOCK_CPUS | |||
| 172 | default "4" | 172 | default "4" |
| 173 | 173 | ||
| 174 | # | 174 | # |
| 175 | # support for memory compaction | ||
| 176 | config COMPACTION | ||
| 177 | bool "Allow for memory compaction" | ||
| 178 | select MIGRATION | ||
| 179 | depends on EXPERIMENTAL && HUGETLB_PAGE && MMU | ||
| 180 | help | ||
| 181 | Allows the compaction of memory for the allocation of huge pages. | ||
| 182 | |||
| 183 | # | ||
| 175 | # support for page migration | 184 | # support for page migration |
| 176 | # | 185 | # |
| 177 | config MIGRATION | 186 | config MIGRATION |
| @@ -180,9 +189,11 @@ config MIGRATION | |||
| 180 | depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE | 189 | depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE |
| 181 | help | 190 | help |
| 182 | Allows the migration of the physical location of pages of processes | 191 | Allows the migration of the physical location of pages of processes |
| 183 | while the virtual addresses are not changed. This is useful for | 192 | while the virtual addresses are not changed. This is useful in |
| 184 | example on NUMA systems to put pages nearer to the processors accessing | 193 | two situations. The first is on NUMA systems to put pages nearer |
| 185 | the page. | 194 | to the processors accessing. The second is when allocating huge |
| 195 | pages as migration can relocate pages to satisfy a huge page | ||
| 196 | allocation instead of reclaiming. | ||
| 186 | 197 | ||
| 187 | config PHYS_ADDR_T_64BIT | 198 | config PHYS_ADDR_T_64BIT |
| 188 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT | 199 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT |
diff --git a/mm/Makefile b/mm/Makefile
index 6c2a73a54a43..8982504bd03b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
| @@ -23,6 +23,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o | |||
| 23 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 23 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
| 24 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | 24 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o |
| 25 | obj-$(CONFIG_SLOB) += slob.o | 25 | obj-$(CONFIG_SLOB) += slob.o |
| 26 | obj-$(CONFIG_COMPACTION) += compaction.o | ||
| 26 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | 27 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o |
| 27 | obj-$(CONFIG_KSM) += ksm.o | 28 | obj-$(CONFIG_KSM) += ksm.o |
| 28 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o | 29 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o |
diff --git a/mm/compaction.c b/mm/compaction.c
new file mode 100644
index 000000000000..94cce51b0b35
--- /dev/null
+++ b/mm/compaction.c
| @@ -0,0 +1,605 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/compaction.c | ||
| 3 | * | ||
| 4 | * Memory compaction for the reduction of external fragmentation. Note that | ||
| 5 | * this heavily depends upon page migration to do all the real heavy | ||
| 6 | * lifting | ||
| 7 | * | ||
| 8 | * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie> | ||
| 9 | */ | ||
| 10 | #include <linux/swap.h> | ||
| 11 | #include <linux/migrate.h> | ||
| 12 | #include <linux/compaction.h> | ||
| 13 | #include <linux/mm_inline.h> | ||
| 14 | #include <linux/backing-dev.h> | ||
| 15 | #include <linux/sysctl.h> | ||
| 16 | #include <linux/sysfs.h> | ||
| 17 | #include "internal.h" | ||
| 18 | |||
| 19 | /* | ||
| 20 | * compact_control is used to track pages being migrated and the free pages | ||
| 21 | * they are being migrated to during memory compaction. The free_pfn starts | ||
| 22 | * at the end of a zone and migrate_pfn begins at the start. Movable pages | ||
| 23 | * are moved to the end of a zone during a compaction run and the run | ||
| 24 | * completes when free_pfn <= migrate_pfn | ||
| 25 | */ | ||
| 26 | struct compact_control { | ||
| 27 | struct list_head freepages; /* List of free pages to migrate to */ | ||
| 28 | struct list_head migratepages; /* List of pages being migrated */ | ||
| 29 | unsigned long nr_freepages; /* Number of isolated free pages */ | ||
| 30 | unsigned long nr_migratepages; /* Number of pages to migrate */ | ||
| 31 | unsigned long free_pfn; /* isolate_freepages search base */ | ||
| 32 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | ||
| 33 | |||
| 34 | /* Account for isolated anon and file pages */ | ||
| 35 | unsigned long nr_anon; | ||
| 36 | unsigned long nr_file; | ||
| 37 | |||
| 38 | unsigned int order; /* order a direct compactor needs */ | ||
| 39 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | ||
| 40 | struct zone *zone; | ||
| 41 | }; | ||
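For orientation, the scanner invariant described in the comment above can be modelled outside the kernel. The following toy program is an illustration only, not part of the patch: a migrate scanner walks up from the start of a pretend zone, a free scanner walks down from the end, movable pages are moved into free slots near the end, and the run finishes once the two scanners meet.

    #include <stdio.h>
    #include <string.h>

    /*
     * Toy model of the invariant above, not part of the patch: 'M' is a movable
     * page in use, 'F' a free page, 'U' an unmovable page. The migrate scanner
     * walks up from the start of the "zone", the free scanner walks down from
     * the end, and the run is over once they meet.
     */
    int main(void)
    {
            char zone[] = "MUFMFFMUFFMFFF";
            size_t migrate = 0, free_slot = strlen(zone) - 1;

            printf("before: %s\n", zone);
            while (migrate < free_slot) {
                    if (zone[migrate] != 'M')            /* nothing to move here */
                            migrate++;
                    else if (zone[free_slot] != 'F')     /* not a usable target */
                            free_slot--;
                    else {                               /* "migrate" the page */
                            zone[free_slot--] = 'M';
                            zone[migrate++] = 'F';
                    }
            }
            printf("after:  %s\n", zone);
            return 0;
    }

The real code below does the same convergence a pageblock at a time, isolating and migrating pages in batches rather than one by one.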
| 42 | |||
| 43 | static unsigned long release_freepages(struct list_head *freelist) | ||
| 44 | { | ||
| 45 | struct page *page, *next; | ||
| 46 | unsigned long count = 0; | ||
| 47 | |||
| 48 | list_for_each_entry_safe(page, next, freelist, lru) { | ||
| 49 | list_del(&page->lru); | ||
| 50 | __free_page(page); | ||
| 51 | count++; | ||
| 52 | } | ||
| 53 | |||
| 54 | return count; | ||
| 55 | } | ||
| 56 | |||
| 57 | /* Isolate free pages onto a private freelist. Must hold zone->lock */ | ||
| 58 | static unsigned long isolate_freepages_block(struct zone *zone, | ||
| 59 | unsigned long blockpfn, | ||
| 60 | struct list_head *freelist) | ||
| 61 | { | ||
| 62 | unsigned long zone_end_pfn, end_pfn; | ||
| 63 | int total_isolated = 0; | ||
| 64 | struct page *cursor; | ||
| 65 | |||
| 66 | /* Get the last PFN we should scan for free pages at */ | ||
| 67 | zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
| 68 | end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn); | ||
| 69 | |||
| 70 | /* Find the first usable PFN in the block to initialise the page cursor */ | ||
| 71 | for (; blockpfn < end_pfn; blockpfn++) { | ||
| 72 | if (pfn_valid_within(blockpfn)) | ||
| 73 | break; | ||
| 74 | } | ||
| 75 | cursor = pfn_to_page(blockpfn); | ||
| 76 | |||
| 77 | /* Isolate free pages. This assumes the block is valid */ | ||
| 78 | for (; blockpfn < end_pfn; blockpfn++, cursor++) { | ||
| 79 | int isolated, i; | ||
| 80 | struct page *page = cursor; | ||
| 81 | |||
| 82 | if (!pfn_valid_within(blockpfn)) | ||
| 83 | continue; | ||
| 84 | |||
| 85 | if (!PageBuddy(page)) | ||
| 86 | continue; | ||
| 87 | |||
| 88 | /* Found a free page, break it into order-0 pages */ | ||
| 89 | isolated = split_free_page(page); | ||
| 90 | total_isolated += isolated; | ||
| 91 | for (i = 0; i < isolated; i++) { | ||
| 92 | list_add(&page->lru, freelist); | ||
| 93 | page++; | ||
| 94 | } | ||
| 95 | |||
| 96 | /* If a page was split, advance to the end of it */ | ||
| 97 | if (isolated) { | ||
| 98 | blockpfn += isolated - 1; | ||
| 99 | cursor += isolated - 1; | ||
| 100 | } | ||
| 101 | } | ||
| 102 | |||
| 103 | return total_isolated; | ||
| 104 | } | ||
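One step in the loop above is easy to misread: split_free_page() hands back an order-k buddy page as 2^k order-0 pages, and the scan then advances the cursor past everything it just isolated. A toy model, illustrative only and not part of the patch:

    #include <stdio.h>

    /*
     * Toy model of the loop above, not part of the patch: cells hold the order
     * of a free buddy page at its head pfn and -1 everywhere else. When a free
     * page is found it is "split" into 2^order order-0 pages and the cursor
     * jumps past them, mirroring "blockpfn += isolated - 1".
     */
    int main(void)
    {
            int block[16] = { -1, -1, 0, -1, 2, -1, -1, -1,
                               3, -1, -1, -1, -1, -1, -1, -1 };
            int pfn, total_isolated = 0;

            for (pfn = 0; pfn < 16; pfn++) {
                    int order = block[pfn], isolated;

                    if (order < 0)                /* not the head of a free page */
                            continue;
                    isolated = 1 << order;        /* split into order-0 pages */
                    total_isolated += isolated;
                    printf("pfn %2d: isolated %d order-0 page(s)\n", pfn, isolated);
                    pfn += isolated - 1;          /* skip past the split page */
            }
            printf("total isolated: %d\n", total_isolated);
            return 0;
    }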
| 105 | |||
| 106 | /* Returns true if the page is within a block suitable for migration to */ | ||
| 107 | static bool suitable_migration_target(struct page *page) | ||
| 108 | { | ||
| 109 | |||
| 110 | int migratetype = get_pageblock_migratetype(page); | ||
| 111 | |||
| 112 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | ||
| 113 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | ||
| 114 | return false; | ||
| 115 | |||
| 116 | /* If the page is a large free page, then allow migration */ | ||
| 117 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | ||
| 118 | return true; | ||
| 119 | |||
| 120 | /* If the block is MIGRATE_MOVABLE, allow migration */ | ||
| 121 | if (migratetype == MIGRATE_MOVABLE) | ||
| 122 | return true; | ||
| 123 | |||
| 124 | /* Otherwise skip the block */ | ||
| 125 | return false; | ||
| 126 | } | ||
| 127 | |||
| 128 | /* | ||
| 129 | * Based on information in the current compact_control, find blocks | ||
| 130 | * suitable for isolating free pages from and then isolate them. | ||
| 131 | */ | ||
| 132 | static void isolate_freepages(struct zone *zone, | ||
| 133 | struct compact_control *cc) | ||
| 134 | { | ||
| 135 | struct page *page; | ||
| 136 | unsigned long high_pfn, low_pfn, pfn; | ||
| 137 | unsigned long flags; | ||
| 138 | int nr_freepages = cc->nr_freepages; | ||
| 139 | struct list_head *freelist = &cc->freepages; | ||
| 140 | |||
| 141 | pfn = cc->free_pfn; | ||
| 142 | low_pfn = cc->migrate_pfn + pageblock_nr_pages; | ||
| 143 | high_pfn = low_pfn; | ||
| 144 | |||
| 145 | /* | ||
| 146 | * Isolate free pages until enough are available to migrate the | ||
| 147 | * pages on cc->migratepages. We stop searching if the migrate | ||
| 148 | * and free page scanners meet or enough free pages are isolated. | ||
| 149 | */ | ||
| 150 | spin_lock_irqsave(&zone->lock, flags); | ||
| 151 | for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; | ||
| 152 | pfn -= pageblock_nr_pages) { | ||
| 153 | unsigned long isolated; | ||
| 154 | |||
| 155 | if (!pfn_valid(pfn)) | ||
| 156 | continue; | ||
| 157 | |||
| 158 | /* | ||
| 159 | * Check for overlapping nodes/zones. It's possible on some | ||
| 160 | * configurations to have a setup like | ||
| 161 | * node0 node1 node0 | ||
| 162 | * i.e. it's possible that all pages within a zones range of | ||
| 163 | * pages do not belong to a single zone. | ||
| 164 | */ | ||
| 165 | page = pfn_to_page(pfn); | ||
| 166 | if (page_zone(page) != zone) | ||
| 167 | continue; | ||
| 168 | |||
| 169 | /* Check the block is suitable for migration */ | ||
| 170 | if (!suitable_migration_target(page)) | ||
| 171 | continue; | ||
| 172 | |||
| 173 | /* Found a block suitable for isolating free pages from */ | ||
| 174 | isolated = isolate_freepages_block(zone, pfn, freelist); | ||
| 175 | nr_freepages += isolated; | ||
| 176 | |||
| 177 | /* | ||
| 178 | * Record the highest PFN we isolated pages from. When next | ||
| 179 | * looking for free pages, the search will restart here as | ||
| 180 | * page migration may have returned some pages to the allocator | ||
| 181 | */ | ||
| 182 | if (isolated) | ||
| 183 | high_pfn = max(high_pfn, pfn); | ||
| 184 | } | ||
| 185 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 186 | |||
| 187 | /* split_free_page does not map the pages */ | ||
| 188 | list_for_each_entry(page, freelist, lru) { | ||
| 189 | arch_alloc_page(page, 0); | ||
| 190 | kernel_map_pages(page, 1, 1); | ||
| 191 | } | ||
| 192 | |||
| 193 | cc->free_pfn = high_pfn; | ||
| 194 | cc->nr_freepages = nr_freepages; | ||
| 195 | } | ||
| 196 | |||
| 197 | /* Update the number of anon and file isolated pages in the zone */ | ||
| 198 | static void acct_isolated(struct zone *zone, struct compact_control *cc) | ||
| 199 | { | ||
| 200 | struct page *page; | ||
| 201 | unsigned int count[NR_LRU_LISTS] = { 0, }; | ||
| 202 | |||
| 203 | list_for_each_entry(page, &cc->migratepages, lru) { | ||
| 204 | int lru = page_lru_base_type(page); | ||
| 205 | count[lru]++; | ||
| 206 | } | ||
| 207 | |||
| 208 | cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; | ||
| 209 | cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; | ||
| 210 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon); | ||
| 211 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file); | ||
| 212 | } | ||
| 213 | |||
| 214 | /* Similar to reclaim, but different enough that they don't share logic */ | ||
| 215 | static bool too_many_isolated(struct zone *zone) | ||
| 216 | { | ||
| 217 | |||
| 218 | unsigned long inactive, isolated; | ||
| 219 | |||
| 220 | inactive = zone_page_state(zone, NR_INACTIVE_FILE) + | ||
| 221 | zone_page_state(zone, NR_INACTIVE_ANON); | ||
| 222 | isolated = zone_page_state(zone, NR_ISOLATED_FILE) + | ||
| 223 | zone_page_state(zone, NR_ISOLATED_ANON); | ||
| 224 | |||
| 225 | return isolated > inactive; | ||
| 226 | } | ||
| 227 | |||
| 228 | /* | ||
| 229 | * Isolate all pages that can be migrated from the block pointed to by | ||
| 230 | * the migrate scanner within compact_control. | ||
| 231 | */ | ||
| 232 | static unsigned long isolate_migratepages(struct zone *zone, | ||
| 233 | struct compact_control *cc) | ||
| 234 | { | ||
| 235 | unsigned long low_pfn, end_pfn; | ||
| 236 | struct list_head *migratelist = &cc->migratepages; | ||
| 237 | |||
| 238 | /* Do not scan outside zone boundaries */ | ||
| 239 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); | ||
| 240 | |||
| 241 | /* Only scan within a pageblock boundary */ | ||
| 242 | end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); | ||
| 243 | |||
| 244 | /* Do not cross the free scanner or scan within a memory hole */ | ||
| 245 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { | ||
| 246 | cc->migrate_pfn = end_pfn; | ||
| 247 | return 0; | ||
| 248 | } | ||
| 249 | |||
| 250 | /* | ||
| 251 | * Ensure that there are not too many pages isolated from the LRU | ||
| 252 | * list by either parallel reclaimers or compaction. If there are, | ||
| 253 | * delay for some time until fewer pages are isolated | ||
| 254 | */ | ||
| 255 | while (unlikely(too_many_isolated(zone))) { | ||
| 256 | congestion_wait(BLK_RW_ASYNC, HZ/10); | ||
| 257 | |||
| 258 | if (fatal_signal_pending(current)) | ||
| 259 | return 0; | ||
| 260 | } | ||
| 261 | |||
| 262 | /* Time to isolate some pages for migration */ | ||
| 263 | spin_lock_irq(&zone->lru_lock); | ||
| 264 | for (; low_pfn < end_pfn; low_pfn++) { | ||
| 265 | struct page *page; | ||
| 266 | if (!pfn_valid_within(low_pfn)) | ||
| 267 | continue; | ||
| 268 | |||
| 269 | /* Get the page and skip if free */ | ||
| 270 | page = pfn_to_page(low_pfn); | ||
| 271 | if (PageBuddy(page)) | ||
| 272 | continue; | ||
| 273 | |||
| 274 | /* Try isolate the page */ | ||
| 275 | if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) | ||
| 276 | continue; | ||
| 277 | |||
| 278 | /* Successfully isolated */ | ||
| 279 | del_page_from_lru_list(zone, page, page_lru(page)); | ||
| 280 | list_add(&page->lru, migratelist); | ||
| 281 | mem_cgroup_del_lru(page); | ||
| 282 | cc->nr_migratepages++; | ||
| 283 | |||
| 284 | /* Avoid isolating too much */ | ||
| 285 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) | ||
| 286 | break; | ||
| 287 | } | ||
| 288 | |||
| 289 | acct_isolated(zone, cc); | ||
| 290 | |||
| 291 | spin_unlock_irq(&zone->lru_lock); | ||
| 292 | cc->migrate_pfn = low_pfn; | ||
| 293 | |||
| 294 | return cc->nr_migratepages; | ||
| 295 | } | ||
| 296 | |||
| 297 | /* | ||
| 298 | * This is a migrate-callback that "allocates" freepages by taking pages | ||
| 299 | * from the isolated freelists in the block we are migrating to. | ||
| 300 | */ | ||
| 301 | static struct page *compaction_alloc(struct page *migratepage, | ||
| 302 | unsigned long data, | ||
| 303 | int **result) | ||
| 304 | { | ||
| 305 | struct compact_control *cc = (struct compact_control *)data; | ||
| 306 | struct page *freepage; | ||
| 307 | |||
| 308 | /* Isolate free pages if necessary */ | ||
| 309 | if (list_empty(&cc->freepages)) { | ||
| 310 | isolate_freepages(cc->zone, cc); | ||
| 311 | |||
| 312 | if (list_empty(&cc->freepages)) | ||
| 313 | return NULL; | ||
| 314 | } | ||
| 315 | |||
| 316 | freepage = list_entry(cc->freepages.next, struct page, lru); | ||
| 317 | list_del(&freepage->lru); | ||
| 318 | cc->nr_freepages--; | ||
| 319 | |||
| 320 | return freepage; | ||
| 321 | } | ||
| 322 | |||
| 323 | /* | ||
| 324 | * We cannot control nr_migratepages and nr_freepages fully when migration is | ||
| 325 | * running as migrate_pages() has no knowledge of compact_control. When | ||
| 326 | * migration is complete, we count the number of pages on the lists by hand. | ||
| 327 | */ | ||
| 328 | static void update_nr_listpages(struct compact_control *cc) | ||
| 329 | { | ||
| 330 | int nr_migratepages = 0; | ||
| 331 | int nr_freepages = 0; | ||
| 332 | struct page *page; | ||
| 333 | |||
| 334 | list_for_each_entry(page, &cc->migratepages, lru) | ||
| 335 | nr_migratepages++; | ||
| 336 | list_for_each_entry(page, &cc->freepages, lru) | ||
| 337 | nr_freepages++; | ||
| 338 | |||
| 339 | cc->nr_migratepages = nr_migratepages; | ||
| 340 | cc->nr_freepages = nr_freepages; | ||
| 341 | } | ||
| 342 | |||
| 343 | static int compact_finished(struct zone *zone, | ||
| 344 | struct compact_control *cc) | ||
| 345 | { | ||
| 346 | unsigned int order; | ||
| 347 | unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); | ||
| 348 | |||
| 349 | if (fatal_signal_pending(current)) | ||
| 350 | return COMPACT_PARTIAL; | ||
| 351 | |||
| 352 | /* Compaction run completes if the migrate and free scanner meet */ | ||
| 353 | if (cc->free_pfn <= cc->migrate_pfn) | ||
| 354 | return COMPACT_COMPLETE; | ||
| 355 | |||
| 356 | /* Compaction run is not finished if the watermark is not met */ | ||
| 357 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) | ||
| 358 | return COMPACT_CONTINUE; | ||
| 359 | |||
| 360 | if (cc->order == -1) | ||
| 361 | return COMPACT_CONTINUE; | ||
| 362 | |||
| 363 | /* Direct compactor: Is a suitable page free? */ | ||
| 364 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
| 365 | /* Job done if page is free of the right migratetype */ | ||
| 366 | if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) | ||
| 367 | return COMPACT_PARTIAL; | ||
| 368 | |||
| 369 | /* Job done if allocation would set block type */ | ||
| 370 | if (order >= pageblock_order && zone->free_area[order].nr_free) | ||
| 371 | return COMPACT_PARTIAL; | ||
| 372 | } | ||
| 373 | |||
| 374 | return COMPACT_CONTINUE; | ||
| 375 | } | ||
| 376 | |||
| 377 | static int compact_zone(struct zone *zone, struct compact_control *cc) | ||
| 378 | { | ||
| 379 | int ret; | ||
| 380 | |||
| 381 | /* Setup to move all movable pages to the end of the zone */ | ||
| 382 | cc->migrate_pfn = zone->zone_start_pfn; | ||
| 383 | cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; | ||
| 384 | cc->free_pfn &= ~(pageblock_nr_pages-1); | ||
| 385 | |||
| 386 | migrate_prep_local(); | ||
| 387 | |||
| 388 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { | ||
| 389 | unsigned long nr_migrate, nr_remaining; | ||
| 390 | |||
| 391 | if (!isolate_migratepages(zone, cc)) | ||
| 392 | continue; | ||
| 393 | |||
| 394 | nr_migrate = cc->nr_migratepages; | ||
| 395 | migrate_pages(&cc->migratepages, compaction_alloc, | ||
| 396 | (unsigned long)cc, 0); | ||
| 397 | update_nr_listpages(cc); | ||
| 398 | nr_remaining = cc->nr_migratepages; | ||
| 399 | |||
| 400 | count_vm_event(COMPACTBLOCKS); | ||
| 401 | count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); | ||
| 402 | if (nr_remaining) | ||
| 403 | count_vm_events(COMPACTPAGEFAILED, nr_remaining); | ||
| 404 | |||
| 405 | /* Release LRU pages not migrated */ | ||
| 406 | if (!list_empty(&cc->migratepages)) { | ||
| 407 | putback_lru_pages(&cc->migratepages); | ||
| 408 | cc->nr_migratepages = 0; | ||
| 409 | } | ||
| 410 | |||
| 411 | } | ||
| 412 | |||
| 413 | /* Release free pages and check accounting */ | ||
| 414 | cc->nr_freepages -= release_freepages(&cc->freepages); | ||
| 415 | VM_BUG_ON(cc->nr_freepages != 0); | ||
| 416 | |||
| 417 | return ret; | ||
| 418 | } | ||
| 419 | |||
| 420 | static unsigned long compact_zone_order(struct zone *zone, | ||
| 421 | int order, gfp_t gfp_mask) | ||
| 422 | { | ||
| 423 | struct compact_control cc = { | ||
| 424 | .nr_freepages = 0, | ||
| 425 | .nr_migratepages = 0, | ||
| 426 | .order = order, | ||
| 427 | .migratetype = allocflags_to_migratetype(gfp_mask), | ||
| 428 | .zone = zone, | ||
| 429 | }; | ||
| 430 | INIT_LIST_HEAD(&cc.freepages); | ||
| 431 | INIT_LIST_HEAD(&cc.migratepages); | ||
| 432 | |||
| 433 | return compact_zone(zone, &cc); | ||
| 434 | } | ||
| 435 | |||
| 436 | int sysctl_extfrag_threshold = 500; | ||
| 437 | |||
| 438 | /** | ||
| 439 | * try_to_compact_pages - Direct compact to satisfy a high-order allocation | ||
| 440 | * @zonelist: The zonelist used for the current allocation | ||
| 441 | * @order: The order of the current allocation | ||
| 442 | * @gfp_mask: The GFP mask of the current allocation | ||
| 443 | * @nodemask: The allowed nodes to allocate from | ||
| 444 | * | ||
| 445 | * This is the main entry point for direct page compaction. | ||
| 446 | */ | ||
| 447 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | ||
| 448 | int order, gfp_t gfp_mask, nodemask_t *nodemask) | ||
| 449 | { | ||
| 450 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
| 451 | int may_enter_fs = gfp_mask & __GFP_FS; | ||
| 452 | int may_perform_io = gfp_mask & __GFP_IO; | ||
| 453 | unsigned long watermark; | ||
| 454 | struct zoneref *z; | ||
| 455 | struct zone *zone; | ||
| 456 | int rc = COMPACT_SKIPPED; | ||
| 457 | |||
| 458 | /* | ||
| 459 | * Check whether it is worth even starting compaction. The order check is | ||
| 460 | * made because an assumption is made that the page allocator can satisfy | ||
| 461 | * the "cheaper" orders without taking special steps | ||
| 462 | */ | ||
| 463 | if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) | ||
| 464 | return rc; | ||
| 465 | |||
| 466 | count_vm_event(COMPACTSTALL); | ||
| 467 | |||
| 468 | /* Compact each zone in the list */ | ||
| 469 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | ||
| 470 | nodemask) { | ||
| 471 | int fragindex; | ||
| 472 | int status; | ||
| 473 | |||
| 474 | /* | ||
| 475 | * Watermarks for order-0 must be met for compaction. Note | ||
| 476 | * the 2UL. This is because during migration, copies of | ||
| 477 | * pages need to be allocated and for a short time, the | ||
| 478 | * footprint is higher | ||
| 479 | */ | ||
| 480 | watermark = low_wmark_pages(zone) + (2UL << order); | ||
| 481 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | ||
| 482 | continue; | ||
| 483 | |||
| 484 | /* | ||
| 485 | * fragmentation index determines if allocation failures are | ||
| 486 | * due to low memory or external fragmentation | ||
| 487 | * | ||
| 488 | * index of -1 implies allocations might succeed depending | ||
| 489 | * on watermarks | ||
| 490 | * index towards 0 implies failure is due to lack of memory | ||
| 491 | * index towards 1000 implies failure is due to fragmentation | ||
| 492 | * | ||
| 493 | * Only compact if a failure would be due to fragmentation. | ||
| 494 | */ | ||
| 495 | fragindex = fragmentation_index(zone, order); | ||
| 496 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | ||
| 497 | continue; | ||
| 498 | |||
| 499 | if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { | ||
| 500 | rc = COMPACT_PARTIAL; | ||
| 501 | break; | ||
| 502 | } | ||
| 503 | |||
| 504 | status = compact_zone_order(zone, order, gfp_mask); | ||
| 505 | rc = max(status, rc); | ||
| 506 | |||
| 507 | if (zone_watermark_ok(zone, order, watermark, 0, 0)) | ||
| 508 | break; | ||
| 509 | } | ||
| 510 | |||
| 511 | return rc; | ||
| 512 | } | ||
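To make the "2UL << order" headroom in the watermark comment above concrete: because each page being migrated briefly needs a second, newly allocated copy, the check asks for roughly twice the request on top of the low watermark. A standalone sketch of the arithmetic (the 4 KiB page size is an assumption for the KiB column):

    #include <stdio.h>

    /*
     * Concrete numbers for the "2UL << order" headroom, not part of the patch.
     * A 4 KiB page size is assumed for the KiB column.
     */
    int main(void)
    {
            unsigned int order;
            const unsigned long page_kib = 4;    /* assumed PAGE_SIZE of 4 KiB */

            for (order = 1; order <= 10; order++) {
                    unsigned long request = 1UL << order;
                    unsigned long headroom = 2UL << order;

                    printf("order %2u: request %4lu pages, extra headroom %4lu pages (%5lu KiB)\n",
                           order, request, headroom, headroom * page_kib);
            }
            return 0;
    }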
| 513 | |||
| 514 | |||
| 515 | /* Compact all zones within a node */ | ||
| 516 | static int compact_node(int nid) | ||
| 517 | { | ||
| 518 | int zoneid; | ||
| 519 | pg_data_t *pgdat; | ||
| 520 | struct zone *zone; | ||
| 521 | |||
| 522 | if (nid < 0 || nid >= nr_node_ids || !node_online(nid)) | ||
| 523 | return -EINVAL; | ||
| 524 | pgdat = NODE_DATA(nid); | ||
| 525 | |||
| 526 | /* Flush pending updates to the LRU lists */ | ||
| 527 | lru_add_drain_all(); | ||
| 528 | |||
| 529 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { | ||
| 530 | struct compact_control cc = { | ||
| 531 | .nr_freepages = 0, | ||
| 532 | .nr_migratepages = 0, | ||
| 533 | .order = -1, | ||
| 534 | }; | ||
| 535 | |||
| 536 | zone = &pgdat->node_zones[zoneid]; | ||
| 537 | if (!populated_zone(zone)) | ||
| 538 | continue; | ||
| 539 | |||
| 540 | cc.zone = zone; | ||
| 541 | INIT_LIST_HEAD(&cc.freepages); | ||
| 542 | INIT_LIST_HEAD(&cc.migratepages); | ||
| 543 | |||
| 544 | compact_zone(zone, &cc); | ||
| 545 | |||
| 546 | VM_BUG_ON(!list_empty(&cc.freepages)); | ||
| 547 | VM_BUG_ON(!list_empty(&cc.migratepages)); | ||
| 548 | } | ||
| 549 | |||
| 550 | return 0; | ||
| 551 | } | ||
| 552 | |||
| 553 | /* Compact all nodes in the system */ | ||
| 554 | static int compact_nodes(void) | ||
| 555 | { | ||
| 556 | int nid; | ||
| 557 | |||
| 558 | for_each_online_node(nid) | ||
| 559 | compact_node(nid); | ||
| 560 | |||
| 561 | return COMPACT_COMPLETE; | ||
| 562 | } | ||
| 563 | |||
| 564 | /* The written value is actually unused, all memory is compacted */ | ||
| 565 | int sysctl_compact_memory; | ||
| 566 | |||
| 567 | /* This is the entry point for compacting all nodes via /proc/sys/vm */ | ||
| 568 | int sysctl_compaction_handler(struct ctl_table *table, int write, | ||
| 569 | void __user *buffer, size_t *length, loff_t *ppos) | ||
| 570 | { | ||
| 571 | if (write) | ||
| 572 | return compact_nodes(); | ||
| 573 | |||
| 574 | return 0; | ||
| 575 | } | ||
| 576 | |||
| 577 | int sysctl_extfrag_handler(struct ctl_table *table, int write, | ||
| 578 | void __user *buffer, size_t *length, loff_t *ppos) | ||
| 579 | { | ||
| 580 | proc_dointvec_minmax(table, write, buffer, length, ppos); | ||
| 581 | |||
| 582 | return 0; | ||
| 583 | } | ||
| 584 | |||
| 585 | #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) | ||
| 586 | ssize_t sysfs_compact_node(struct sys_device *dev, | ||
| 587 | struct sysdev_attribute *attr, | ||
| 588 | const char *buf, size_t count) | ||
| 589 | { | ||
| 590 | compact_node(dev->id); | ||
| 591 | |||
| 592 | return count; | ||
| 593 | } | ||
| 594 | static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node); | ||
| 595 | |||
| 596 | int compaction_register_node(struct node *node) | ||
| 597 | { | ||
| 598 | return sysdev_create_file(&node->sysdev, &attr_compact); | ||
| 599 | } | ||
| 600 | |||
| 601 | void compaction_unregister_node(struct node *node) | ||
| 602 | { | ||
| 603 | return sysdev_remove_file(&node->sysdev, &attr_compact); | ||
| 604 | } | ||
| 605 | #endif /* CONFIG_SYSFS && CONFIG_NUMA */ | ||
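The handlers above are the kernel half of the new interface. A minimal userspace sketch of driving it is shown below; it assumes the sysctl table hookup outside this mm/ diff exposes the knob as /proc/sys/vm/compact_memory (the written value is ignored), and it must be run as root. On NUMA builds, the sysfs attribute registered above would similarly be triggered by writing to the per-node compact file.

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    /*
     * Userspace sketch of exercising the handler above. The path assumes the
     * sysctl table hookup (outside this mm/ diff) names the file
     * /proc/sys/vm/compact_memory; the written value is ignored by the kernel.
     * Needs root.
     */
    int main(void)
    {
            FILE *f = fopen("/proc/sys/vm/compact_memory", "w");

            if (!f) {
                    fprintf(stderr, "open: %s\n", strerror(errno));
                    return 1;
            }
            fputs("1\n", f);
            fclose(f);
            return 0;
    }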
diff --git a/mm/filemap.c b/mm/filemap.c
index 140ebda9640f..88d719665a28 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -441,7 +441,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | |||
| 441 | /* | 441 | /* |
| 442 | * Splice_read and readahead add shmem/tmpfs pages into the page cache | 442 | * Splice_read and readahead add shmem/tmpfs pages into the page cache |
| 443 | * before shmem_readpage has a chance to mark them as SwapBacked: they | 443 | * before shmem_readpage has a chance to mark them as SwapBacked: they |
| 444 | * need to go on the active_anon lru below, and mem_cgroup_cache_charge | 444 | * need to go on the anon lru below, and mem_cgroup_cache_charge |
| 445 | * (called in add_to_page_cache) needs to know where they're going too. | 445 | * (called in add_to_page_cache) needs to know where they're going too. |
| 446 | */ | 446 | */ |
| 447 | if (mapping_cap_swap_backed(mapping)) | 447 | if (mapping_cap_swap_backed(mapping)) |
| @@ -452,7 +452,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | |||
| 452 | if (page_is_file_cache(page)) | 452 | if (page_is_file_cache(page)) |
| 453 | lru_cache_add_file(page); | 453 | lru_cache_add_file(page); |
| 454 | else | 454 | else |
| 455 | lru_cache_add_active_anon(page); | 455 | lru_cache_add_anon(page); |
| 456 | } | 456 | } |
| 457 | return ret; | 457 | return ret; |
| 458 | } | 458 | } |
| @@ -461,9 +461,15 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru); | |||
| 461 | #ifdef CONFIG_NUMA | 461 | #ifdef CONFIG_NUMA |
| 462 | struct page *__page_cache_alloc(gfp_t gfp) | 462 | struct page *__page_cache_alloc(gfp_t gfp) |
| 463 | { | 463 | { |
| 464 | int n; | ||
| 465 | struct page *page; | ||
| 466 | |||
| 464 | if (cpuset_do_page_mem_spread()) { | 467 | if (cpuset_do_page_mem_spread()) { |
| 465 | int n = cpuset_mem_spread_node(); | 468 | get_mems_allowed(); |
| 466 | return alloc_pages_exact_node(n, gfp, 0); | 469 | n = cpuset_mem_spread_node(); |
| 470 | page = alloc_pages_exact_node(n, gfp, 0); | ||
| 471 | put_mems_allowed(); | ||
| 472 | return page; | ||
| 467 | } | 473 | } |
| 468 | return alloc_pages(gfp, 0); | 474 | return alloc_pages(gfp, 0); |
| 469 | } | 475 | } |
diff --git a/mm/highmem.c b/mm/highmem.c
index bed8a8bfd01f..66baa20f78f5 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
| @@ -422,7 +422,7 @@ void __init page_address_init(void) | |||
| 422 | 422 | ||
| 423 | #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ | 423 | #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ |
| 424 | 424 | ||
| 425 | #if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) | 425 | #ifdef CONFIG_DEBUG_HIGHMEM |
| 426 | 426 | ||
| 427 | void debug_kmap_atomic(enum km_type type) | 427 | void debug_kmap_atomic(enum km_type type) |
| 428 | { | 428 | { |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4c9e6bbf3772..54d42b009dbe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
| @@ -465,11 +465,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
| 465 | struct page *page = NULL; | 465 | struct page *page = NULL; |
| 466 | struct mempolicy *mpol; | 466 | struct mempolicy *mpol; |
| 467 | nodemask_t *nodemask; | 467 | nodemask_t *nodemask; |
| 468 | struct zonelist *zonelist = huge_zonelist(vma, address, | 468 | struct zonelist *zonelist; |
| 469 | htlb_alloc_mask, &mpol, &nodemask); | ||
| 470 | struct zone *zone; | 469 | struct zone *zone; |
| 471 | struct zoneref *z; | 470 | struct zoneref *z; |
| 472 | 471 | ||
| 472 | get_mems_allowed(); | ||
| 473 | zonelist = huge_zonelist(vma, address, | ||
| 474 | htlb_alloc_mask, &mpol, &nodemask); | ||
| 473 | /* | 475 | /* |
| 474 | * A child process with MAP_PRIVATE mappings created by their parent | 476 | * A child process with MAP_PRIVATE mappings created by their parent |
| 475 | * have no page reserves. This check ensures that reservations are | 477 | * have no page reserves. This check ensures that reservations are |
| @@ -477,11 +479,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
| 477 | */ | 479 | */ |
| 478 | if (!vma_has_reserves(vma) && | 480 | if (!vma_has_reserves(vma) && |
| 479 | h->free_huge_pages - h->resv_huge_pages == 0) | 481 | h->free_huge_pages - h->resv_huge_pages == 0) |
| 480 | return NULL; | 482 | goto err; |
| 481 | 483 | ||
| 482 | /* If reserves cannot be used, ensure enough pages are in the pool */ | 484 | /* If reserves cannot be used, ensure enough pages are in the pool */ |
| 483 | if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) | 485 | if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) |
| 484 | return NULL; | 486 | goto err; |
| 485 | 487 | ||
| 486 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 488 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
| 487 | MAX_NR_ZONES - 1, nodemask) { | 489 | MAX_NR_ZONES - 1, nodemask) { |
| @@ -500,7 +502,9 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
| 500 | break; | 502 | break; |
| 501 | } | 503 | } |
| 502 | } | 504 | } |
| 505 | err: | ||
| 503 | mpol_cond_put(mpol); | 506 | mpol_cond_put(mpol); |
| 507 | put_mems_allowed(); | ||
| 504 | return page; | 508 | return page; |
| 505 | } | 509 | } |
| 506 | 510 | ||
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
| @@ -318,14 +318,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item, | |||
| 318 | struct anon_vma *anon_vma) | 318 | struct anon_vma *anon_vma) |
| 319 | { | 319 | { |
| 320 | rmap_item->anon_vma = anon_vma; | 320 | rmap_item->anon_vma = anon_vma; |
| 321 | atomic_inc(&anon_vma->ksm_refcount); | 321 | atomic_inc(&anon_vma->external_refcount); |
| 322 | } | 322 | } |
| 323 | 323 | ||
| 324 | static void drop_anon_vma(struct rmap_item *rmap_item) | 324 | static void drop_anon_vma(struct rmap_item *rmap_item) |
| 325 | { | 325 | { |
| 326 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 326 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
| 327 | 327 | ||
| 328 | if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) { | 328 | if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { |
| 329 | int empty = list_empty(&anon_vma->head); | 329 | int empty = list_empty(&anon_vma->head); |
| 330 | spin_unlock(&anon_vma->lock); | 330 | spin_unlock(&anon_vma->lock); |
| 331 | if (empty) | 331 | if (empty) |
diff --git a/mm/memory.c b/mm/memory.c
index 833952d8b74d..119b7ccdf39b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
| @@ -1227,8 +1227,17 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, | |||
| 1227 | } | 1227 | } |
| 1228 | EXPORT_SYMBOL_GPL(zap_vma_ptes); | 1228 | EXPORT_SYMBOL_GPL(zap_vma_ptes); |
| 1229 | 1229 | ||
| 1230 | /* | 1230 | /** |
| 1231 | * Do a quick page-table lookup for a single page. | 1231 | * follow_page - look up a page descriptor from a user-virtual address |
| 1232 | * @vma: vm_area_struct mapping @address | ||
| 1233 | * @address: virtual address to look up | ||
| 1234 | * @flags: flags modifying lookup behaviour | ||
| 1235 | * | ||
| 1236 | * @flags can have FOLL_ flags set, defined in <linux/mm.h> | ||
| 1237 | * | ||
| 1238 | * Returns the mapped (struct page *), %NULL if no mapping exists, or | ||
| 1239 | * an error pointer if there is a mapping to something not represented | ||
| 1240 | * by a page descriptor (see also vm_normal_page()). | ||
| 1232 | */ | 1241 | */ |
| 1233 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | 1242 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, |
| 1234 | unsigned int flags) | 1243 | unsigned int flags) |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index be211a582930..a4cfcdc00455 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
| @@ -415,12 +415,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 415 | * This means the page allocator ignores this zone. | 415 | * This means the page allocator ignores this zone. |
| 416 | * So, zonelist must be updated after online. | 416 | * So, zonelist must be updated after online. |
| 417 | */ | 417 | */ |
| 418 | mutex_lock(&zonelists_mutex); | ||
| 418 | if (!populated_zone(zone)) | 419 | if (!populated_zone(zone)) |
| 419 | need_zonelists_rebuild = 1; | 420 | need_zonelists_rebuild = 1; |
| 420 | 421 | ||
| 421 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, | 422 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, |
| 422 | online_pages_range); | 423 | online_pages_range); |
| 423 | if (ret) { | 424 | if (ret) { |
| 425 | mutex_unlock(&zonelists_mutex); | ||
| 424 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", | 426 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", |
| 425 | nr_pages, pfn); | 427 | nr_pages, pfn); |
| 426 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 428 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
| @@ -429,8 +431,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 429 | 431 | ||
| 430 | zone->present_pages += onlined_pages; | 432 | zone->present_pages += onlined_pages; |
| 431 | zone->zone_pgdat->node_present_pages += onlined_pages; | 433 | zone->zone_pgdat->node_present_pages += onlined_pages; |
| 434 | if (need_zonelists_rebuild) | ||
| 435 | build_all_zonelists(zone); | ||
| 436 | else | ||
| 437 | zone_pcp_update(zone); | ||
| 432 | 438 | ||
| 433 | zone_pcp_update(zone); | 439 | mutex_unlock(&zonelists_mutex); |
| 434 | setup_per_zone_wmarks(); | 440 | setup_per_zone_wmarks(); |
| 435 | calculate_zone_inactive_ratio(zone); | 441 | calculate_zone_inactive_ratio(zone); |
| 436 | if (onlined_pages) { | 442 | if (onlined_pages) { |
| @@ -438,10 +444,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 438 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | 444 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); |
| 439 | } | 445 | } |
| 440 | 446 | ||
| 441 | if (need_zonelists_rebuild) | 447 | vm_total_pages = nr_free_pagecache_pages(); |
| 442 | build_all_zonelists(); | ||
| 443 | else | ||
| 444 | vm_total_pages = nr_free_pagecache_pages(); | ||
| 445 | 448 | ||
| 446 | writeback_set_ratelimit(); | 449 | writeback_set_ratelimit(); |
| 447 | 450 | ||
| @@ -482,6 +485,29 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) | |||
| 482 | } | 485 | } |
| 483 | 486 | ||
| 484 | 487 | ||
| 488 | /* | ||
| 489 | * called by cpu_up() to online a node without onlined memory. | ||
| 490 | */ | ||
| 491 | int mem_online_node(int nid) | ||
| 492 | { | ||
| 493 | pg_data_t *pgdat; | ||
| 494 | int ret; | ||
| 495 | |||
| 496 | lock_system_sleep(); | ||
| 497 | pgdat = hotadd_new_pgdat(nid, 0); | ||
| 498 | if (!pgdat) { | ||
| 499 | ret = -ENOMEM; | ||
| 500 | goto out; | ||
| 501 | } | ||
| 502 | node_set_online(nid); | ||
| 503 | ret = register_one_node(nid); | ||
| 504 | BUG_ON(ret); | ||
| 505 | |||
| 506 | out: | ||
| 507 | unlock_system_sleep(); | ||
| 508 | return ret; | ||
| 509 | } | ||
| 510 | |||
| 485 | /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ | 511 | /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ |
| 486 | int __ref add_memory(int nid, u64 start, u64 size) | 512 | int __ref add_memory(int nid, u64 start, u64 size) |
| 487 | { | 513 | { |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08f40a2f3fe0..75751012c552 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
| @@ -119,7 +119,22 @@ struct mempolicy default_policy = { | |||
| 119 | 119 | ||
| 120 | static const struct mempolicy_operations { | 120 | static const struct mempolicy_operations { |
| 121 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); | 121 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); |
| 122 | void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); | 122 | /* |
| 123 | * If read-side task has no lock to protect task->mempolicy, write-side | ||
| 124 | * task will rebind the task->mempolicy by two step. The first step is | ||
| 125 | * setting all the newly nodes, and the second step is cleaning all the | ||
| 126 | * disallowed nodes. In this way, we can avoid finding no node to alloc | ||
| 127 | * page. | ||
| 128 | * If we have a lock to protect task->mempolicy in read-side, we do | ||
| 129 | * rebind directly. | ||
| 130 | * | ||
| 131 | * step: | ||
| 132 | * MPOL_REBIND_ONCE - do rebind work at once | ||
| 133 | * MPOL_REBIND_STEP1 - set all the newly nodes | ||
| 134 | * MPOL_REBIND_STEP2 - clean all the disallowed nodes | ||
| 135 | */ | ||
| 136 | void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, | ||
| 137 | enum mpol_rebind_step step); | ||
| 123 | } mpol_ops[MPOL_MAX]; | 138 | } mpol_ops[MPOL_MAX]; |
| 124 | 139 | ||
| 125 | /* Check that the nodemask contains at least one populated zone */ | 140 | /* Check that the nodemask contains at least one populated zone */ |
| @@ -127,9 +142,6 @@ static int is_valid_nodemask(const nodemask_t *nodemask) | |||
| 127 | { | 142 | { |
| 128 | int nd, k; | 143 | int nd, k; |
| 129 | 144 | ||
| 130 | /* Check that there is something useful in this mask */ | ||
| 131 | k = policy_zone; | ||
| 132 | |||
| 133 | for_each_node_mask(nd, *nodemask) { | 145 | for_each_node_mask(nd, *nodemask) { |
| 134 | struct zone *z; | 146 | struct zone *z; |
| 135 | 147 | ||
| @@ -145,7 +157,7 @@ static int is_valid_nodemask(const nodemask_t *nodemask) | |||
| 145 | 157 | ||
| 146 | static inline int mpol_store_user_nodemask(const struct mempolicy *pol) | 158 | static inline int mpol_store_user_nodemask(const struct mempolicy *pol) |
| 147 | { | 159 | { |
| 148 | return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES); | 160 | return pol->flags & MPOL_MODE_FLAGS; |
| 149 | } | 161 | } |
| 150 | 162 | ||
| 151 | static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, | 163 | static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, |
| @@ -277,12 +289,19 @@ void __mpol_put(struct mempolicy *p) | |||
| 277 | kmem_cache_free(policy_cache, p); | 289 | kmem_cache_free(policy_cache, p); |
| 278 | } | 290 | } |
| 279 | 291 | ||
| 280 | static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) | 292 | static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, |
| 293 | enum mpol_rebind_step step) | ||
| 281 | { | 294 | { |
| 282 | } | 295 | } |
| 283 | 296 | ||
| 284 | static void mpol_rebind_nodemask(struct mempolicy *pol, | 297 | /* |
| 285 | const nodemask_t *nodes) | 298 | * step: |
| 299 | * MPOL_REBIND_ONCE - do rebind work at once | ||
| 300 | * MPOL_REBIND_STEP1 - set all the newly nodes | ||
| 301 | * MPOL_REBIND_STEP2 - clean all the disallowed nodes | ||
| 302 | */ | ||
| 303 | static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, | ||
| 304 | enum mpol_rebind_step step) | ||
| 286 | { | 305 | { |
| 287 | nodemask_t tmp; | 306 | nodemask_t tmp; |
| 288 | 307 | ||
| @@ -291,12 +310,31 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, | |||
| 291 | else if (pol->flags & MPOL_F_RELATIVE_NODES) | 310 | else if (pol->flags & MPOL_F_RELATIVE_NODES) |
| 292 | mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); | 311 | mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); |
| 293 | else { | 312 | else { |
| 294 | nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, | 313 | /* |
| 295 | *nodes); | 314 | * if step == 1, we use ->w.cpuset_mems_allowed to cache the |
| 296 | pol->w.cpuset_mems_allowed = *nodes; | 315 | * result |
| 316 | */ | ||
| 317 | if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { | ||
| 318 | nodes_remap(tmp, pol->v.nodes, | ||
| 319 | pol->w.cpuset_mems_allowed, *nodes); | ||
| 320 | pol->w.cpuset_mems_allowed = step ? tmp : *nodes; | ||
| 321 | } else if (step == MPOL_REBIND_STEP2) { | ||
| 322 | tmp = pol->w.cpuset_mems_allowed; | ||
| 323 | pol->w.cpuset_mems_allowed = *nodes; | ||
| 324 | } else | ||
| 325 | BUG(); | ||
| 297 | } | 326 | } |
| 298 | 327 | ||
| 299 | pol->v.nodes = tmp; | 328 | if (nodes_empty(tmp)) |
| 329 | tmp = *nodes; | ||
| 330 | |||
| 331 | if (step == MPOL_REBIND_STEP1) | ||
| 332 | nodes_or(pol->v.nodes, pol->v.nodes, tmp); | ||
| 333 | else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) | ||
| 334 | pol->v.nodes = tmp; | ||
| 335 | else | ||
| 336 | BUG(); | ||
| 337 | |||
| 300 | if (!node_isset(current->il_next, tmp)) { | 338 | if (!node_isset(current->il_next, tmp)) { |
| 301 | current->il_next = next_node(current->il_next, tmp); | 339 | current->il_next = next_node(current->il_next, tmp); |
| 302 | if (current->il_next >= MAX_NUMNODES) | 340 | if (current->il_next >= MAX_NUMNODES) |
| @@ -307,7 +345,8 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, | |||
| 307 | } | 345 | } |
| 308 | 346 | ||
| 309 | static void mpol_rebind_preferred(struct mempolicy *pol, | 347 | static void mpol_rebind_preferred(struct mempolicy *pol, |
| 310 | const nodemask_t *nodes) | 348 | const nodemask_t *nodes, |
| 349 | enum mpol_rebind_step step) | ||
| 311 | { | 350 | { |
| 312 | nodemask_t tmp; | 351 | nodemask_t tmp; |
| 313 | 352 | ||
| @@ -330,16 +369,45 @@ static void mpol_rebind_preferred(struct mempolicy *pol, | |||
| 330 | } | 369 | } |
| 331 | } | 370 | } |
| 332 | 371 | ||
| 333 | /* Migrate a policy to a different set of nodes */ | 372 | /* |
| 334 | static void mpol_rebind_policy(struct mempolicy *pol, | 373 | * mpol_rebind_policy - Migrate a policy to a different set of nodes |
| 335 | const nodemask_t *newmask) | 374 | * |
| 375 | * If read-side task has no lock to protect task->mempolicy, write-side | ||
| 376 | * task will rebind the task->mempolicy by two step. The first step is | ||
| 377 | * setting all the newly nodes, and the second step is cleaning all the | ||
| 378 | * disallowed nodes. In this way, we can avoid finding no node to alloc | ||
| 379 | * page. | ||
| 380 | * If we have a lock to protect task->mempolicy in read-side, we do | ||
| 381 | * rebind directly. | ||
| 382 | * | ||
| 383 | * step: | ||
| 384 | * MPOL_REBIND_ONCE - do rebind work at once | ||
| 385 | * MPOL_REBIND_STEP1 - set all the newly nodes | ||
| 386 | * MPOL_REBIND_STEP2 - clean all the disallowed nodes | ||
| 387 | */ | ||
| 388 | static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, | ||
| 389 | enum mpol_rebind_step step) | ||
| 336 | { | 390 | { |
| 337 | if (!pol) | 391 | if (!pol) |
| 338 | return; | 392 | return; |
| 339 | if (!mpol_store_user_nodemask(pol) && | 393 | if (!mpol_store_user_nodemask(pol) && step == 0 && |
| 340 | nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) | 394 | nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) |
| 341 | return; | 395 | return; |
| 342 | mpol_ops[pol->mode].rebind(pol, newmask); | 396 | |
| 397 | if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) | ||
| 398 | return; | ||
| 399 | |||
| 400 | if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) | ||
| 401 | BUG(); | ||
| 402 | |||
| 403 | if (step == MPOL_REBIND_STEP1) | ||
| 404 | pol->flags |= MPOL_F_REBINDING; | ||
| 405 | else if (step == MPOL_REBIND_STEP2) | ||
| 406 | pol->flags &= ~MPOL_F_REBINDING; | ||
| 407 | else if (step >= MPOL_REBIND_NSTEP) | ||
| 408 | BUG(); | ||
| 409 | |||
| 410 | mpol_ops[pol->mode].rebind(pol, newmask, step); | ||
| 343 | } | 411 | } |
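To see why the rebind is split into two steps, consider a policy bound to node 0 whose cpuset is being moved to node 1. The toy below, with a plain bitmask standing in for a nodemask_t and purely illustrative, shows that a reader without a lock finds a non-empty mask at every point of the STEP1/STEP2 sequence, while clearing the old nodes before setting the new ones would briefly expose an empty mask:

    #include <stdio.h>

    /*
     * Toy model of the two-step rebind above, not part of the patch: an
     * unsigned int stands in for a nodemask_t. A lockless reader sees a
     * non-empty mask at every point of the STEP1/STEP2 sequence.
     */
    static void reader(const char *when, unsigned int mask)
    {
            printf("%-16s mask=0x%x (%s)\n", when, mask,
                   mask ? "usable" : "EMPTY - nothing to allocate from");
    }

    int main(void)
    {
            unsigned int nodes = 0x1;        /* policy currently allows node 0 */
            unsigned int newmask = 0x2;      /* cpuset now allows only node 1 */

            reader("before", nodes);
            nodes |= newmask;                /* MPOL_REBIND_STEP1: add new nodes */
            reader("after step 1", nodes);
            nodes = newmask;                 /* MPOL_REBIND_STEP2: drop old nodes */
            reader("after step 2", nodes);

            /* Clearing old nodes before setting new ones would expose this: */
            reader("bad ordering", nodes & 0x1);
            return 0;
    }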
| 344 | 412 | ||
| 345 | /* | 413 | /* |
| @@ -349,9 +417,10 @@ static void mpol_rebind_policy(struct mempolicy *pol, | |||
| 349 | * Called with task's alloc_lock held. | 417 | * Called with task's alloc_lock held. |
| 350 | */ | 418 | */ |
| 351 | 419 | ||
| 352 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) | 420 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, |
| 421 | enum mpol_rebind_step step) | ||
| 353 | { | 422 | { |
| 354 | mpol_rebind_policy(tsk->mempolicy, new); | 423 | mpol_rebind_policy(tsk->mempolicy, new, step); |
| 355 | } | 424 | } |
| 356 | 425 | ||
| 357 | /* | 426 | /* |
| @@ -366,7 +435,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) | |||
| 366 | 435 | ||
| 367 | down_write(&mm->mmap_sem); | 436 | down_write(&mm->mmap_sem); |
| 368 | for (vma = mm->mmap; vma; vma = vma->vm_next) | 437 | for (vma = mm->mmap; vma; vma = vma->vm_next) |
| 369 | mpol_rebind_policy(vma->vm_policy, new); | 438 | mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); |
| 370 | up_write(&mm->mmap_sem); | 439 | up_write(&mm->mmap_sem); |
| 371 | } | 440 | } |
| 372 | 441 | ||
| @@ -859,7 +928,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
| 859 | nodes_clear(nmask); | 928 | nodes_clear(nmask); |
| 860 | node_set(source, nmask); | 929 | node_set(source, nmask); |
| 861 | 930 | ||
| 862 | check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, | 931 | check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, |
| 863 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 932 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
| 864 | 933 | ||
| 865 | if (!list_empty(&pagelist)) | 934 | if (!list_empty(&pagelist)) |
| @@ -1444,15 +1513,13 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) | |||
| 1444 | /* | 1513 | /* |
| 1445 | * Normally, MPOL_BIND allocations are node-local within the | 1514 | * Normally, MPOL_BIND allocations are node-local within the |
| 1446 | * allowed nodemask. However, if __GFP_THISNODE is set and the | 1515 | * allowed nodemask. However, if __GFP_THISNODE is set and the |
| 1447 | * current node is part of the mask, we use the zonelist for | 1516 | * current node isn't part of the mask, we use the zonelist for |
| 1448 | * the first node in the mask instead. | 1517 | * the first node in the mask instead. |
| 1449 | */ | 1518 | */ |
| 1450 | if (unlikely(gfp & __GFP_THISNODE) && | 1519 | if (unlikely(gfp & __GFP_THISNODE) && |
| 1451 | unlikely(!node_isset(nd, policy->v.nodes))) | 1520 | unlikely(!node_isset(nd, policy->v.nodes))) |
| 1452 | nd = first_node(policy->v.nodes); | 1521 | nd = first_node(policy->v.nodes); |
| 1453 | break; | 1522 | break; |
| 1454 | case MPOL_INTERLEAVE: /* should not happen */ | ||
| 1455 | break; | ||
| 1456 | default: | 1523 | default: |
| 1457 | BUG(); | 1524 | BUG(); |
| 1458 | } | 1525 | } |
| @@ -1572,6 +1639,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol, | |||
| 1572 | * to the struct mempolicy for conditional unref after allocation. | 1639 | * to the struct mempolicy for conditional unref after allocation. |
| 1573 | * If the effective policy is 'BIND, returns a pointer to the mempolicy's | 1640 | * If the effective policy is 'BIND, returns a pointer to the mempolicy's |
| 1574 | * @nodemask for filtering the zonelist. | 1641 | * @nodemask for filtering the zonelist. |
| 1642 | * | ||
| 1643 | * Must be protected by get_mems_allowed() | ||
| 1575 | */ | 1644 | */ |
| 1576 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, | 1645 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, |
| 1577 | gfp_t gfp_flags, struct mempolicy **mpol, | 1646 | gfp_t gfp_flags, struct mempolicy **mpol, |
| @@ -1617,6 +1686,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) | |||
| 1617 | if (!(mask && current->mempolicy)) | 1686 | if (!(mask && current->mempolicy)) |
| 1618 | return false; | 1687 | return false; |
| 1619 | 1688 | ||
| 1689 | task_lock(current); | ||
| 1620 | mempolicy = current->mempolicy; | 1690 | mempolicy = current->mempolicy; |
| 1621 | switch (mempolicy->mode) { | 1691 | switch (mempolicy->mode) { |
| 1622 | case MPOL_PREFERRED: | 1692 | case MPOL_PREFERRED: |
| @@ -1636,6 +1706,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) | |||
| 1636 | default: | 1706 | default: |
| 1637 | BUG(); | 1707 | BUG(); |
| 1638 | } | 1708 | } |
| 1709 | task_unlock(current); | ||
| 1639 | 1710 | ||
| 1640 | return true; | 1711 | return true; |
| 1641 | } | 1712 | } |
| @@ -1683,13 +1754,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
| 1683 | { | 1754 | { |
| 1684 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1755 | struct mempolicy *pol = get_vma_policy(current, vma, addr); |
| 1685 | struct zonelist *zl; | 1756 | struct zonelist *zl; |
| 1757 | struct page *page; | ||
| 1686 | 1758 | ||
| 1759 | get_mems_allowed(); | ||
| 1687 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 1760 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
| 1688 | unsigned nid; | 1761 | unsigned nid; |
| 1689 | 1762 | ||
| 1690 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); | 1763 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); |
| 1691 | mpol_cond_put(pol); | 1764 | mpol_cond_put(pol); |
| 1692 | return alloc_page_interleave(gfp, 0, nid); | 1765 | page = alloc_page_interleave(gfp, 0, nid); |
| 1766 | put_mems_allowed(); | ||
| 1767 | return page; | ||
| 1693 | } | 1768 | } |
| 1694 | zl = policy_zonelist(gfp, pol); | 1769 | zl = policy_zonelist(gfp, pol); |
| 1695 | if (unlikely(mpol_needs_cond_ref(pol))) { | 1770 | if (unlikely(mpol_needs_cond_ref(pol))) { |
| @@ -1699,12 +1774,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
| 1699 | struct page *page = __alloc_pages_nodemask(gfp, 0, | 1774 | struct page *page = __alloc_pages_nodemask(gfp, 0, |
| 1700 | zl, policy_nodemask(gfp, pol)); | 1775 | zl, policy_nodemask(gfp, pol)); |
| 1701 | __mpol_put(pol); | 1776 | __mpol_put(pol); |
| 1777 | put_mems_allowed(); | ||
| 1702 | return page; | 1778 | return page; |
| 1703 | } | 1779 | } |
| 1704 | /* | 1780 | /* |
| 1705 | * fast path: default or task policy | 1781 | * fast path: default or task policy |
| 1706 | */ | 1782 | */ |
| 1707 | return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); | 1783 | page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); |
| 1784 | put_mems_allowed(); | ||
| 1785 | return page; | ||
| 1708 | } | 1786 | } |
| 1709 | 1787 | ||
| 1710 | /** | 1788 | /** |
| @@ -1729,18 +1807,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
| 1729 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) | 1807 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) |
| 1730 | { | 1808 | { |
| 1731 | struct mempolicy *pol = current->mempolicy; | 1809 | struct mempolicy *pol = current->mempolicy; |
| 1810 | struct page *page; | ||
| 1732 | 1811 | ||
| 1733 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 1812 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) |
| 1734 | pol = &default_policy; | 1813 | pol = &default_policy; |
| 1735 | 1814 | ||
| 1815 | get_mems_allowed(); | ||
| 1736 | /* | 1816 | /* |
| 1737 | * No reference counting needed for current->mempolicy | 1817 | * No reference counting needed for current->mempolicy |
| 1738 | * nor system default_policy | 1818 | * nor system default_policy |
| 1739 | */ | 1819 | */ |
| 1740 | if (pol->mode == MPOL_INTERLEAVE) | 1820 | if (pol->mode == MPOL_INTERLEAVE) |
| 1741 | return alloc_page_interleave(gfp, order, interleave_nodes(pol)); | 1821 | page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); |
| 1742 | return __alloc_pages_nodemask(gfp, order, | 1822 | else |
| 1823 | page = __alloc_pages_nodemask(gfp, order, | ||
| 1743 | policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); | 1824 | policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); |
| 1825 | put_mems_allowed(); | ||
| 1826 | return page; | ||
| 1744 | } | 1827 | } |
| 1745 | EXPORT_SYMBOL(alloc_pages_current); | 1828 | EXPORT_SYMBOL(alloc_pages_current); |
| 1746 | 1829 | ||
| @@ -1750,6 +1833,9 @@ EXPORT_SYMBOL(alloc_pages_current); | |||
| 1750 | * with the mems_allowed returned by cpuset_mems_allowed(). This | 1833 | * with the mems_allowed returned by cpuset_mems_allowed(). This |
| 1751 | * keeps mempolicies cpuset relative after its cpuset moves. See | 1834 | * keeps mempolicies cpuset relative after its cpuset moves. See |
| 1752 | * further kernel/cpuset.c update_nodemask(). | 1835 | * further kernel/cpuset.c update_nodemask(). |
| 1836 | * | ||
| 1837 | * current's mempolicy may be rebinded by the other task(the task that changes | ||
| 1838 | * cpuset's mems), so we needn't do rebind work for current task. | ||
| 1753 | */ | 1839 | */ |
| 1754 | 1840 | ||
| 1755 | /* Slow path of a mempolicy duplicate */ | 1841 | /* Slow path of a mempolicy duplicate */ |
| @@ -1759,13 +1845,24 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) | |||
| 1759 | 1845 | ||
| 1760 | if (!new) | 1846 | if (!new) |
| 1761 | return ERR_PTR(-ENOMEM); | 1847 | return ERR_PTR(-ENOMEM); |
| 1848 | |||
| 1849 | /* task's mempolicy is protected by alloc_lock */ | ||
| 1850 | if (old == current->mempolicy) { | ||
| 1851 | task_lock(current); | ||
| 1852 | *new = *old; | ||
| 1853 | task_unlock(current); | ||
| 1854 | } else | ||
| 1855 | *new = *old; | ||
| 1856 | |||
| 1762 | rcu_read_lock(); | 1857 | rcu_read_lock(); |
| 1763 | if (current_cpuset_is_being_rebound()) { | 1858 | if (current_cpuset_is_being_rebound()) { |
| 1764 | nodemask_t mems = cpuset_mems_allowed(current); | 1859 | nodemask_t mems = cpuset_mems_allowed(current); |
| 1765 | mpol_rebind_policy(old, &mems); | 1860 | if (new->flags & MPOL_F_REBINDING) |
| 1861 | mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); | ||
| 1862 | else | ||
| 1863 | mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); | ||
| 1766 | } | 1864 | } |
| 1767 | rcu_read_unlock(); | 1865 | rcu_read_unlock(); |
| 1768 | *new = *old; | ||
| 1769 | atomic_set(&new->refcnt, 1); | 1866 | atomic_set(&new->refcnt, 1); |
| 1770 | return new; | 1867 | return new; |
| 1771 | } | 1868 | } |
| @@ -1792,16 +1889,6 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, | |||
| 1792 | return tompol; | 1889 | return tompol; |
| 1793 | } | 1890 | } |
| 1794 | 1891 | ||
| 1795 | static int mpol_match_intent(const struct mempolicy *a, | ||
| 1796 | const struct mempolicy *b) | ||
| 1797 | { | ||
| 1798 | if (a->flags != b->flags) | ||
| 1799 | return 0; | ||
| 1800 | if (!mpol_store_user_nodemask(a)) | ||
| 1801 | return 1; | ||
| 1802 | return nodes_equal(a->w.user_nodemask, b->w.user_nodemask); | ||
| 1803 | } | ||
| 1804 | |||
| 1805 | /* Slow path of a mempolicy comparison */ | 1892 | /* Slow path of a mempolicy comparison */ |
| 1806 | int __mpol_equal(struct mempolicy *a, struct mempolicy *b) | 1893 | int __mpol_equal(struct mempolicy *a, struct mempolicy *b) |
| 1807 | { | 1894 | { |
| @@ -1809,8 +1896,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) | |||
| 1809 | return 0; | 1896 | return 0; |
| 1810 | if (a->mode != b->mode) | 1897 | if (a->mode != b->mode) |
| 1811 | return 0; | 1898 | return 0; |
| 1812 | if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b)) | 1899 | if (a->flags != b->flags) |
| 1813 | return 0; | 1900 | return 0; |
| 1901 | if (mpol_store_user_nodemask(a)) | ||
| 1902 | if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) | ||
| 1903 | return 0; | ||
| 1904 | |||
| 1814 | switch (a->mode) { | 1905 | switch (a->mode) { |
| 1815 | case MPOL_BIND: | 1906 | case MPOL_BIND: |
| 1816 | /* Fall through */ | 1907 | /* Fall through */ |
| @@ -2006,26 +2097,22 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | |||
| 2006 | return; | 2097 | return; |
| 2007 | /* contextualize the tmpfs mount point mempolicy */ | 2098 | /* contextualize the tmpfs mount point mempolicy */ |
| 2008 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); | 2099 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); |
| 2009 | if (IS_ERR(new)) { | 2100 | if (IS_ERR(new)) |
| 2010 | mpol_put(mpol); /* drop our ref on sb mpol */ | 2101 | goto put_free; /* no valid nodemask intersection */ |
| 2011 | NODEMASK_SCRATCH_FREE(scratch); | ||
| 2012 | return; /* no valid nodemask intersection */ | ||
| 2013 | } | ||
| 2014 | 2102 | ||
| 2015 | task_lock(current); | 2103 | task_lock(current); |
| 2016 | ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); | 2104 | ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); |
| 2017 | task_unlock(current); | 2105 | task_unlock(current); |
| 2018 | mpol_put(mpol); /* drop our ref on sb mpol */ | 2106 | mpol_put(mpol); /* drop our ref on sb mpol */ |
| 2019 | if (ret) { | 2107 | if (ret) |
| 2020 | NODEMASK_SCRATCH_FREE(scratch); | 2108 | goto put_free; |
| 2021 | mpol_put(new); | ||
| 2022 | return; | ||
| 2023 | } | ||
| 2024 | 2109 | ||
| 2025 | /* Create pseudo-vma that contains just the policy */ | 2110 | /* Create pseudo-vma that contains just the policy */ |
| 2026 | memset(&pvma, 0, sizeof(struct vm_area_struct)); | 2111 | memset(&pvma, 0, sizeof(struct vm_area_struct)); |
| 2027 | pvma.vm_end = TASK_SIZE; /* policy covers entire file */ | 2112 | pvma.vm_end = TASK_SIZE; /* policy covers entire file */ |
| 2028 | mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ | 2113 | mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ |
| 2114 | |||
| 2115 | put_free: | ||
| 2029 | mpol_put(new); /* drop initial ref */ | 2116 | mpol_put(new); /* drop initial ref */ |
| 2030 | NODEMASK_SCRATCH_FREE(scratch); | 2117 | NODEMASK_SCRATCH_FREE(scratch); |
| 2031 | } | 2118 | } |
| @@ -2132,9 +2219,15 @@ void numa_default_policy(void) | |||
| 2132 | * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag | 2219 | * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag |
| 2133 | * Used only for mpol_parse_str() and mpol_to_str() | 2220 | * Used only for mpol_parse_str() and mpol_to_str() |
| 2134 | */ | 2221 | */ |
| 2135 | #define MPOL_LOCAL (MPOL_INTERLEAVE + 1) | 2222 | #define MPOL_LOCAL MPOL_MAX |
| 2136 | static const char * const policy_types[] = | 2223 | static const char * const policy_modes[] = |
| 2137 | { "default", "prefer", "bind", "interleave", "local" }; | 2224 | { |
| 2225 | [MPOL_DEFAULT] = "default", | ||
| 2226 | [MPOL_PREFERRED] = "prefer", | ||
| 2227 | [MPOL_BIND] = "bind", | ||
| 2228 | [MPOL_INTERLEAVE] = "interleave", | ||
| 2229 | [MPOL_LOCAL] = "local" | ||
| 2230 | }; | ||
| 2138 | 2231 | ||
| 2139 | 2232 | ||
| 2140 | #ifdef CONFIG_TMPFS | 2233 | #ifdef CONFIG_TMPFS |
| @@ -2159,12 +2252,11 @@ static const char * const policy_types[] = | |||
| 2159 | int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | 2252 | int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) |
| 2160 | { | 2253 | { |
| 2161 | struct mempolicy *new = NULL; | 2254 | struct mempolicy *new = NULL; |
| 2162 | unsigned short uninitialized_var(mode); | 2255 | unsigned short mode; |
| 2163 | unsigned short uninitialized_var(mode_flags); | 2256 | unsigned short uninitialized_var(mode_flags); |
| 2164 | nodemask_t nodes; | 2257 | nodemask_t nodes; |
| 2165 | char *nodelist = strchr(str, ':'); | 2258 | char *nodelist = strchr(str, ':'); |
| 2166 | char *flags = strchr(str, '='); | 2259 | char *flags = strchr(str, '='); |
| 2167 | int i; | ||
| 2168 | int err = 1; | 2260 | int err = 1; |
| 2169 | 2261 | ||
| 2170 | if (nodelist) { | 2262 | if (nodelist) { |
| @@ -2180,13 +2272,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
| 2180 | if (flags) | 2272 | if (flags) |
| 2181 | *flags++ = '\0'; /* terminate mode string */ | 2273 | *flags++ = '\0'; /* terminate mode string */ |
| 2182 | 2274 | ||
| 2183 | for (i = 0; i <= MPOL_LOCAL; i++) { | 2275 | for (mode = 0; mode <= MPOL_LOCAL; mode++) { |
| 2184 | if (!strcmp(str, policy_types[i])) { | 2276 | if (!strcmp(str, policy_modes[mode])) { |
| 2185 | mode = i; | ||
| 2186 | break; | 2277 | break; |
| 2187 | } | 2278 | } |
| 2188 | } | 2279 | } |
| 2189 | if (i > MPOL_LOCAL) | 2280 | if (mode > MPOL_LOCAL) |
| 2190 | goto out; | 2281 | goto out; |
| 2191 | 2282 | ||
| 2192 | switch (mode) { | 2283 | switch (mode) { |
| @@ -2250,7 +2341,10 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
| 2250 | if (IS_ERR(new)) | 2341 | if (IS_ERR(new)) |
| 2251 | goto out; | 2342 | goto out; |
| 2252 | 2343 | ||
| 2253 | { | 2344 | if (no_context) { |
| 2345 | /* save for contextualization */ | ||
| 2346 | new->w.user_nodemask = nodes; | ||
| 2347 | } else { | ||
| 2254 | int ret; | 2348 | int ret; |
| 2255 | NODEMASK_SCRATCH(scratch); | 2349 | NODEMASK_SCRATCH(scratch); |
| 2256 | if (scratch) { | 2350 | if (scratch) { |
| @@ -2266,10 +2360,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
| 2266 | } | 2360 | } |
| 2267 | } | 2361 | } |
| 2268 | err = 0; | 2362 | err = 0; |
| 2269 | if (no_context) { | ||
| 2270 | /* save for contextualization */ | ||
| 2271 | new->w.user_nodemask = nodes; | ||
| 2272 | } | ||
| 2273 | 2363 | ||
| 2274 | out: | 2364 | out: |
| 2275 | /* Restore string for error message */ | 2365 | /* Restore string for error message */ |
| @@ -2338,11 +2428,11 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) | |||
| 2338 | BUG(); | 2428 | BUG(); |
| 2339 | } | 2429 | } |
| 2340 | 2430 | ||
| 2341 | l = strlen(policy_types[mode]); | 2431 | l = strlen(policy_modes[mode]); |
| 2342 | if (buffer + maxlen < p + l + 1) | 2432 | if (buffer + maxlen < p + l + 1) |
| 2343 | return -ENOSPC; | 2433 | return -ENOSPC; |
| 2344 | 2434 | ||
| 2345 | strcpy(p, policy_types[mode]); | 2435 | strcpy(p, policy_modes[mode]); |
| 2346 | p += l; | 2436 | p += l; |
| 2347 | 2437 | ||
| 2348 | if (flags & MPOL_MODE_FLAGS) { | 2438 | if (flags & MPOL_MODE_FLAGS) { |
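A minimal user-space sketch of the lookup pattern introduced in the hunk above: policy_types becomes the designated-initializer array policy_modes, and mpol_parse_str() now reuses the loop index directly as the mode. The enum ordering mirrors the kernel's mode numbering, but the input string and the program around it are illustrative only, not kernel code.

#include <stdio.h>
#include <string.h>

enum { MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE, MPOL_LOCAL };

static const char * const policy_modes[] = {
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_LOCAL]      = "local",
};

int main(void)
{
	const char *str = "interleave";	/* e.g. from a tmpfs mpol= mount option */
	unsigned short mode;

	/* same loop shape as mpol_parse_str(): the index is the mode */
	for (mode = 0; mode <= MPOL_LOCAL; mode++)
		if (!strcmp(str, policy_modes[mode]))
			break;

	if (mode > MPOL_LOCAL)
		printf("unrecognized policy string\n");
	else
		printf("mode = %u (%s)\n", mode, policy_modes[mode]);
	return 0;
}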
diff --git a/mm/migrate.c b/mm/migrate.c index d3f3f7f81075..09e2471afa0f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -40,7 +40,8 @@ | |||
| 40 | 40 | ||
| 41 | /* | 41 | /* |
| 42 | * migrate_prep() needs to be called before we start compiling a list of pages | 42 | * migrate_prep() needs to be called before we start compiling a list of pages |
| 43 | * to be migrated using isolate_lru_page(). | 43 | * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is |
| 44 | * undesirable, use migrate_prep_local() | ||
| 44 | */ | 45 | */ |
| 45 | int migrate_prep(void) | 46 | int migrate_prep(void) |
| 46 | { | 47 | { |
| @@ -55,26 +56,29 @@ int migrate_prep(void) | |||
| 55 | return 0; | 56 | return 0; |
| 56 | } | 57 | } |
| 57 | 58 | ||
| 59 | /* Do the necessary work of migrate_prep but not if it involves other CPUs */ | ||
| 60 | int migrate_prep_local(void) | ||
| 61 | { | ||
| 62 | lru_add_drain(); | ||
| 63 | |||
| 64 | return 0; | ||
| 65 | } | ||
| 66 | |||
| 58 | /* | 67 | /* |
| 59 | * Add isolated pages on the list back to the LRU under page lock | 68 | * Add isolated pages on the list back to the LRU under page lock |
| 60 | * to avoid leaking evictable pages back onto unevictable list. | 69 | * to avoid leaking evictable pages back onto unevictable list. |
| 61 | * | ||
| 62 | * returns the number of pages put back. | ||
| 63 | */ | 70 | */ |
| 64 | int putback_lru_pages(struct list_head *l) | 71 | void putback_lru_pages(struct list_head *l) |
| 65 | { | 72 | { |
| 66 | struct page *page; | 73 | struct page *page; |
| 67 | struct page *page2; | 74 | struct page *page2; |
| 68 | int count = 0; | ||
| 69 | 75 | ||
| 70 | list_for_each_entry_safe(page, page2, l, lru) { | 76 | list_for_each_entry_safe(page, page2, l, lru) { |
| 71 | list_del(&page->lru); | 77 | list_del(&page->lru); |
| 72 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 78 | dec_zone_page_state(page, NR_ISOLATED_ANON + |
| 73 | page_is_file_cache(page)); | 79 | page_is_file_cache(page)); |
| 74 | putback_lru_page(page); | 80 | putback_lru_page(page); |
| 75 | count++; | ||
| 76 | } | 81 | } |
| 77 | return count; | ||
| 78 | } | 82 | } |
| 79 | 83 | ||
| 80 | /* | 84 | /* |
| @@ -490,7 +494,8 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
| 490 | * < 0 - error code | 494 | * < 0 - error code |
| 491 | * == 0 - success | 495 | * == 0 - success |
| 492 | */ | 496 | */ |
| 493 | static int move_to_new_page(struct page *newpage, struct page *page) | 497 | static int move_to_new_page(struct page *newpage, struct page *page, |
| 498 | int remap_swapcache) | ||
| 494 | { | 499 | { |
| 495 | struct address_space *mapping; | 500 | struct address_space *mapping; |
| 496 | int rc; | 501 | int rc; |
| @@ -525,10 +530,12 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
| 525 | else | 530 | else |
| 526 | rc = fallback_migrate_page(mapping, newpage, page); | 531 | rc = fallback_migrate_page(mapping, newpage, page); |
| 527 | 532 | ||
| 528 | if (!rc) | 533 | if (rc) { |
| 529 | remove_migration_ptes(page, newpage); | ||
| 530 | else | ||
| 531 | newpage->mapping = NULL; | 534 | newpage->mapping = NULL; |
| 535 | } else { | ||
| 536 | if (remap_swapcache) | ||
| 537 | remove_migration_ptes(page, newpage); | ||
| 538 | } | ||
| 532 | 539 | ||
| 533 | unlock_page(newpage); | 540 | unlock_page(newpage); |
| 534 | 541 | ||
| @@ -545,9 +552,11 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 545 | int rc = 0; | 552 | int rc = 0; |
| 546 | int *result = NULL; | 553 | int *result = NULL; |
| 547 | struct page *newpage = get_new_page(page, private, &result); | 554 | struct page *newpage = get_new_page(page, private, &result); |
| 555 | int remap_swapcache = 1; | ||
| 548 | int rcu_locked = 0; | 556 | int rcu_locked = 0; |
| 549 | int charge = 0; | 557 | int charge = 0; |
| 550 | struct mem_cgroup *mem = NULL; | 558 | struct mem_cgroup *mem = NULL; |
| 559 | struct anon_vma *anon_vma = NULL; | ||
| 551 | 560 | ||
| 552 | if (!newpage) | 561 | if (!newpage) |
| 553 | return -ENOMEM; | 562 | return -ENOMEM; |
| @@ -604,6 +613,34 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 604 | if (PageAnon(page)) { | 613 | if (PageAnon(page)) { |
| 605 | rcu_read_lock(); | 614 | rcu_read_lock(); |
| 606 | rcu_locked = 1; | 615 | rcu_locked = 1; |
| 616 | |||
| 617 | /* Determine how to safely use anon_vma */ | ||
| 618 | if (!page_mapped(page)) { | ||
| 619 | if (!PageSwapCache(page)) | ||
| 620 | goto rcu_unlock; | ||
| 621 | |||
| 622 | /* | ||
| 623 | * We cannot be sure that the anon_vma of an unmapped | ||
| 624 | * swapcache page is safe to use because we don't | ||
| 625 | * know in advance if the VMA that this page belonged | ||
| 626 | * to still exists. If the VMA and others sharing the | ||
| 627 | * data have been freed, then the anon_vma could | ||
| 628 | * already be invalid. | ||
| 629 | * | ||
| 630 | * To avoid this possibility, swapcache pages get | ||
| 631 | * migrated but are not remapped when migration | ||
| 632 | * completes | ||
| 633 | */ | ||
| 634 | remap_swapcache = 0; | ||
| 635 | } else { | ||
| 636 | /* | ||
| 637 | * Take a reference count on the anon_vma if the | ||
| 638 | * page is mapped so that it is guaranteed to | ||
| 639 | * exist when the page is remapped later | ||
| 640 | */ | ||
| 641 | anon_vma = page_anon_vma(page); | ||
| 642 | atomic_inc(&anon_vma->external_refcount); | ||
| 643 | } | ||
| 607 | } | 644 | } |
| 608 | 645 | ||
| 609 | /* | 646 | /* |
| @@ -638,11 +675,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 638 | 675 | ||
| 639 | skip_unmap: | 676 | skip_unmap: |
| 640 | if (!page_mapped(page)) | 677 | if (!page_mapped(page)) |
| 641 | rc = move_to_new_page(newpage, page); | 678 | rc = move_to_new_page(newpage, page, remap_swapcache); |
| 642 | 679 | ||
| 643 | if (rc) | 680 | if (rc && remap_swapcache) |
| 644 | remove_migration_ptes(page, page); | 681 | remove_migration_ptes(page, page); |
| 645 | rcu_unlock: | 682 | rcu_unlock: |
| 683 | |||
| 684 | /* Drop an anon_vma reference if we took one */ | ||
| 685 | if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { | ||
| 686 | int empty = list_empty(&anon_vma->head); | ||
| 687 | spin_unlock(&anon_vma->lock); | ||
| 688 | if (empty) | ||
| 689 | anon_vma_free(anon_vma); | ||
| 690 | } | ||
| 691 | |||
| 646 | if (rcu_locked) | 692 | if (rcu_locked) |
| 647 | rcu_read_unlock(); | 693 | rcu_read_unlock(); |
| 648 | uncharge: | 694 | uncharge: |
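The anon_vma reference taken in unmap_and_move() is dropped with atomic_dec_and_lock(): decrement the external refcount and, if it reached zero, re-check under the lock whether the anon_vma is empty and can be freed. Below is a simplified user-space analogue of that put path using C11 atomics and a pthread mutex; all names are my own, and the real atomic_dec_and_lock() is more careful about racing with concurrent increments.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int refcount;		/* stands in for anon_vma->external_refcount */
	pthread_mutex_t lock;		/* stands in for anon_vma->lock */
	int empty;			/* stands in for list_empty(&anon_vma->head) */
};

/* drop a reference; if it was the last one, return with the lock held */
static int dec_and_lock(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcount, 1) != 1)
		return 0;
	pthread_mutex_lock(&o->lock);
	return 1;
}

static void put_obj(struct obj *o)
{
	if (dec_and_lock(o)) {
		int empty = o->empty;
		pthread_mutex_unlock(&o->lock);
		if (empty)
			free(o);	/* analogue of anon_vma_free() */
	}
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	if (!o)
		return 1;
	atomic_init(&o->refcount, 1);
	pthread_mutex_init(&o->lock, NULL);
	o->empty = 1;

	put_obj(o);			/* last reference: object is freed */
	puts("dropped last reference");
	return 0;
}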
diff --git a/mm/mincore.c b/mm/mincore.c index f77433c20279..9ac42dc6d7b6 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
| @@ -19,6 +19,40 @@ | |||
| 19 | #include <asm/uaccess.h> | 19 | #include <asm/uaccess.h> |
| 20 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
| 21 | 21 | ||
| 22 | static void mincore_hugetlb_page_range(struct vm_area_struct *vma, | ||
| 23 | unsigned long addr, unsigned long end, | ||
| 24 | unsigned char *vec) | ||
| 25 | { | ||
| 26 | #ifdef CONFIG_HUGETLB_PAGE | ||
| 27 | struct hstate *h; | ||
| 28 | |||
| 29 | h = hstate_vma(vma); | ||
| 30 | while (1) { | ||
| 31 | unsigned char present; | ||
| 32 | pte_t *ptep; | ||
| 33 | /* | ||
| 34 | * Huge pages are always in RAM for now, but | ||
| 35 | * theoretically it needs to be checked. | ||
| 36 | */ | ||
| 37 | ptep = huge_pte_offset(current->mm, | ||
| 38 | addr & huge_page_mask(h)); | ||
| 39 | present = ptep && !huge_pte_none(huge_ptep_get(ptep)); | ||
| 40 | while (1) { | ||
| 41 | *vec = present; | ||
| 42 | vec++; | ||
| 43 | addr += PAGE_SIZE; | ||
| 44 | if (addr == end) | ||
| 45 | return; | ||
| 46 | /* check hugepage border */ | ||
| 47 | if (!(addr & ~huge_page_mask(h))) | ||
| 48 | break; | ||
| 49 | } | ||
| 50 | } | ||
| 51 | #else | ||
| 52 | BUG(); | ||
| 53 | #endif | ||
| 54 | } | ||
| 55 | |||
| 22 | /* | 56 | /* |
| 23 | * Later we can get more picky about what "in core" means precisely. | 57 | * Later we can get more picky about what "in core" means precisely. |
| 24 | * For now, simply check to see if the page is in the page cache, | 58 | * For now, simply check to see if the page is in the page cache, |
| @@ -49,145 +83,150 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) | |||
| 49 | return present; | 83 | return present; |
| 50 | } | 84 | } |
| 51 | 85 | ||
| 52 | /* | 86 | static void mincore_unmapped_range(struct vm_area_struct *vma, |
| 53 | * Do a chunk of "sys_mincore()". We've already checked | 87 | unsigned long addr, unsigned long end, |
| 54 | * all the arguments, we hold the mmap semaphore: we should | 88 | unsigned char *vec) |
| 55 | * just return the amount of info we're asked for. | ||
| 56 | */ | ||
| 57 | static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages) | ||
| 58 | { | 89 | { |
| 59 | pgd_t *pgd; | 90 | unsigned long nr = (end - addr) >> PAGE_SHIFT; |
| 60 | pud_t *pud; | ||
| 61 | pmd_t *pmd; | ||
| 62 | pte_t *ptep; | ||
| 63 | spinlock_t *ptl; | ||
| 64 | unsigned long nr; | ||
| 65 | int i; | 91 | int i; |
| 66 | pgoff_t pgoff; | ||
| 67 | struct vm_area_struct *vma = find_vma(current->mm, addr); | ||
| 68 | 92 | ||
| 69 | /* | 93 | if (vma->vm_file) { |
| 70 | * find_vma() didn't find anything above us, or we're | 94 | pgoff_t pgoff; |
| 71 | * in an unmapped hole in the address space: ENOMEM. | ||
| 72 | */ | ||
| 73 | if (!vma || addr < vma->vm_start) | ||
| 74 | return -ENOMEM; | ||
| 75 | |||
| 76 | #ifdef CONFIG_HUGETLB_PAGE | ||
| 77 | if (is_vm_hugetlb_page(vma)) { | ||
| 78 | struct hstate *h; | ||
| 79 | unsigned long nr_huge; | ||
| 80 | unsigned char present; | ||
| 81 | 95 | ||
| 82 | i = 0; | 96 | pgoff = linear_page_index(vma, addr); |
| 83 | nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); | 97 | for (i = 0; i < nr; i++, pgoff++) |
| 84 | h = hstate_vma(vma); | 98 | vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); |
| 85 | nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h)) | 99 | } else { |
| 86 | - (addr >> huge_page_shift(h)) + 1; | 100 | for (i = 0; i < nr; i++) |
| 87 | nr_huge = min(nr_huge, | 101 | vec[i] = 0; |
| 88 | (vma->vm_end - addr) >> huge_page_shift(h)); | ||
| 89 | while (1) { | ||
| 90 | /* hugepage always in RAM for now, | ||
| 91 | * but generally it needs to be check */ | ||
| 92 | ptep = huge_pte_offset(current->mm, | ||
| 93 | addr & huge_page_mask(h)); | ||
| 94 | present = !!(ptep && | ||
| 95 | !huge_pte_none(huge_ptep_get(ptep))); | ||
| 96 | while (1) { | ||
| 97 | vec[i++] = present; | ||
| 98 | addr += PAGE_SIZE; | ||
| 99 | /* reach buffer limit */ | ||
| 100 | if (i == nr) | ||
| 101 | return nr; | ||
| 102 | /* check hugepage border */ | ||
| 103 | if (!((addr & ~huge_page_mask(h)) | ||
| 104 | >> PAGE_SHIFT)) | ||
| 105 | break; | ||
| 106 | } | ||
| 107 | } | ||
| 108 | return nr; | ||
| 109 | } | 102 | } |
| 110 | #endif | 103 | } |
| 111 | |||
| 112 | /* | ||
| 113 | * Calculate how many pages there are left in the last level of the | ||
| 114 | * PTE array for our address. | ||
| 115 | */ | ||
| 116 | nr = PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1)); | ||
| 117 | |||
| 118 | /* | ||
| 119 | * Don't overrun this vma | ||
| 120 | */ | ||
| 121 | nr = min(nr, (vma->vm_end - addr) >> PAGE_SHIFT); | ||
| 122 | |||
| 123 | /* | ||
| 124 | * Don't return more than the caller asked for | ||
| 125 | */ | ||
| 126 | nr = min(nr, pages); | ||
| 127 | 104 | ||
| 128 | pgd = pgd_offset(vma->vm_mm, addr); | 105 | static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
| 129 | if (pgd_none_or_clear_bad(pgd)) | 106 | unsigned long addr, unsigned long end, |
| 130 | goto none_mapped; | 107 | unsigned char *vec) |
| 131 | pud = pud_offset(pgd, addr); | 108 | { |
| 132 | if (pud_none_or_clear_bad(pud)) | 109 | unsigned long next; |
| 133 | goto none_mapped; | 110 | spinlock_t *ptl; |
| 134 | pmd = pmd_offset(pud, addr); | 111 | pte_t *ptep; |
| 135 | if (pmd_none_or_clear_bad(pmd)) | ||
| 136 | goto none_mapped; | ||
| 137 | 112 | ||
| 138 | ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 113 | ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
| 139 | for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) { | 114 | do { |
| 140 | unsigned char present; | ||
| 141 | pte_t pte = *ptep; | 115 | pte_t pte = *ptep; |
| 116 | pgoff_t pgoff; | ||
| 142 | 117 | ||
| 143 | if (pte_present(pte)) { | 118 | next = addr + PAGE_SIZE; |
| 144 | present = 1; | 119 | if (pte_none(pte)) |
| 145 | 120 | mincore_unmapped_range(vma, addr, next, vec); | |
| 146 | } else if (pte_none(pte)) { | 121 | else if (pte_present(pte)) |
| 147 | if (vma->vm_file) { | 122 | *vec = 1; |
| 148 | pgoff = linear_page_index(vma, addr); | 123 | else if (pte_file(pte)) { |
| 149 | present = mincore_page(vma->vm_file->f_mapping, | ||
| 150 | pgoff); | ||
| 151 | } else | ||
| 152 | present = 0; | ||
| 153 | |||
| 154 | } else if (pte_file(pte)) { | ||
| 155 | pgoff = pte_to_pgoff(pte); | 124 | pgoff = pte_to_pgoff(pte); |
| 156 | present = mincore_page(vma->vm_file->f_mapping, pgoff); | 125 | *vec = mincore_page(vma->vm_file->f_mapping, pgoff); |
| 157 | |||
| 158 | } else { /* pte is a swap entry */ | 126 | } else { /* pte is a swap entry */ |
| 159 | swp_entry_t entry = pte_to_swp_entry(pte); | 127 | swp_entry_t entry = pte_to_swp_entry(pte); |
| 128 | |||
| 160 | if (is_migration_entry(entry)) { | 129 | if (is_migration_entry(entry)) { |
| 161 | /* migration entries are always uptodate */ | 130 | /* migration entries are always uptodate */ |
| 162 | present = 1; | 131 | *vec = 1; |
| 163 | } else { | 132 | } else { |
| 164 | #ifdef CONFIG_SWAP | 133 | #ifdef CONFIG_SWAP |
| 165 | pgoff = entry.val; | 134 | pgoff = entry.val; |
| 166 | present = mincore_page(&swapper_space, pgoff); | 135 | *vec = mincore_page(&swapper_space, pgoff); |
| 167 | #else | 136 | #else |
| 168 | WARN_ON(1); | 137 | WARN_ON(1); |
| 169 | present = 1; | 138 | *vec = 1; |
| 170 | #endif | 139 | #endif |
| 171 | } | 140 | } |
| 172 | } | 141 | } |
| 142 | vec++; | ||
| 143 | } while (ptep++, addr = next, addr != end); | ||
| 144 | pte_unmap_unlock(ptep - 1, ptl); | ||
| 145 | } | ||
| 173 | 146 | ||
| 174 | vec[i] = present; | 147 | static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
| 175 | } | 148 | unsigned long addr, unsigned long end, |
| 176 | pte_unmap_unlock(ptep-1, ptl); | 149 | unsigned char *vec) |
| 150 | { | ||
| 151 | unsigned long next; | ||
| 152 | pmd_t *pmd; | ||
| 177 | 153 | ||
| 178 | return nr; | 154 | pmd = pmd_offset(pud, addr); |
| 155 | do { | ||
| 156 | next = pmd_addr_end(addr, end); | ||
| 157 | if (pmd_none_or_clear_bad(pmd)) | ||
| 158 | mincore_unmapped_range(vma, addr, next, vec); | ||
| 159 | else | ||
| 160 | mincore_pte_range(vma, pmd, addr, next, vec); | ||
| 161 | vec += (next - addr) >> PAGE_SHIFT; | ||
| 162 | } while (pmd++, addr = next, addr != end); | ||
| 163 | } | ||
| 179 | 164 | ||
| 180 | none_mapped: | 165 | static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
| 181 | if (vma->vm_file) { | 166 | unsigned long addr, unsigned long end, |
| 182 | pgoff = linear_page_index(vma, addr); | 167 | unsigned char *vec) |
| 183 | for (i = 0; i < nr; i++, pgoff++) | 168 | { |
| 184 | vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); | 169 | unsigned long next; |
| 185 | } else { | 170 | pud_t *pud; |
| 186 | for (i = 0; i < nr; i++) | 171 | |
| 187 | vec[i] = 0; | 172 | pud = pud_offset(pgd, addr); |
| 173 | do { | ||
| 174 | next = pud_addr_end(addr, end); | ||
| 175 | if (pud_none_or_clear_bad(pud)) | ||
| 176 | mincore_unmapped_range(vma, addr, next, vec); | ||
| 177 | else | ||
| 178 | mincore_pmd_range(vma, pud, addr, next, vec); | ||
| 179 | vec += (next - addr) >> PAGE_SHIFT; | ||
| 180 | } while (pud++, addr = next, addr != end); | ||
| 181 | } | ||
| 182 | |||
| 183 | static void mincore_page_range(struct vm_area_struct *vma, | ||
| 184 | unsigned long addr, unsigned long end, | ||
| 185 | unsigned char *vec) | ||
| 186 | { | ||
| 187 | unsigned long next; | ||
| 188 | pgd_t *pgd; | ||
| 189 | |||
| 190 | pgd = pgd_offset(vma->vm_mm, addr); | ||
| 191 | do { | ||
| 192 | next = pgd_addr_end(addr, end); | ||
| 193 | if (pgd_none_or_clear_bad(pgd)) | ||
| 194 | mincore_unmapped_range(vma, addr, next, vec); | ||
| 195 | else | ||
| 196 | mincore_pud_range(vma, pgd, addr, next, vec); | ||
| 197 | vec += (next - addr) >> PAGE_SHIFT; | ||
| 198 | } while (pgd++, addr = next, addr != end); | ||
| 199 | } | ||
| 200 | |||
| 201 | /* | ||
| 202 | * Do a chunk of "sys_mincore()". We've already checked | ||
| 203 | * all the arguments, we hold the mmap semaphore: we should | ||
| 204 | * just return the amount of info we're asked for. | ||
| 205 | */ | ||
| 206 | static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec) | ||
| 207 | { | ||
| 208 | struct vm_area_struct *vma; | ||
| 209 | unsigned long end; | ||
| 210 | |||
| 211 | vma = find_vma(current->mm, addr); | ||
| 212 | if (!vma || addr < vma->vm_start) | ||
| 213 | return -ENOMEM; | ||
| 214 | |||
| 215 | end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); | ||
| 216 | |||
| 217 | if (is_vm_hugetlb_page(vma)) { | ||
| 218 | mincore_hugetlb_page_range(vma, addr, end, vec); | ||
| 219 | return (end - addr) >> PAGE_SHIFT; | ||
| 188 | } | 220 | } |
| 189 | 221 | ||
| 190 | return nr; | 222 | end = pmd_addr_end(addr, end); |
| 223 | |||
| 224 | if (is_vm_hugetlb_page(vma)) | ||
| 225 | mincore_hugetlb_page_range(vma, addr, end, vec); | ||
| 226 | else | ||
| 227 | mincore_page_range(vma, addr, end, vec); | ||
| 228 | |||
| 229 | return (end - addr) >> PAGE_SHIFT; | ||
| 191 | } | 230 | } |
| 192 | 231 | ||
| 193 | /* | 232 | /* |
| @@ -247,7 +286,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, | |||
| 247 | * the temporary buffer size. | 286 | * the temporary buffer size. |
| 248 | */ | 287 | */ |
| 249 | down_read(¤t->mm->mmap_sem); | 288 | down_read(¤t->mm->mmap_sem); |
| 250 | retval = do_mincore(start, tmp, min(pages, PAGE_SIZE)); | 289 | retval = do_mincore(start, min(pages, PAGE_SIZE), tmp); |
| 251 | up_read(¤t->mm->mmap_sem); | 290 | up_read(¤t->mm->mmap_sem); |
| 252 | 291 | ||
| 253 | if (retval <= 0) | 292 | if (retval <= 0) |
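For reference, the rewritten do_mincore() ultimately backs the mincore(2) system call. A small self-contained program showing what the per-page vector reports; the mapping size and touch pattern are arbitrary.

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t len = 16 * page;
	size_t i, resident = 0;
	unsigned char *vec;
	char *buf;

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0, 4 * page);	/* fault in only the first four pages */

	vec = malloc(len / page);
	if (!vec || mincore(buf, len, vec) != 0)
		return 1;

	for (i = 0; i < len / page; i++)
		resident += vec[i] & 1;	/* bit 0 of each byte: page resident */
	printf("%zu of %zu pages resident\n", resident, len / page);

	free(vec);
	munmap(buf, len);
	return 0;
}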
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a6326c71b663..08b349931ebc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -49,6 +49,7 @@ | |||
| 49 | #include <linux/debugobjects.h> | 49 | #include <linux/debugobjects.h> |
| 50 | #include <linux/kmemleak.h> | 50 | #include <linux/kmemleak.h> |
| 51 | #include <linux/memory.h> | 51 | #include <linux/memory.h> |
| 52 | #include <linux/compaction.h> | ||
| 52 | #include <trace/events/kmem.h> | 53 | #include <trace/events/kmem.h> |
| 53 | #include <linux/ftrace_event.h> | 54 | #include <linux/ftrace_event.h> |
| 54 | 55 | ||
| @@ -475,6 +476,8 @@ static inline void __free_one_page(struct page *page, | |||
| 475 | int migratetype) | 476 | int migratetype) |
| 476 | { | 477 | { |
| 477 | unsigned long page_idx; | 478 | unsigned long page_idx; |
| 479 | unsigned long combined_idx; | ||
| 480 | struct page *buddy; | ||
| 478 | 481 | ||
| 479 | if (unlikely(PageCompound(page))) | 482 | if (unlikely(PageCompound(page))) |
| 480 | if (unlikely(destroy_compound_page(page, order))) | 483 | if (unlikely(destroy_compound_page(page, order))) |
| @@ -488,9 +491,6 @@ static inline void __free_one_page(struct page *page, | |||
| 488 | VM_BUG_ON(bad_range(zone, page)); | 491 | VM_BUG_ON(bad_range(zone, page)); |
| 489 | 492 | ||
| 490 | while (order < MAX_ORDER-1) { | 493 | while (order < MAX_ORDER-1) { |
| 491 | unsigned long combined_idx; | ||
| 492 | struct page *buddy; | ||
| 493 | |||
| 494 | buddy = __page_find_buddy(page, page_idx, order); | 494 | buddy = __page_find_buddy(page, page_idx, order); |
| 495 | if (!page_is_buddy(page, buddy, order)) | 495 | if (!page_is_buddy(page, buddy, order)) |
| 496 | break; | 496 | break; |
| @@ -505,8 +505,29 @@ static inline void __free_one_page(struct page *page, | |||
| 505 | order++; | 505 | order++; |
| 506 | } | 506 | } |
| 507 | set_page_order(page, order); | 507 | set_page_order(page, order); |
| 508 | list_add(&page->lru, | 508 | |
| 509 | &zone->free_area[order].free_list[migratetype]); | 509 | /* |
| 510 | * If this is not the largest possible page, check if the buddy | ||
| 511 | * of the next-highest order is free. If it is, it's possible | ||
| 512 | * that pages are being freed that will coalesce soon. In case | ||
| 513 | * that is happening, add the free page to the tail of the list | ||
| 514 | * so it's less likely to be used soon and more likely to be merged | ||
| 515 | * as a higher order page | ||
| 516 | */ | ||
| 517 | if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { | ||
| 518 | struct page *higher_page, *higher_buddy; | ||
| 519 | combined_idx = __find_combined_index(page_idx, order); | ||
| 520 | higher_page = page + combined_idx - page_idx; | ||
| 521 | higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); | ||
| 522 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { | ||
| 523 | list_add_tail(&page->lru, | ||
| 524 | &zone->free_area[order].free_list[migratetype]); | ||
| 525 | goto out; | ||
| 526 | } | ||
| 527 | } | ||
| 528 | |||
| 529 | list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); | ||
| 530 | out: | ||
| 510 | zone->free_area[order].nr_free++; | 531 | zone->free_area[order].nr_free++; |
| 511 | } | 532 | } |
| 512 | 533 | ||
| @@ -599,20 +620,23 @@ static void free_one_page(struct zone *zone, struct page *page, int order, | |||
| 599 | spin_unlock(&zone->lock); | 620 | spin_unlock(&zone->lock); |
| 600 | } | 621 | } |
| 601 | 622 | ||
| 602 | static void __free_pages_ok(struct page *page, unsigned int order) | 623 | static bool free_pages_prepare(struct page *page, unsigned int order) |
| 603 | { | 624 | { |
| 604 | unsigned long flags; | ||
| 605 | int i; | 625 | int i; |
| 606 | int bad = 0; | 626 | int bad = 0; |
| 607 | int wasMlocked = __TestClearPageMlocked(page); | ||
| 608 | 627 | ||
| 609 | trace_mm_page_free_direct(page, order); | 628 | trace_mm_page_free_direct(page, order); |
| 610 | kmemcheck_free_shadow(page, order); | 629 | kmemcheck_free_shadow(page, order); |
| 611 | 630 | ||
| 612 | for (i = 0 ; i < (1 << order) ; ++i) | 631 | for (i = 0; i < (1 << order); i++) { |
| 613 | bad += free_pages_check(page + i); | 632 | struct page *pg = page + i; |
| 633 | |||
| 634 | if (PageAnon(pg)) | ||
| 635 | pg->mapping = NULL; | ||
| 636 | bad += free_pages_check(pg); | ||
| 637 | } | ||
| 614 | if (bad) | 638 | if (bad) |
| 615 | return; | 639 | return false; |
| 616 | 640 | ||
| 617 | if (!PageHighMem(page)) { | 641 | if (!PageHighMem(page)) { |
| 618 | debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); | 642 | debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); |
| @@ -622,6 +646,17 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
| 622 | arch_free_page(page, order); | 646 | arch_free_page(page, order); |
| 623 | kernel_map_pages(page, 1 << order, 0); | 647 | kernel_map_pages(page, 1 << order, 0); |
| 624 | 648 | ||
| 649 | return true; | ||
| 650 | } | ||
| 651 | |||
| 652 | static void __free_pages_ok(struct page *page, unsigned int order) | ||
| 653 | { | ||
| 654 | unsigned long flags; | ||
| 655 | int wasMlocked = __TestClearPageMlocked(page); | ||
| 656 | |||
| 657 | if (!free_pages_prepare(page, order)) | ||
| 658 | return; | ||
| 659 | |||
| 625 | local_irq_save(flags); | 660 | local_irq_save(flags); |
| 626 | if (unlikely(wasMlocked)) | 661 | if (unlikely(wasMlocked)) |
| 627 | free_page_mlock(page); | 662 | free_page_mlock(page); |
| @@ -1107,21 +1142,9 @@ void free_hot_cold_page(struct page *page, int cold) | |||
| 1107 | int migratetype; | 1142 | int migratetype; |
| 1108 | int wasMlocked = __TestClearPageMlocked(page); | 1143 | int wasMlocked = __TestClearPageMlocked(page); |
| 1109 | 1144 | ||
| 1110 | trace_mm_page_free_direct(page, 0); | 1145 | if (!free_pages_prepare(page, 0)) |
| 1111 | kmemcheck_free_shadow(page, 0); | ||
| 1112 | |||
| 1113 | if (PageAnon(page)) | ||
| 1114 | page->mapping = NULL; | ||
| 1115 | if (free_pages_check(page)) | ||
| 1116 | return; | 1146 | return; |
| 1117 | 1147 | ||
| 1118 | if (!PageHighMem(page)) { | ||
| 1119 | debug_check_no_locks_freed(page_address(page), PAGE_SIZE); | ||
| 1120 | debug_check_no_obj_freed(page_address(page), PAGE_SIZE); | ||
| 1121 | } | ||
| 1122 | arch_free_page(page, 0); | ||
| 1123 | kernel_map_pages(page, 1, 0); | ||
| 1124 | |||
| 1125 | migratetype = get_pageblock_migratetype(page); | 1148 | migratetype = get_pageblock_migratetype(page); |
| 1126 | set_page_private(page, migratetype); | 1149 | set_page_private(page, migratetype); |
| 1127 | local_irq_save(flags); | 1150 | local_irq_save(flags); |
| @@ -1188,6 +1211,51 @@ void split_page(struct page *page, unsigned int order) | |||
| 1188 | } | 1211 | } |
| 1189 | 1212 | ||
| 1190 | /* | 1213 | /* |
| 1214 | * Similar to split_page except the page is already free. As this is only | ||
| 1215 | * being used for migration, the migratetype of the block also changes. | ||
| 1216 | * As this is called with interrupts disabled, the caller is responsible | ||
| 1217 | * for calling arch_alloc_page() and kernel_map_page() after interrupts | ||
| 1218 | * are enabled. | ||
| 1219 | * | ||
| 1220 | * Note: this is probably too low level an operation for use in drivers. | ||
| 1221 | * Please consult with lkml before using this in your driver. | ||
| 1222 | */ | ||
| 1223 | int split_free_page(struct page *page) | ||
| 1224 | { | ||
| 1225 | unsigned int order; | ||
| 1226 | unsigned long watermark; | ||
| 1227 | struct zone *zone; | ||
| 1228 | |||
| 1229 | BUG_ON(!PageBuddy(page)); | ||
| 1230 | |||
| 1231 | zone = page_zone(page); | ||
| 1232 | order = page_order(page); | ||
| 1233 | |||
| 1234 | /* Obey watermarks as if the page was being allocated */ | ||
| 1235 | watermark = low_wmark_pages(zone) + (1 << order); | ||
| 1236 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | ||
| 1237 | return 0; | ||
| 1238 | |||
| 1239 | /* Remove page from free list */ | ||
| 1240 | list_del(&page->lru); | ||
| 1241 | zone->free_area[order].nr_free--; | ||
| 1242 | rmv_page_order(page); | ||
| 1243 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); | ||
| 1244 | |||
| 1245 | /* Split into individual pages */ | ||
| 1246 | set_page_refcounted(page); | ||
| 1247 | split_page(page, order); | ||
| 1248 | |||
| 1249 | if (order >= pageblock_order - 1) { | ||
| 1250 | struct page *endpage = page + (1 << order) - 1; | ||
| 1251 | for (; page < endpage; page += pageblock_nr_pages) | ||
| 1252 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
| 1253 | } | ||
| 1254 | |||
| 1255 | return 1 << order; | ||
| 1256 | } | ||
| 1257 | |||
| 1258 | /* | ||
| 1191 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But | 1259 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But |
| 1192 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 1260 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
| 1193 | * or two. | 1261 | * or two. |
| @@ -1693,6 +1761,62 @@ out: | |||
| 1693 | return page; | 1761 | return page; |
| 1694 | } | 1762 | } |
| 1695 | 1763 | ||
| 1764 | #ifdef CONFIG_COMPACTION | ||
| 1765 | /* Try memory compaction for high-order allocations before reclaim */ | ||
| 1766 | static struct page * | ||
| 1767 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | ||
| 1768 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
| 1769 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | ||
| 1770 | int migratetype, unsigned long *did_some_progress) | ||
| 1771 | { | ||
| 1772 | struct page *page; | ||
| 1773 | |||
| 1774 | if (!order || compaction_deferred(preferred_zone)) | ||
| 1775 | return NULL; | ||
| 1776 | |||
| 1777 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | ||
| 1778 | nodemask); | ||
| 1779 | if (*did_some_progress != COMPACT_SKIPPED) { | ||
| 1780 | |||
| 1781 | /* Page migration frees to the PCP lists but we want merging */ | ||
| 1782 | drain_pages(get_cpu()); | ||
| 1783 | put_cpu(); | ||
| 1784 | |||
| 1785 | page = get_page_from_freelist(gfp_mask, nodemask, | ||
| 1786 | order, zonelist, high_zoneidx, | ||
| 1787 | alloc_flags, preferred_zone, | ||
| 1788 | migratetype); | ||
| 1789 | if (page) { | ||
| 1790 | preferred_zone->compact_considered = 0; | ||
| 1791 | preferred_zone->compact_defer_shift = 0; | ||
| 1792 | count_vm_event(COMPACTSUCCESS); | ||
| 1793 | return page; | ||
| 1794 | } | ||
| 1795 | |||
| 1796 | /* | ||
| 1797 | * It's bad if a compaction run occurs and fails. | ||
| 1798 | * The most likely reason is that pages exist, | ||
| 1799 | * but not enough to satisfy watermarks. | ||
| 1800 | */ | ||
| 1801 | count_vm_event(COMPACTFAIL); | ||
| 1802 | defer_compaction(preferred_zone); | ||
| 1803 | |||
| 1804 | cond_resched(); | ||
| 1805 | } | ||
| 1806 | |||
| 1807 | return NULL; | ||
| 1808 | } | ||
| 1809 | #else | ||
| 1810 | static inline struct page * | ||
| 1811 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | ||
| 1812 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
| 1813 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | ||
| 1814 | int migratetype, unsigned long *did_some_progress) | ||
| 1815 | { | ||
| 1816 | return NULL; | ||
| 1817 | } | ||
| 1818 | #endif /* CONFIG_COMPACTION */ | ||
| 1819 | |||
| 1696 | /* The really slow allocator path where we enter direct reclaim */ | 1820 | /* The really slow allocator path where we enter direct reclaim */ |
| 1697 | static inline struct page * | 1821 | static inline struct page * |
| 1698 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 1822 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, |
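compaction_deferred() and defer_compaction(), referenced in the hunk above but defined elsewhere in the series, implement an exponential backoff: each failed compaction run doubles the number of subsequent attempts that skip compaction, bounded by a maximum shift. The user-space sketch below approximates that bookkeeping; the constant and the exact comparisons are assumptions rather than a copy of the kernel code.

#include <stdio.h>

#define MAX_DEFER_SHIFT 6	/* assumed cap, cf. COMPACT_MAX_DEFER_SHIFT */

struct zone_defer {
	unsigned int compact_considered;
	unsigned int compact_defer_shift;
};

/* called when a compaction run fails: back off further next time */
static void defer_compaction(struct zone_defer *z)
{
	z->compact_considered = 0;
	if (++z->compact_defer_shift > MAX_DEFER_SHIFT)
		z->compact_defer_shift = MAX_DEFER_SHIFT;
}

/* should this allocation attempt skip compaction? */
static int compaction_deferred(struct zone_defer *z)
{
	unsigned int defer_limit = 1U << z->compact_defer_shift;

	if (++z->compact_considered > defer_limit)
		z->compact_considered = defer_limit;
	return z->compact_considered < defer_limit;
}

int main(void)
{
	struct zone_defer z = { 0, 0 };
	int attempt;

	defer_compaction(&z);	/* first failure: skip the next attempt */
	defer_compaction(&z);	/* second failure: skip the next three */

	for (attempt = 1; attempt <= 6; attempt++)
		printf("attempt %d: %s\n", attempt,
		       compaction_deferred(&z) ? "deferred" : "try compaction");
	return 0;
}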
| @@ -1879,6 +2003,15 @@ rebalance: | |||
| 1879 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) | 2003 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) |
| 1880 | goto nopage; | 2004 | goto nopage; |
| 1881 | 2005 | ||
| 2006 | /* Try direct compaction */ | ||
| 2007 | page = __alloc_pages_direct_compact(gfp_mask, order, | ||
| 2008 | zonelist, high_zoneidx, | ||
| 2009 | nodemask, | ||
| 2010 | alloc_flags, preferred_zone, | ||
| 2011 | migratetype, &did_some_progress); | ||
| 2012 | if (page) | ||
| 2013 | goto got_pg; | ||
| 2014 | |||
| 1882 | /* Try direct reclaim and then allocating */ | 2015 | /* Try direct reclaim and then allocating */ |
| 1883 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2016 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
| 1884 | zonelist, high_zoneidx, | 2017 | zonelist, high_zoneidx, |
| @@ -1970,10 +2103,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
| 1970 | if (unlikely(!zonelist->_zonerefs->zone)) | 2103 | if (unlikely(!zonelist->_zonerefs->zone)) |
| 1971 | return NULL; | 2104 | return NULL; |
| 1972 | 2105 | ||
| 2106 | get_mems_allowed(); | ||
| 1973 | /* The preferred zone is used for statistics later */ | 2107 | /* The preferred zone is used for statistics later */ |
| 1974 | first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); | 2108 | first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); |
| 1975 | if (!preferred_zone) | 2109 | if (!preferred_zone) { |
| 2110 | put_mems_allowed(); | ||
| 1976 | return NULL; | 2111 | return NULL; |
| 2112 | } | ||
| 1977 | 2113 | ||
| 1978 | /* First allocation attempt */ | 2114 | /* First allocation attempt */ |
| 1979 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2115 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
| @@ -1983,6 +2119,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
| 1983 | page = __alloc_pages_slowpath(gfp_mask, order, | 2119 | page = __alloc_pages_slowpath(gfp_mask, order, |
| 1984 | zonelist, high_zoneidx, nodemask, | 2120 | zonelist, high_zoneidx, nodemask, |
| 1985 | preferred_zone, migratetype); | 2121 | preferred_zone, migratetype); |
| 2122 | put_mems_allowed(); | ||
| 1986 | 2123 | ||
| 1987 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2124 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
| 1988 | return page; | 2125 | return page; |
| @@ -2434,8 +2571,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
| 2434 | strncpy((char*)table->data, saved_string, | 2571 | strncpy((char*)table->data, saved_string, |
| 2435 | NUMA_ZONELIST_ORDER_LEN); | 2572 | NUMA_ZONELIST_ORDER_LEN); |
| 2436 | user_zonelist_order = oldval; | 2573 | user_zonelist_order = oldval; |
| 2437 | } else if (oldval != user_zonelist_order) | 2574 | } else if (oldval != user_zonelist_order) { |
| 2438 | build_all_zonelists(); | 2575 | mutex_lock(&zonelists_mutex); |
| 2576 | build_all_zonelists(NULL); | ||
| 2577 | mutex_unlock(&zonelists_mutex); | ||
| 2578 | } | ||
| 2439 | } | 2579 | } |
| 2440 | out: | 2580 | out: |
| 2441 | mutex_unlock(&zl_order_mutex); | 2581 | mutex_unlock(&zl_order_mutex); |
| @@ -2582,7 +2722,7 @@ static int default_zonelist_order(void) | |||
| 2582 | * ZONE_DMA and ZONE_DMA32 can be very small area in the system. | 2722 | * ZONE_DMA and ZONE_DMA32 can be very small area in the system. |
| 2583 | * If they are really small and used heavily, the system can fall | 2723 | * If they are really small and used heavily, the system can fall |
| 2584 | * into OOM very easily. | 2724 | * into OOM very easily. |
| 2585 | * This function detect ZONE_DMA/DMA32 size and confgigures zone order. | 2725 | * This function detect ZONE_DMA/DMA32 size and configures zone order. |
| 2586 | */ | 2726 | */ |
| 2587 | /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ | 2727 | /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ |
| 2588 | low_kmem_size = 0; | 2728 | low_kmem_size = 0; |
| @@ -2594,6 +2734,15 @@ static int default_zonelist_order(void) | |||
| 2594 | if (zone_type < ZONE_NORMAL) | 2734 | if (zone_type < ZONE_NORMAL) |
| 2595 | low_kmem_size += z->present_pages; | 2735 | low_kmem_size += z->present_pages; |
| 2596 | total_size += z->present_pages; | 2736 | total_size += z->present_pages; |
| 2737 | } else if (zone_type == ZONE_NORMAL) { | ||
| 2738 | /* | ||
| 2739 | * If any node has only lowmem, then node order | ||
| 2740 | * is preferred to allow kernel allocations | ||
| 2741 | * locally; otherwise, they can easily infringe | ||
| 2742 | * on other nodes when there is an abundance of | ||
| 2743 | * lowmem available to allocate from. | ||
| 2744 | */ | ||
| 2745 | return ZONELIST_ORDER_NODE; | ||
| 2597 | } | 2746 | } |
| 2598 | } | 2747 | } |
| 2599 | } | 2748 | } |
| @@ -2776,9 +2925,16 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
| 2776 | */ | 2925 | */ |
| 2777 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); | 2926 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); |
| 2778 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); | 2927 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); |
| 2928 | static void setup_zone_pageset(struct zone *zone); | ||
| 2929 | |||
| 2930 | /* | ||
| 2931 | * Global mutex to protect against size modification of zonelists | ||
| 2932 | * as well as to serialize pageset setup for the new populated zone. | ||
| 2933 | */ | ||
| 2934 | DEFINE_MUTEX(zonelists_mutex); | ||
| 2779 | 2935 | ||
| 2780 | /* return values int ....just for stop_machine() */ | 2936 | /* return values int ....just for stop_machine() */ |
| 2781 | static int __build_all_zonelists(void *dummy) | 2937 | static __init_refok int __build_all_zonelists(void *data) |
| 2782 | { | 2938 | { |
| 2783 | int nid; | 2939 | int nid; |
| 2784 | int cpu; | 2940 | int cpu; |
| @@ -2793,6 +2949,14 @@ static int __build_all_zonelists(void *dummy) | |||
| 2793 | build_zonelist_cache(pgdat); | 2949 | build_zonelist_cache(pgdat); |
| 2794 | } | 2950 | } |
| 2795 | 2951 | ||
| 2952 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
| 2953 | /* Setup real pagesets for the new zone */ | ||
| 2954 | if (data) { | ||
| 2955 | struct zone *zone = data; | ||
| 2956 | setup_zone_pageset(zone); | ||
| 2957 | } | ||
| 2958 | #endif | ||
| 2959 | |||
| 2796 | /* | 2960 | /* |
| 2797 | * Initialize the boot_pagesets that are going to be used | 2961 | * Initialize the boot_pagesets that are going to be used |
| 2798 | * for bootstrapping processors. The real pagesets for | 2962 | * for bootstrapping processors. The real pagesets for |
| @@ -2812,7 +2976,11 @@ static int __build_all_zonelists(void *dummy) | |||
| 2812 | return 0; | 2976 | return 0; |
| 2813 | } | 2977 | } |
| 2814 | 2978 | ||
| 2815 | void build_all_zonelists(void) | 2979 | /* |
| 2980 | * Called with zonelists_mutex always held, | ||
| 2981 | * unless system_state == SYSTEM_BOOTING. | ||
| 2982 | */ | ||
| 2983 | void build_all_zonelists(void *data) | ||
| 2816 | { | 2984 | { |
| 2817 | set_zonelist_order(); | 2985 | set_zonelist_order(); |
| 2818 | 2986 | ||
| @@ -2823,7 +2991,7 @@ void build_all_zonelists(void) | |||
| 2823 | } else { | 2991 | } else { |
| 2824 | /* we have to stop all cpus to guarantee there is no user | 2992 | /* we have to stop all cpus to guarantee there is no user |
| 2825 | of zonelist */ | 2993 | of zonelist */ |
| 2826 | stop_machine(__build_all_zonelists, NULL, NULL); | 2994 | stop_machine(__build_all_zonelists, data, NULL); |
| 2827 | /* cpuset refresh routine should be here */ | 2995 | /* cpuset refresh routine should be here */ |
| 2828 | } | 2996 | } |
| 2829 | vm_total_pages = nr_free_pagecache_pages(); | 2997 | vm_total_pages = nr_free_pagecache_pages(); |
| @@ -3146,31 +3314,34 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
| 3146 | pcp->batch = PAGE_SHIFT * 8; | 3314 | pcp->batch = PAGE_SHIFT * 8; |
| 3147 | } | 3315 | } |
| 3148 | 3316 | ||
| 3317 | static __meminit void setup_zone_pageset(struct zone *zone) | ||
| 3318 | { | ||
| 3319 | int cpu; | ||
| 3320 | |||
| 3321 | zone->pageset = alloc_percpu(struct per_cpu_pageset); | ||
| 3322 | |||
| 3323 | for_each_possible_cpu(cpu) { | ||
| 3324 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); | ||
| 3325 | |||
| 3326 | setup_pageset(pcp, zone_batchsize(zone)); | ||
| 3327 | |||
| 3328 | if (percpu_pagelist_fraction) | ||
| 3329 | setup_pagelist_highmark(pcp, | ||
| 3330 | (zone->present_pages / | ||
| 3331 | percpu_pagelist_fraction)); | ||
| 3332 | } | ||
| 3333 | } | ||
| 3334 | |||
| 3149 | /* | 3335 | /* |
| 3150 | * Allocate per cpu pagesets and initialize them. | 3336 | * Allocate per cpu pagesets and initialize them. |
| 3151 | * Before this call only boot pagesets were available. | 3337 | * Before this call only boot pagesets were available. |
| 3152 | * Boot pagesets will no longer be used by this processorr | ||
| 3153 | * after setup_per_cpu_pageset(). | ||
| 3154 | */ | 3338 | */ |
| 3155 | void __init setup_per_cpu_pageset(void) | 3339 | void __init setup_per_cpu_pageset(void) |
| 3156 | { | 3340 | { |
| 3157 | struct zone *zone; | 3341 | struct zone *zone; |
| 3158 | int cpu; | ||
| 3159 | 3342 | ||
| 3160 | for_each_populated_zone(zone) { | 3343 | for_each_populated_zone(zone) |
| 3161 | zone->pageset = alloc_percpu(struct per_cpu_pageset); | 3344 | setup_zone_pageset(zone); |
| 3162 | |||
| 3163 | for_each_possible_cpu(cpu) { | ||
| 3164 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); | ||
| 3165 | |||
| 3166 | setup_pageset(pcp, zone_batchsize(zone)); | ||
| 3167 | |||
| 3168 | if (percpu_pagelist_fraction) | ||
| 3169 | setup_pagelist_highmark(pcp, | ||
| 3170 | (zone->present_pages / | ||
| 3171 | percpu_pagelist_fraction)); | ||
| 3172 | } | ||
| 3173 | } | ||
| 3174 | } | 3345 | } |
| 3175 | 3346 | ||
| 3176 | static noinline __init_refok | 3347 | static noinline __init_refok |
diff --git a/mm/readahead.c b/mm/readahead.c index dfa9a1a03a11..77506a291a2d 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -523,7 +523,7 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead); | |||
| 523 | * @req_size: hint: total size of the read which the caller is performing in | 523 | * @req_size: hint: total size of the read which the caller is performing in |
| 524 | * pagecache pages | 524 | * pagecache pages |
| 525 | * | 525 | * |
| 526 | * page_cache_async_ondemand() should be called when a page is used which | 526 | * page_cache_async_readahead() should be called when a page is used which |
| 527 | * has the PG_readahead flag; this is a marker to suggest that the application | 527 | * has the PG_readahead flag; this is a marker to suggest that the application |
| 528 | * has used up enough of the readahead window that we should start pulling in | 528 | * has used up enough of the readahead window that we should start pulling in |
| 529 | * more pages. | 529 | * more pages. |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -250,7 +250,7 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) | |||
| 250 | list_del(&anon_vma_chain->same_anon_vma); | 250 | list_del(&anon_vma_chain->same_anon_vma); |
| 251 | 251 | ||
| 252 | /* We must garbage collect the anon_vma if it's empty */ | 252 | /* We must garbage collect the anon_vma if it's empty */ |
| 253 | empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); | 253 | empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); |
| 254 | spin_unlock(&anon_vma->lock); | 254 | spin_unlock(&anon_vma->lock); |
| 255 | 255 | ||
| 256 | if (empty) | 256 | if (empty) |
| @@ -274,7 +274,7 @@ static void anon_vma_ctor(void *data) | |||
| 274 | struct anon_vma *anon_vma = data; | 274 | struct anon_vma *anon_vma = data; |
| 275 | 275 | ||
| 276 | spin_lock_init(&anon_vma->lock); | 276 | spin_lock_init(&anon_vma->lock); |
| 277 | ksm_refcount_init(anon_vma); | 277 | anonvma_external_refcount_init(anon_vma); |
| 278 | INIT_LIST_HEAD(&anon_vma->head); | 278 | INIT_LIST_HEAD(&anon_vma->head); |
| 279 | } | 279 | } |
| 280 | 280 | ||
| @@ -1131,6 +1131,20 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
| 1131 | return ret; | 1131 | return ret; |
| 1132 | } | 1132 | } |
| 1133 | 1133 | ||
| 1134 | static bool is_vma_temporary_stack(struct vm_area_struct *vma) | ||
| 1135 | { | ||
| 1136 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | ||
| 1137 | |||
| 1138 | if (!maybe_stack) | ||
| 1139 | return false; | ||
| 1140 | |||
| 1141 | if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == | ||
| 1142 | VM_STACK_INCOMPLETE_SETUP) | ||
| 1143 | return true; | ||
| 1144 | |||
| 1145 | return false; | ||
| 1146 | } | ||
| 1147 | |||
| 1134 | /** | 1148 | /** |
| 1135 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based | 1149 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based |
| 1136 | * rmap method | 1150 | * rmap method |
| @@ -1159,7 +1173,21 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
| 1159 | 1173 | ||
| 1160 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1174 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
| 1161 | struct vm_area_struct *vma = avc->vma; | 1175 | struct vm_area_struct *vma = avc->vma; |
| 1162 | unsigned long address = vma_address(page, vma); | 1176 | unsigned long address; |
| 1177 | |||
| 1178 | /* | ||
| 1179 | * During exec, a temporary VMA is setup and later moved. | ||
| 1180 | * The VMA is moved under the anon_vma lock but not the | ||
| 1181 | * page tables leading to a race where migration cannot | ||
| 1182 | * find the migration ptes. Rather than increasing the | ||
| 1183 | * locking requirements of exec(), migration skips | ||
| 1184 | * temporary VMAs until after exec() completes. | ||
| 1185 | */ | ||
| 1186 | if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && | ||
| 1187 | is_vma_temporary_stack(vma)) | ||
| 1188 | continue; | ||
| 1189 | |||
| 1190 | address = vma_address(page, vma); | ||
| 1163 | if (address == -EFAULT) | 1191 | if (address == -EFAULT) |
| 1164 | continue; | 1192 | continue; |
| 1165 | ret = try_to_unmap_one(page, vma, address, flags); | 1193 | ret = try_to_unmap_one(page, vma, address, flags); |
| @@ -1355,10 +1383,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
| 1355 | /* | 1383 | /* |
| 1356 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma() | 1384 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma() |
| 1357 | * because that depends on page_mapped(); but not all its usages | 1385 | * because that depends on page_mapped(); but not all its usages |
| 1358 | * are holding mmap_sem, which also gave the necessary guarantee | 1386 | * are holding mmap_sem. Users without mmap_sem are required to |
| 1359 | * (that this anon_vma's slab has not already been destroyed). | 1387 | * take a reference count to prevent the anon_vma from disappearing |
| 1360 | * This needs to be reviewed later: avoiding page_lock_anon_vma() | ||
| 1361 | * is risky, and currently limits the usefulness of rmap_walk(). | ||
| 1362 | */ | 1388 | */ |
| 1363 | anon_vma = page_anon_vma(page); | 1389 | anon_vma = page_anon_vma(page); |
| 1364 | if (!anon_vma) | 1390 | if (!anon_vma) |
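The temporary-stack test above keys off two things: the VMA must look like a stack (VM_GROWSDOWN or VM_GROWSUP) and must still carry the VM_STACK_INCOMPLETE_SETUP marker that exec() sets while it builds and later moves the stack. A standalone sketch of the same flag test follows; the bit values are placeholders, and the real VM_* definitions live in include/linux/mm.h.

#include <stdbool.h>
#include <stdio.h>

/* placeholder bit values; the real flags are defined in include/linux/mm.h */
#define VM_GROWSDOWN			0x0100UL
#define VM_GROWSUP			0x0200UL
#define VM_STACK_INCOMPLETE_SETUP	0x0c00UL

struct vma { unsigned long vm_flags; };

/* same shape as the is_vma_temporary_stack() helper added above */
static bool is_vma_temporary_stack(const struct vma *vma)
{
	unsigned long maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);

	if (!maybe_stack)
		return false;

	return (vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
	       VM_STACK_INCOMPLETE_SETUP;
}

int main(void)
{
	struct vma settled = { .vm_flags = VM_GROWSDOWN };
	struct vma in_exec = { .vm_flags = VM_GROWSDOWN | VM_STACK_INCOMPLETE_SETUP };

	printf("settled stack temporary? %d\n", is_vma_temporary_stack(&settled));
	printf("stack mid-exec temporary? %d\n", is_vma_temporary_stack(&in_exec));
	return 0;
}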
diff --git a/mm/shmem.c b/mm/shmem.c index 0cd7f66f1c66..4ef9797bd430 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -433,8 +433,6 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long | |||
| 433 | 433 | ||
| 434 | spin_unlock(&info->lock); | 434 | spin_unlock(&info->lock); |
| 435 | page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); | 435 | page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); |
| 436 | if (page) | ||
| 437 | set_page_private(page, 0); | ||
| 438 | spin_lock(&info->lock); | 436 | spin_lock(&info->lock); |
| 439 | 437 | ||
| 440 | if (!page) { | 438 | if (!page) { |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
| @@ -3217,10 +3217,12 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
| 3217 | if (in_interrupt() || (flags & __GFP_THISNODE)) | 3217 | if (in_interrupt() || (flags & __GFP_THISNODE)) |
| 3218 | return NULL; | 3218 | return NULL; |
| 3219 | nid_alloc = nid_here = numa_node_id(); | 3219 | nid_alloc = nid_here = numa_node_id(); |
| 3220 | get_mems_allowed(); | ||
| 3220 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3221 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) |
| 3221 | nid_alloc = cpuset_mem_spread_node(); | 3222 | nid_alloc = cpuset_mem_spread_node(); |
| 3222 | else if (current->mempolicy) | 3223 | else if (current->mempolicy) |
| 3223 | nid_alloc = slab_node(current->mempolicy); | 3224 | nid_alloc = slab_node(current->mempolicy); |
| 3225 | put_mems_allowed(); | ||
| 3224 | if (nid_alloc != nid_here) | 3226 | if (nid_alloc != nid_here) |
| 3225 | return ____cache_alloc_node(cachep, flags, nid_alloc); | 3227 | return ____cache_alloc_node(cachep, flags, nid_alloc); |
| 3226 | return NULL; | 3228 | return NULL; |
| @@ -3247,6 +3249,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
| 3247 | if (flags & __GFP_THISNODE) | 3249 | if (flags & __GFP_THISNODE) |
| 3248 | return NULL; | 3250 | return NULL; |
| 3249 | 3251 | ||
| 3252 | get_mems_allowed(); | ||
| 3250 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | 3253 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); |
| 3251 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | 3254 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
| 3252 | 3255 | ||
| @@ -3302,6 +3305,7 @@ retry: | |||
| 3302 | } | 3305 | } |
| 3303 | } | 3306 | } |
| 3304 | } | 3307 | } |
| 3308 | put_mems_allowed(); | ||
| 3305 | return obj; | 3309 | return obj; |
| 3306 | } | 3310 | } |
| 3307 | 3311 | ||
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
| @@ -1360,6 +1360,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
| 1360 | get_cycles() % 1024 > s->remote_node_defrag_ratio) | 1360 | get_cycles() % 1024 > s->remote_node_defrag_ratio) |
| 1361 | return NULL; | 1361 | return NULL; |
| 1362 | 1362 | ||
| 1363 | get_mems_allowed(); | ||
| 1363 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | 1364 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); |
| 1364 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1365 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
| 1365 | struct kmem_cache_node *n; | 1366 | struct kmem_cache_node *n; |
| @@ -1369,10 +1370,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
| 1369 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && | 1370 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
| 1370 | n->nr_partial > s->min_partial) { | 1371 | n->nr_partial > s->min_partial) { |
| 1371 | page = get_partial_node(n); | 1372 | page = get_partial_node(n); |
| 1372 | if (page) | 1373 | if (page) { |
| 1374 | put_mems_allowed(); | ||
| 1373 | return page; | 1375 | return page; |
| 1376 | } | ||
| 1374 | } | 1377 | } |
| 1375 | } | 1378 | } |
| 1379 | put_mems_allowed(); | ||
| 1376 | #endif | 1380 | #endif |
| 1377 | return NULL; | 1381 | return NULL; |
| 1378 | } | 1382 | } |
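The slab.c and slub.c hunks above all apply one rule: any node choice derived from current->mempolicy or cpuset state is bracketed by get_mems_allowed()/put_mems_allowed() so the allowed-nodes mask cannot change underneath the lookup, and every exit path inside the bracket (hence the new braces in get_any_partial()) pairs with put_mems_allowed(). The following is a minimal sketch of that bracketing discipline only; the kernel primitives are replaced by hypothetical stubs so it compiles and runs standalone.

/* Sketch only: get_mems_allowed()/put_mems_allowed() and slab_node() are
 * stand-in stubs, not the kernel implementations. */
#include <stdio.h>

static void get_mems_allowed(void) { /* stub: begin a stable view of mems_allowed */ }
static void put_mems_allowed(void) { /* stub: end the stable view */ }
static int  slab_node_stub(void)   { return 0; /* stub for slab_node(mempolicy) */ }

static int pick_node(void)
{
	int nid;

	get_mems_allowed();
	nid = slab_node_stub();	/* every policy read in here sees one nodemask */
	put_mems_allowed();	/* must pair on every exit path */

	return nid;
}

int main(void)
{
	printf("allocate from node %d\n", pick_node());
	return 0;
}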
diff --git a/mm/sparse.c b/mm/sparse.c index dc0cc4d43ff3..95ac219af379 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
| @@ -382,13 +382,15 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, | |||
| 382 | struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | 382 | struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) |
| 383 | { | 383 | { |
| 384 | struct page *map; | 384 | struct page *map; |
| 385 | unsigned long size; | ||
| 385 | 386 | ||
| 386 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); | 387 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); |
| 387 | if (map) | 388 | if (map) |
| 388 | return map; | 389 | return map; |
| 389 | 390 | ||
| 390 | map = alloc_bootmem_pages_node(NODE_DATA(nid), | 391 | size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); |
| 391 | PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); | 392 | map = __alloc_bootmem_node_high(NODE_DATA(nid), size, |
| 393 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | ||
| 392 | return map; | 394 | return map; |
| 393 | } | 395 | } |
| 394 | void __init sparse_mem_maps_populate_node(struct page **map_map, | 396 | void __init sparse_mem_maps_populate_node(struct page **map_map, |
| @@ -412,7 +414,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, | |||
| 412 | } | 414 | } |
| 413 | 415 | ||
| 414 | size = PAGE_ALIGN(size); | 416 | size = PAGE_ALIGN(size); |
| 415 | map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count); | 417 | map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, |
| 418 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | ||
| 416 | if (map) { | 419 | if (map) { |
| 417 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 420 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
| 418 | if (!present_section_nr(pnum)) | 421 | if (!present_section_nr(pnum)) |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 3ff3311447f5..915dceb487c1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -73,10 +73,14 @@ struct scan_control { | |||
| 73 | 73 | ||
| 74 | int swappiness; | 74 | int swappiness; |
| 75 | 75 | ||
| 76 | int all_unreclaimable; | ||
| 77 | |||
| 78 | int order; | 76 | int order; |
| 79 | 77 | ||
| 78 | /* | ||
| 79 | * Intend to reclaim enough contiguous memory rather than just an | ||
| 80 | * amount of memory. I.e., this is the mode for high-order allocations. | ||
| 81 | */ | ||
| 82 | bool lumpy_reclaim_mode; | ||
| 83 | |||
| 80 | /* Which cgroup do we reclaim from */ | 84 | /* Which cgroup do we reclaim from */ |
| 81 | struct mem_cgroup *mem_cgroup; | 85 | struct mem_cgroup *mem_cgroup; |
| 82 | 86 | ||
| @@ -85,12 +89,6 @@ struct scan_control { | |||
| 85 | * are scanned. | 89 | * are scanned. |
| 86 | */ | 90 | */ |
| 87 | nodemask_t *nodemask; | 91 | nodemask_t *nodemask; |
| 88 | |||
| 89 | /* Pluggable isolate pages callback */ | ||
| 90 | unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, | ||
| 91 | unsigned long *scanned, int order, int mode, | ||
| 92 | struct zone *z, struct mem_cgroup *mem_cont, | ||
| 93 | int active, int file); | ||
| 94 | }; | 92 | }; |
| 95 | 93 | ||
| 96 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 94 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
| @@ -575,7 +573,7 @@ static enum page_references page_check_references(struct page *page, | |||
| 575 | referenced_page = TestClearPageReferenced(page); | 573 | referenced_page = TestClearPageReferenced(page); |
| 576 | 574 | ||
| 577 | /* Lumpy reclaim - ignore references */ | 575 | /* Lumpy reclaim - ignore references */ |
| 578 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | 576 | if (sc->lumpy_reclaim_mode) |
| 579 | return PAGEREF_RECLAIM; | 577 | return PAGEREF_RECLAIM; |
| 580 | 578 | ||
| 581 | /* | 579 | /* |
| @@ -839,11 +837,6 @@ keep: | |||
| 839 | return nr_reclaimed; | 837 | return nr_reclaimed; |
| 840 | } | 838 | } |
| 841 | 839 | ||
| 842 | /* LRU Isolation modes. */ | ||
| 843 | #define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */ | ||
| 844 | #define ISOLATE_ACTIVE 1 /* Isolate active pages. */ | ||
| 845 | #define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */ | ||
| 846 | |||
| 847 | /* | 840 | /* |
| 848 | * Attempt to remove the specified page from its LRU. Only take this page | 841 | * Attempt to remove the specified page from its LRU. Only take this page |
| 849 | * if it is of the appropriate PageActive status. Pages which are being | 842 | * if it is of the appropriate PageActive status. Pages which are being |
| @@ -1011,7 +1004,6 @@ static unsigned long isolate_pages_global(unsigned long nr, | |||
| 1011 | struct list_head *dst, | 1004 | struct list_head *dst, |
| 1012 | unsigned long *scanned, int order, | 1005 | unsigned long *scanned, int order, |
| 1013 | int mode, struct zone *z, | 1006 | int mode, struct zone *z, |
| 1014 | struct mem_cgroup *mem_cont, | ||
| 1015 | int active, int file) | 1007 | int active, int file) |
| 1016 | { | 1008 | { |
| 1017 | int lru = LRU_BASE; | 1009 | int lru = LRU_BASE; |
| @@ -1130,7 +1122,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1130 | unsigned long nr_scanned = 0; | 1122 | unsigned long nr_scanned = 0; |
| 1131 | unsigned long nr_reclaimed = 0; | 1123 | unsigned long nr_reclaimed = 0; |
| 1132 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1124 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
| 1133 | int lumpy_reclaim = 0; | ||
| 1134 | 1125 | ||
| 1135 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1126 | while (unlikely(too_many_isolated(zone, file, sc))) { |
| 1136 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1127 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
| @@ -1140,17 +1131,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1140 | return SWAP_CLUSTER_MAX; | 1131 | return SWAP_CLUSTER_MAX; |
| 1141 | } | 1132 | } |
| 1142 | 1133 | ||
| 1143 | /* | ||
| 1144 | * If we need a large contiguous chunk of memory, or have | ||
| 1145 | * trouble getting a small set of contiguous pages, we | ||
| 1146 | * will reclaim both active and inactive pages. | ||
| 1147 | * | ||
| 1148 | * We use the same threshold as pageout congestion_wait below. | ||
| 1149 | */ | ||
| 1150 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
| 1151 | lumpy_reclaim = 1; | ||
| 1152 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
| 1153 | lumpy_reclaim = 1; | ||
| 1154 | 1134 | ||
| 1155 | pagevec_init(&pvec, 1); | 1135 | pagevec_init(&pvec, 1); |
| 1156 | 1136 | ||
| @@ -1163,15 +1143,15 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1163 | unsigned long nr_freed; | 1143 | unsigned long nr_freed; |
| 1164 | unsigned long nr_active; | 1144 | unsigned long nr_active; |
| 1165 | unsigned int count[NR_LRU_LISTS] = { 0, }; | 1145 | unsigned int count[NR_LRU_LISTS] = { 0, }; |
| 1166 | int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; | 1146 | int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE; |
| 1167 | unsigned long nr_anon; | 1147 | unsigned long nr_anon; |
| 1168 | unsigned long nr_file; | 1148 | unsigned long nr_file; |
| 1169 | 1149 | ||
| 1170 | nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX, | ||
| 1171 | &page_list, &nr_scan, sc->order, mode, | ||
| 1172 | zone, sc->mem_cgroup, 0, file); | ||
| 1173 | |||
| 1174 | if (scanning_global_lru(sc)) { | 1150 | if (scanning_global_lru(sc)) { |
| 1151 | nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX, | ||
| 1152 | &page_list, &nr_scan, | ||
| 1153 | sc->order, mode, | ||
| 1154 | zone, 0, file); | ||
| 1175 | zone->pages_scanned += nr_scan; | 1155 | zone->pages_scanned += nr_scan; |
| 1176 | if (current_is_kswapd()) | 1156 | if (current_is_kswapd()) |
| 1177 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, | 1157 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, |
| @@ -1179,6 +1159,16 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1179 | else | 1159 | else |
| 1180 | __count_zone_vm_events(PGSCAN_DIRECT, zone, | 1160 | __count_zone_vm_events(PGSCAN_DIRECT, zone, |
| 1181 | nr_scan); | 1161 | nr_scan); |
| 1162 | } else { | ||
| 1163 | nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX, | ||
| 1164 | &page_list, &nr_scan, | ||
| 1165 | sc->order, mode, | ||
| 1166 | zone, sc->mem_cgroup, | ||
| 1167 | 0, file); | ||
| 1168 | /* | ||
| 1169 | * mem_cgroup_isolate_pages() keeps track of | ||
| 1170 | * scanned pages on its own. | ||
| 1171 | */ | ||
| 1182 | } | 1172 | } |
| 1183 | 1173 | ||
| 1184 | if (nr_taken == 0) | 1174 | if (nr_taken == 0) |
| @@ -1216,7 +1206,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1216 | * but that should be acceptable to the caller | 1206 | * but that should be acceptable to the caller |
| 1217 | */ | 1207 | */ |
| 1218 | if (nr_freed < nr_taken && !current_is_kswapd() && | 1208 | if (nr_freed < nr_taken && !current_is_kswapd() && |
| 1219 | lumpy_reclaim) { | 1209 | sc->lumpy_reclaim_mode) { |
| 1220 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1210 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
| 1221 | 1211 | ||
| 1222 | /* | 1212 | /* |
| @@ -1356,16 +1346,23 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 1356 | 1346 | ||
| 1357 | lru_add_drain(); | 1347 | lru_add_drain(); |
| 1358 | spin_lock_irq(&zone->lru_lock); | 1348 | spin_lock_irq(&zone->lru_lock); |
| 1359 | nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, | ||
| 1360 | ISOLATE_ACTIVE, zone, | ||
| 1361 | sc->mem_cgroup, 1, file); | ||
| 1362 | /* | ||
| 1363 | * zone->pages_scanned is used for detect zone's oom | ||
| 1364 | * mem_cgroup remembers nr_scan by itself. | ||
| 1365 | */ | ||
| 1366 | if (scanning_global_lru(sc)) { | 1349 | if (scanning_global_lru(sc)) { |
| 1350 | nr_taken = isolate_pages_global(nr_pages, &l_hold, | ||
| 1351 | &pgscanned, sc->order, | ||
| 1352 | ISOLATE_ACTIVE, zone, | ||
| 1353 | 1, file); | ||
| 1367 | zone->pages_scanned += pgscanned; | 1354 | zone->pages_scanned += pgscanned; |
| 1355 | } else { | ||
| 1356 | nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, | ||
| 1357 | &pgscanned, sc->order, | ||
| 1358 | ISOLATE_ACTIVE, zone, | ||
| 1359 | sc->mem_cgroup, 1, file); | ||
| 1360 | /* | ||
| 1361 | * mem_cgroup_isolate_pages() keeps track of | ||
| 1362 | * scanned pages on its own. | ||
| 1363 | */ | ||
| 1368 | } | 1364 | } |
| 1365 | |||
| 1369 | reclaim_stat->recent_scanned[file] += nr_taken; | 1366 | reclaim_stat->recent_scanned[file] += nr_taken; |
| 1370 | 1367 | ||
| 1371 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | 1368 | __count_zone_vm_events(PGREFILL, zone, pgscanned); |
| @@ -1519,21 +1516,52 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | |||
| 1519 | } | 1516 | } |
| 1520 | 1517 | ||
| 1521 | /* | 1518 | /* |
| 1519 | * Smallish @nr_to_scan's are deposited in @nr_saved_scan, | ||
| 1520 | * until we collected @swap_cluster_max pages to scan. | ||
| 1521 | */ | ||
| 1522 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, | ||
| 1523 | unsigned long *nr_saved_scan) | ||
| 1524 | { | ||
| 1525 | unsigned long nr; | ||
| 1526 | |||
| 1527 | *nr_saved_scan += nr_to_scan; | ||
| 1528 | nr = *nr_saved_scan; | ||
| 1529 | |||
| 1530 | if (nr >= SWAP_CLUSTER_MAX) | ||
| 1531 | *nr_saved_scan = 0; | ||
| 1532 | else | ||
| 1533 | nr = 0; | ||
| 1534 | |||
| 1535 | return nr; | ||
| 1536 | } | ||
| 1537 | |||
| 1538 | /* | ||
| 1522 | * Determine how aggressively the anon and file LRU lists should be | 1539 | * Determine how aggressively the anon and file LRU lists should be |
| 1523 | * scanned. The relative value of each set of LRU lists is determined | 1540 | * scanned. The relative value of each set of LRU lists is determined |
| 1524 | * by looking at the fraction of the pages scanned we did rotate back | 1541 | * by looking at the fraction of the pages scanned we did rotate back |
| 1525 | * onto the active list instead of evict. | 1542 | * onto the active list instead of evict. |
| 1526 | * | 1543 | * |
| 1527 | * percent[0] specifies how much pressure to put on ram/swap backed | 1544 | * nr[0] = anon pages to scan; nr[1] = file pages to scan |
| 1528 | * memory, while percent[1] determines pressure on the file LRUs. | ||
| 1529 | */ | 1545 | */ |
| 1530 | static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | 1546 | static void get_scan_count(struct zone *zone, struct scan_control *sc, |
| 1531 | unsigned long *percent) | 1547 | unsigned long *nr, int priority) |
| 1532 | { | 1548 | { |
| 1533 | unsigned long anon, file, free; | 1549 | unsigned long anon, file, free; |
| 1534 | unsigned long anon_prio, file_prio; | 1550 | unsigned long anon_prio, file_prio; |
| 1535 | unsigned long ap, fp; | 1551 | unsigned long ap, fp; |
| 1536 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1552 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
| 1553 | u64 fraction[2], denominator; | ||
| 1554 | enum lru_list l; | ||
| 1555 | int noswap = 0; | ||
| 1556 | |||
| 1557 | /* If we have no swap space, do not bother scanning anon pages. */ | ||
| 1558 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
| 1559 | noswap = 1; | ||
| 1560 | fraction[0] = 0; | ||
| 1561 | fraction[1] = 1; | ||
| 1562 | denominator = 1; | ||
| 1563 | goto out; | ||
| 1564 | } | ||
| 1537 | 1565 | ||
| 1538 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | 1566 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + |
| 1539 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | 1567 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); |
| @@ -1545,9 +1573,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
| 1545 | /* If we have very few page cache pages, | 1573 | /* If we have very few page cache pages, |
| 1546 | force-scan anon pages. */ | 1574 | force-scan anon pages. */ |
| 1547 | if (unlikely(file + free <= high_wmark_pages(zone))) { | 1575 | if (unlikely(file + free <= high_wmark_pages(zone))) { |
| 1548 | percent[0] = 100; | 1576 | fraction[0] = 1; |
| 1549 | percent[1] = 0; | 1577 | fraction[1] = 0; |
| 1550 | return; | 1578 | denominator = 1; |
| 1579 | goto out; | ||
| 1551 | } | 1580 | } |
| 1552 | } | 1581 | } |
| 1553 | 1582 | ||
| @@ -1594,29 +1623,37 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
| 1594 | fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); | 1623 | fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); |
| 1595 | fp /= reclaim_stat->recent_rotated[1] + 1; | 1624 | fp /= reclaim_stat->recent_rotated[1] + 1; |
| 1596 | 1625 | ||
| 1597 | /* Normalize to percentages */ | 1626 | fraction[0] = ap; |
| 1598 | percent[0] = 100 * ap / (ap + fp + 1); | 1627 | fraction[1] = fp; |
| 1599 | percent[1] = 100 - percent[0]; | 1628 | denominator = ap + fp + 1; |
| 1629 | out: | ||
| 1630 | for_each_evictable_lru(l) { | ||
| 1631 | int file = is_file_lru(l); | ||
| 1632 | unsigned long scan; | ||
| 1633 | |||
| 1634 | scan = zone_nr_lru_pages(zone, sc, l); | ||
| 1635 | if (priority || noswap) { | ||
| 1636 | scan >>= priority; | ||
| 1637 | scan = div64_u64(scan * fraction[file], denominator); | ||
| 1638 | } | ||
| 1639 | nr[l] = nr_scan_try_batch(scan, | ||
| 1640 | &reclaim_stat->nr_saved_scan[l]); | ||
| 1641 | } | ||
| 1600 | } | 1642 | } |
| 1601 | 1643 | ||
| 1602 | /* | 1644 | static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) |
| 1603 | * Smallish @nr_to_scan's are deposited in @nr_saved_scan, | ||
| 1604 | * until we collected @swap_cluster_max pages to scan. | ||
| 1605 | */ | ||
| 1606 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, | ||
| 1607 | unsigned long *nr_saved_scan) | ||
| 1608 | { | 1645 | { |
| 1609 | unsigned long nr; | 1646 | /* |
| 1610 | 1647 | * If we need a large contiguous chunk of memory, or have | |
| 1611 | *nr_saved_scan += nr_to_scan; | 1648 | * trouble getting a small set of contiguous pages, we |
| 1612 | nr = *nr_saved_scan; | 1649 | * will reclaim both active and inactive pages. |
| 1613 | 1650 | */ | |
| 1614 | if (nr >= SWAP_CLUSTER_MAX) | 1651 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) |
| 1615 | *nr_saved_scan = 0; | 1652 | sc->lumpy_reclaim_mode = 1; |
| 1653 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
| 1654 | sc->lumpy_reclaim_mode = 1; | ||
| 1616 | else | 1655 | else |
| 1617 | nr = 0; | 1656 | sc->lumpy_reclaim_mode = 0; |
| 1618 | |||
| 1619 | return nr; | ||
| 1620 | } | 1657 | } |
| 1621 | 1658 | ||
| 1622 | /* | 1659 | /* |
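In get_scan_count() above, the anon/file balance is now carried as a 64-bit fraction[]/denominator pair instead of integer percentages, each evictable LRU scans (size >> priority) * fraction[file] / denominator pages, and small results are banked by nr_scan_try_batch() until they reach SWAP_CLUSTER_MAX. The program below is a standalone check of that arithmetic only; SWAP_CLUSTER_MAX is assumed to be 32 (its usual value) and all reclaim statistics and LRU sizes are invented for illustration.

#include <stdio.h>
#include <stdint.h>

#define SWAP_CLUSTER_MAX 32UL	/* assumed value for illustration */

static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
				       unsigned long *nr_saved_scan)
{
	unsigned long nr;

	*nr_saved_scan += nr_to_scan;
	nr = *nr_saved_scan;

	if (nr >= SWAP_CLUSTER_MAX)
		*nr_saved_scan = 0;	/* release the accumulated batch */
	else
		nr = 0;			/* keep banking small requests */
	return nr;
}

int main(void)
{
	/* Hypothetical reclaim statistics: anon rotates a lot, file very little. */
	uint64_t ap = (60 + 1) * (4000 + 1) / (1000 + 1);	/* anon_prio = swappiness = 60 */
	uint64_t fp = (140 + 1) * (9000 + 1) / (200 + 1);	/* file_prio = 200 - swappiness */
	uint64_t fraction[2] = { ap, fp };
	uint64_t denominator = ap + fp + 1;
	uint64_t lru_size[2] = { 300000, 200000 };		/* anon, file pages (made up) */
	unsigned long saved[2] = { 0, 0 };
	int priority = 12;					/* DEF_PRIORITY */

	for (int file = 0; file <= 1; file++) {
		uint64_t scan = (lru_size[file] >> priority)
				* fraction[file] / denominator;
		unsigned long batched = nr_scan_try_batch((unsigned long)scan,
							  &saved[file]);

		/* With these figures the tiny anon request (2 pages) stays
		 * banked and returns 0, while the file request (46 pages)
		 * clears SWAP_CLUSTER_MAX and is released at once. */
		printf("%s: raw scan %llu -> batched %lu (saved %lu)\n",
		       file ? "file" : "anon",
		       (unsigned long long)scan, batched, saved[file]);
	}
	return 0;
}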
| @@ -1627,33 +1664,13 @@ static void shrink_zone(int priority, struct zone *zone, | |||
| 1627 | { | 1664 | { |
| 1628 | unsigned long nr[NR_LRU_LISTS]; | 1665 | unsigned long nr[NR_LRU_LISTS]; |
| 1629 | unsigned long nr_to_scan; | 1666 | unsigned long nr_to_scan; |
| 1630 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ | ||
| 1631 | enum lru_list l; | 1667 | enum lru_list l; |
| 1632 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1668 | unsigned long nr_reclaimed = sc->nr_reclaimed; |
| 1633 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 1669 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
| 1634 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | ||
| 1635 | int noswap = 0; | ||
| 1636 | |||
| 1637 | /* If we have no swap space, do not bother scanning anon pages. */ | ||
| 1638 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
| 1639 | noswap = 1; | ||
| 1640 | percent[0] = 0; | ||
| 1641 | percent[1] = 100; | ||
| 1642 | } else | ||
| 1643 | get_scan_ratio(zone, sc, percent); | ||
| 1644 | 1670 | ||
| 1645 | for_each_evictable_lru(l) { | 1671 | get_scan_count(zone, sc, nr, priority); |
| 1646 | int file = is_file_lru(l); | ||
| 1647 | unsigned long scan; | ||
| 1648 | 1672 | ||
| 1649 | scan = zone_nr_lru_pages(zone, sc, l); | 1673 | set_lumpy_reclaim_mode(priority, sc); |
| 1650 | if (priority || noswap) { | ||
| 1651 | scan >>= priority; | ||
| 1652 | scan = (scan * percent[file]) / 100; | ||
| 1653 | } | ||
| 1654 | nr[l] = nr_scan_try_batch(scan, | ||
| 1655 | &reclaim_stat->nr_saved_scan[l]); | ||
| 1656 | } | ||
| 1657 | 1674 | ||
| 1658 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1675 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
| 1659 | nr[LRU_INACTIVE_FILE]) { | 1676 | nr[LRU_INACTIVE_FILE]) { |
| @@ -1707,14 +1724,14 @@ static void shrink_zone(int priority, struct zone *zone, | |||
| 1707 | * If a zone is deemed to be full of pinned pages then just give it a light | 1724 | * If a zone is deemed to be full of pinned pages then just give it a light |
| 1708 | * scan then give up on it. | 1725 | * scan then give up on it. |
| 1709 | */ | 1726 | */ |
| 1710 | static void shrink_zones(int priority, struct zonelist *zonelist, | 1727 | static int shrink_zones(int priority, struct zonelist *zonelist, |
| 1711 | struct scan_control *sc) | 1728 | struct scan_control *sc) |
| 1712 | { | 1729 | { |
| 1713 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); | 1730 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); |
| 1714 | struct zoneref *z; | 1731 | struct zoneref *z; |
| 1715 | struct zone *zone; | 1732 | struct zone *zone; |
| 1733 | int progress = 0; | ||
| 1716 | 1734 | ||
| 1717 | sc->all_unreclaimable = 1; | ||
| 1718 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 1735 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, |
| 1719 | sc->nodemask) { | 1736 | sc->nodemask) { |
| 1720 | if (!populated_zone(zone)) | 1737 | if (!populated_zone(zone)) |
| @@ -1730,19 +1747,19 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
| 1730 | 1747 | ||
| 1731 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1748 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
| 1732 | continue; /* Let kswapd poll it */ | 1749 | continue; /* Let kswapd poll it */ |
| 1733 | sc->all_unreclaimable = 0; | ||
| 1734 | } else { | 1750 | } else { |
| 1735 | /* | 1751 | /* |
| 1736 | * Ignore cpuset limitation here. We just want to reduce | 1752 | * Ignore cpuset limitation here. We just want to reduce |
| 1737 | * # of used pages by us regardless of memory shortage. | 1753 | * # of used pages by us regardless of memory shortage. |
| 1738 | */ | 1754 | */ |
| 1739 | sc->all_unreclaimable = 0; | ||
| 1740 | mem_cgroup_note_reclaim_priority(sc->mem_cgroup, | 1755 | mem_cgroup_note_reclaim_priority(sc->mem_cgroup, |
| 1741 | priority); | 1756 | priority); |
| 1742 | } | 1757 | } |
| 1743 | 1758 | ||
| 1744 | shrink_zone(priority, zone, sc); | 1759 | shrink_zone(priority, zone, sc); |
| 1760 | progress = 1; | ||
| 1745 | } | 1761 | } |
| 1762 | return progress; | ||
| 1746 | } | 1763 | } |
| 1747 | 1764 | ||
| 1748 | /* | 1765 | /* |
| @@ -1774,6 +1791,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
| 1774 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); | 1791 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); |
| 1775 | unsigned long writeback_threshold; | 1792 | unsigned long writeback_threshold; |
| 1776 | 1793 | ||
| 1794 | get_mems_allowed(); | ||
| 1777 | delayacct_freepages_start(); | 1795 | delayacct_freepages_start(); |
| 1778 | 1796 | ||
| 1779 | if (scanning_global_lru(sc)) | 1797 | if (scanning_global_lru(sc)) |
| @@ -1795,7 +1813,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
| 1795 | sc->nr_scanned = 0; | 1813 | sc->nr_scanned = 0; |
| 1796 | if (!priority) | 1814 | if (!priority) |
| 1797 | disable_swap_token(); | 1815 | disable_swap_token(); |
| 1798 | shrink_zones(priority, zonelist, sc); | 1816 | ret = shrink_zones(priority, zonelist, sc); |
| 1799 | /* | 1817 | /* |
| 1800 | * Don't shrink slabs when reclaiming memory from | 1818 | * Don't shrink slabs when reclaiming memory from |
| 1801 | * over limit cgroups | 1819 | * over limit cgroups |
| @@ -1832,7 +1850,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
| 1832 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1850 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
| 1833 | } | 1851 | } |
| 1834 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 1852 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
| 1835 | if (!sc->all_unreclaimable && scanning_global_lru(sc)) | 1853 | if (ret && scanning_global_lru(sc)) |
| 1836 | ret = sc->nr_reclaimed; | 1854 | ret = sc->nr_reclaimed; |
| 1837 | out: | 1855 | out: |
| 1838 | /* | 1856 | /* |
| @@ -1857,6 +1875,7 @@ out: | |||
| 1857 | mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); | 1875 | mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); |
| 1858 | 1876 | ||
| 1859 | delayacct_freepages_end(); | 1877 | delayacct_freepages_end(); |
| 1878 | put_mems_allowed(); | ||
| 1860 | 1879 | ||
| 1861 | return ret; | 1880 | return ret; |
| 1862 | } | 1881 | } |
| @@ -1873,7 +1892,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
| 1873 | .swappiness = vm_swappiness, | 1892 | .swappiness = vm_swappiness, |
| 1874 | .order = order, | 1893 | .order = order, |
| 1875 | .mem_cgroup = NULL, | 1894 | .mem_cgroup = NULL, |
| 1876 | .isolate_pages = isolate_pages_global, | ||
| 1877 | .nodemask = nodemask, | 1895 | .nodemask = nodemask, |
| 1878 | }; | 1896 | }; |
| 1879 | 1897 | ||
| @@ -1894,7 +1912,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
| 1894 | .swappiness = swappiness, | 1912 | .swappiness = swappiness, |
| 1895 | .order = 0, | 1913 | .order = 0, |
| 1896 | .mem_cgroup = mem, | 1914 | .mem_cgroup = mem, |
| 1897 | .isolate_pages = mem_cgroup_isolate_pages, | ||
| 1898 | }; | 1915 | }; |
| 1899 | nodemask_t nm = nodemask_of_node(nid); | 1916 | nodemask_t nm = nodemask_of_node(nid); |
| 1900 | 1917 | ||
| @@ -1928,7 +1945,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
| 1928 | .swappiness = swappiness, | 1945 | .swappiness = swappiness, |
| 1929 | .order = 0, | 1946 | .order = 0, |
| 1930 | .mem_cgroup = mem_cont, | 1947 | .mem_cgroup = mem_cont, |
| 1931 | .isolate_pages = mem_cgroup_isolate_pages, | ||
| 1932 | .nodemask = NULL, /* we don't care the placement */ | 1948 | .nodemask = NULL, /* we don't care the placement */ |
| 1933 | }; | 1949 | }; |
| 1934 | 1950 | ||
| @@ -2006,7 +2022,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
| 2006 | .swappiness = vm_swappiness, | 2022 | .swappiness = vm_swappiness, |
| 2007 | .order = order, | 2023 | .order = order, |
| 2008 | .mem_cgroup = NULL, | 2024 | .mem_cgroup = NULL, |
| 2009 | .isolate_pages = isolate_pages_global, | ||
| 2010 | }; | 2025 | }; |
| 2011 | /* | 2026 | /* |
| 2012 | * temp_priority is used to remember the scanning priority at which | 2027 | * temp_priority is used to remember the scanning priority at which |
| @@ -2385,7 +2400,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
| 2385 | .hibernation_mode = 1, | 2400 | .hibernation_mode = 1, |
| 2386 | .swappiness = vm_swappiness, | 2401 | .swappiness = vm_swappiness, |
| 2387 | .order = 0, | 2402 | .order = 0, |
| 2388 | .isolate_pages = isolate_pages_global, | ||
| 2389 | }; | 2403 | }; |
| 2390 | struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); | 2404 | struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); |
| 2391 | struct task_struct *p = current; | 2405 | struct task_struct *p = current; |
| @@ -2570,7 +2584,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2570 | .gfp_mask = gfp_mask, | 2584 | .gfp_mask = gfp_mask, |
| 2571 | .swappiness = vm_swappiness, | 2585 | .swappiness = vm_swappiness, |
| 2572 | .order = order, | 2586 | .order = order, |
| 2573 | .isolate_pages = isolate_pages_global, | ||
| 2574 | }; | 2587 | }; |
| 2575 | unsigned long slab_reclaimable; | 2588 | unsigned long slab_reclaimable; |
| 2576 | 2589 | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index fa12ea3051fb..7759941d4e77 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/cpu.h> | 16 | #include <linux/cpu.h> |
| 17 | #include <linux/vmstat.h> | 17 | #include <linux/vmstat.h> |
| 18 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
| 19 | #include <linux/math64.h> | ||
| 19 | 20 | ||
| 20 | #ifdef CONFIG_VM_EVENT_COUNTERS | 21 | #ifdef CONFIG_VM_EVENT_COUNTERS |
| 21 | DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; | 22 | DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; |
| @@ -379,7 +380,86 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) | |||
| 379 | } | 380 | } |
| 380 | #endif | 381 | #endif |
| 381 | 382 | ||
| 382 | #ifdef CONFIG_PROC_FS | 383 | #ifdef CONFIG_COMPACTION |
| 384 | struct contig_page_info { | ||
| 385 | unsigned long free_pages; | ||
| 386 | unsigned long free_blocks_total; | ||
| 387 | unsigned long free_blocks_suitable; | ||
| 388 | }; | ||
| 389 | |||
| 390 | /* | ||
| 391 | * Calculate the number of free pages in a zone, how many contiguous | ||
| 392 | * pages are free and how many are large enough to satisfy an allocation of | ||
| 393 | * the target size. Note that this function makes no attempt to estimate | ||
| 394 | * how many suitable free blocks there *might* be if MOVABLE pages were | ||
| 395 | * migrated. Calculating that is possible, but expensive and can be | ||
| 396 | * figured out from userspace | ||
| 397 | */ | ||
| 398 | static void fill_contig_page_info(struct zone *zone, | ||
| 399 | unsigned int suitable_order, | ||
| 400 | struct contig_page_info *info) | ||
| 401 | { | ||
| 402 | unsigned int order; | ||
| 403 | |||
| 404 | info->free_pages = 0; | ||
| 405 | info->free_blocks_total = 0; | ||
| 406 | info->free_blocks_suitable = 0; | ||
| 407 | |||
| 408 | for (order = 0; order < MAX_ORDER; order++) { | ||
| 409 | unsigned long blocks; | ||
| 410 | |||
| 411 | /* Count number of free blocks */ | ||
| 412 | blocks = zone->free_area[order].nr_free; | ||
| 413 | info->free_blocks_total += blocks; | ||
| 414 | |||
| 415 | /* Count free base pages */ | ||
| 416 | info->free_pages += blocks << order; | ||
| 417 | |||
| 418 | /* Count the suitable free blocks */ | ||
| 419 | if (order >= suitable_order) | ||
| 420 | info->free_blocks_suitable += blocks << | ||
| 421 | (order - suitable_order); | ||
| 422 | } | ||
| 423 | } | ||
| 424 | |||
| 425 | /* | ||
| 426 | * A fragmentation index only makes sense if an allocation of a requested | ||
| 427 | * size would fail. If that is true, the fragmentation index indicates | ||
| 428 | * whether external fragmentation or a lack of memory was the problem. | ||
| 429 | * The value can be used to determine if page reclaim or compaction | ||
| 430 | * should be used | ||
| 431 | */ | ||
| 432 | static int __fragmentation_index(unsigned int order, struct contig_page_info *info) | ||
| 433 | { | ||
| 434 | unsigned long requested = 1UL << order; | ||
| 435 | |||
| 436 | if (!info->free_blocks_total) | ||
| 437 | return 0; | ||
| 438 | |||
| 439 | /* Fragmentation index only makes sense when a request would fail */ | ||
| 440 | if (info->free_blocks_suitable) | ||
| 441 | return -1000; | ||
| 442 | |||
| 443 | /* | ||
| 444 | * Index is between 0 and 1 so return within 3 decimal places | ||
| 445 | * | ||
| 446 | * 0 => allocation would fail due to lack of memory | ||
| 447 | * 1 => allocation would fail due to fragmentation | ||
| 448 | */ | ||
| 449 | return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); | ||
| 450 | } | ||
| 451 | |||
| 452 | /* Same as __fragmentation index but allocs contig_page_info on stack */ | ||
| 453 | int fragmentation_index(struct zone *zone, unsigned int order) | ||
| 454 | { | ||
| 455 | struct contig_page_info info; | ||
| 456 | |||
| 457 | fill_contig_page_info(zone, order, &info); | ||
| 458 | return __fragmentation_index(order, &info); | ||
| 459 | } | ||
| 460 | #endif | ||
| 461 | |||
| 462 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION) | ||
| 383 | #include <linux/proc_fs.h> | 463 | #include <linux/proc_fs.h> |
| 384 | #include <linux/seq_file.h> | 464 | #include <linux/seq_file.h> |
| 385 | 465 | ||
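fill_contig_page_info() and __fragmentation_index() above define the fragmentation index that is later exported through debugfs. The formula can be checked in isolation; the zone figures below are invented purely to illustrate the two failure modes (fragmentation versus a plain lack of memory).

#include <stdio.h>
#include <stdint.h>

/* Userspace restatement of __fragmentation_index() with explicit inputs. */
static int fragmentation_index(unsigned int order, uint64_t free_pages,
			       uint64_t free_blocks_total,
			       uint64_t free_blocks_suitable)
{
	uint64_t requested = 1ULL << order;

	if (!free_blocks_total)
		return 0;
	if (free_blocks_suitable)
		return -1000;	/* at least one block is big enough: no failure */

	return 1000 - (int)((1000 + free_pages * 1000 / requested)
			    / free_blocks_total);
}

int main(void)
{
	/* Order-3 request (8 pages). 64 free pages split into 64 order-0
	 * blocks: plenty of memory, all of it fragmented -> index 860 (0.860). */
	printf("fragmented:    %d\n", fragmentation_index(3, 64, 64, 0));

	/* Only 2 free pages in 2 order-0 blocks: the failure is mostly a
	 * shortage of memory, so the index drops to 375 (0.375). */
	printf("short on mem:  %d\n", fragmentation_index(3, 2, 2, 0));

	/* At least one order-3 block free: the request would not fail. */
	printf("would succeed: %d\n", fragmentation_index(3, 64, 15, 1));
	return 0;
}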
| @@ -432,7 +512,9 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | |||
| 432 | spin_unlock_irqrestore(&zone->lock, flags); | 512 | spin_unlock_irqrestore(&zone->lock, flags); |
| 433 | } | 513 | } |
| 434 | } | 514 | } |
| 515 | #endif | ||
| 435 | 516 | ||
| 517 | #ifdef CONFIG_PROC_FS | ||
| 436 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, | 518 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, |
| 437 | struct zone *zone) | 519 | struct zone *zone) |
| 438 | { | 520 | { |
| @@ -693,6 +775,16 @@ static const char * const vmstat_text[] = { | |||
| 693 | "allocstall", | 775 | "allocstall", |
| 694 | 776 | ||
| 695 | "pgrotated", | 777 | "pgrotated", |
| 778 | |||
| 779 | #ifdef CONFIG_COMPACTION | ||
| 780 | "compact_blocks_moved", | ||
| 781 | "compact_pages_moved", | ||
| 782 | "compact_pagemigrate_failed", | ||
| 783 | "compact_stall", | ||
| 784 | "compact_fail", | ||
| 785 | "compact_success", | ||
| 786 | #endif | ||
| 787 | |||
| 696 | #ifdef CONFIG_HUGETLB_PAGE | 788 | #ifdef CONFIG_HUGETLB_PAGE |
| 697 | "htlb_buddy_alloc_success", | 789 | "htlb_buddy_alloc_success", |
| 698 | "htlb_buddy_alloc_fail", | 790 | "htlb_buddy_alloc_fail", |
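The six compact_* strings added to vmstat_text above become new lines in /proc/vmstat on kernels built with CONFIG_COMPACTION. A small reader that picks them out, assuming nothing beyond the standard one-"name value"-per-line format of /proc/vmstat:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	/* Print only the compaction counters introduced by this patch set. */
	while (fgets(line, sizeof(line), f))
		if (strncmp(line, "compact_", 8) == 0)
			fputs(line, stdout);
	fclose(f);
	return 0;
}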
| @@ -954,3 +1046,162 @@ static int __init setup_vmstat(void) | |||
| 954 | return 0; | 1046 | return 0; |
| 955 | } | 1047 | } |
| 956 | module_init(setup_vmstat) | 1048 | module_init(setup_vmstat) |
| 1049 | |||
| 1050 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) | ||
| 1051 | #include <linux/debugfs.h> | ||
| 1052 | |||
| 1053 | static struct dentry *extfrag_debug_root; | ||
| 1054 | |||
| 1055 | /* | ||
| 1056 | * Return an index indicating how much of the available free memory is | ||
| 1057 | * unusable for an allocation of the requested size. | ||
| 1058 | */ | ||
| 1059 | static int unusable_free_index(unsigned int order, | ||
| 1060 | struct contig_page_info *info) | ||
| 1061 | { | ||
| 1062 | /* No free memory is interpreted as all free memory is unusable */ | ||
| 1063 | if (info->free_pages == 0) | ||
| 1064 | return 1000; | ||
| 1065 | |||
| 1066 | /* | ||
| 1067 | * Index should be a value between 0 and 1. Return a value to 3 | ||
| 1068 | * decimal places. | ||
| 1069 | * | ||
| 1070 | * 0 => no fragmentation | ||
| 1071 | * 1 => high fragmentation | ||
| 1072 | */ | ||
| 1073 | return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages); | ||
| 1074 | |||
| 1075 | } | ||
| 1076 | |||
| 1077 | static void unusable_show_print(struct seq_file *m, | ||
| 1078 | pg_data_t *pgdat, struct zone *zone) | ||
| 1079 | { | ||
| 1080 | unsigned int order; | ||
| 1081 | int index; | ||
| 1082 | struct contig_page_info info; | ||
| 1083 | |||
| 1084 | seq_printf(m, "Node %d, zone %8s ", | ||
| 1085 | pgdat->node_id, | ||
| 1086 | zone->name); | ||
| 1087 | for (order = 0; order < MAX_ORDER; ++order) { | ||
| 1088 | fill_contig_page_info(zone, order, &info); | ||
| 1089 | index = unusable_free_index(order, &info); | ||
| 1090 | seq_printf(m, "%d.%03d ", index / 1000, index % 1000); | ||
| 1091 | } | ||
| 1092 | |||
| 1093 | seq_putc(m, '\n'); | ||
| 1094 | } | ||
| 1095 | |||
| 1096 | /* | ||
| 1097 | * Display unusable free space index | ||
| 1098 | * | ||
| 1099 | * The unusable free space index measures how much of the available free | ||
| 1100 | * memory cannot be used to satisfy an allocation of a given size and is a | ||
| 1101 | * value between 0 and 1. The higher the value, the more of free memory is | ||
| 1102 | * unusable and by implication, the worse the external fragmentation is. This | ||
| 1103 | * can be expressed as a percentage by multiplying by 100. | ||
| 1104 | */ | ||
| 1105 | static int unusable_show(struct seq_file *m, void *arg) | ||
| 1106 | { | ||
| 1107 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 1108 | |||
| 1109 | /* check memoryless node */ | ||
| 1110 | if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) | ||
| 1111 | return 0; | ||
| 1112 | |||
| 1113 | walk_zones_in_node(m, pgdat, unusable_show_print); | ||
| 1114 | |||
| 1115 | return 0; | ||
| 1116 | } | ||
| 1117 | |||
| 1118 | static const struct seq_operations unusable_op = { | ||
| 1119 | .start = frag_start, | ||
| 1120 | .next = frag_next, | ||
| 1121 | .stop = frag_stop, | ||
| 1122 | .show = unusable_show, | ||
| 1123 | }; | ||
| 1124 | |||
| 1125 | static int unusable_open(struct inode *inode, struct file *file) | ||
| 1126 | { | ||
| 1127 | return seq_open(file, &unusable_op); | ||
| 1128 | } | ||
| 1129 | |||
| 1130 | static const struct file_operations unusable_file_ops = { | ||
| 1131 | .open = unusable_open, | ||
| 1132 | .read = seq_read, | ||
| 1133 | .llseek = seq_lseek, | ||
| 1134 | .release = seq_release, | ||
| 1135 | }; | ||
| 1136 | |||
| 1137 | static void extfrag_show_print(struct seq_file *m, | ||
| 1138 | pg_data_t *pgdat, struct zone *zone) | ||
| 1139 | { | ||
| 1140 | unsigned int order; | ||
| 1141 | int index; | ||
| 1142 | |||
| 1143 | /* Alloc on stack as interrupts are disabled for zone walk */ | ||
| 1144 | struct contig_page_info info; | ||
| 1145 | |||
| 1146 | seq_printf(m, "Node %d, zone %8s ", | ||
| 1147 | pgdat->node_id, | ||
| 1148 | zone->name); | ||
| 1149 | for (order = 0; order < MAX_ORDER; ++order) { | ||
| 1150 | fill_contig_page_info(zone, order, &info); | ||
| 1151 | index = __fragmentation_index(order, &info); | ||
| 1152 | seq_printf(m, "%d.%03d ", index / 1000, index % 1000); | ||
| 1153 | } | ||
| 1154 | |||
| 1155 | seq_putc(m, '\n'); | ||
| 1156 | } | ||
| 1157 | |||
| 1158 | /* | ||
| 1159 | * Display fragmentation index for orders that allocations would fail for | ||
| 1160 | */ | ||
| 1161 | static int extfrag_show(struct seq_file *m, void *arg) | ||
| 1162 | { | ||
| 1163 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 1164 | |||
| 1165 | walk_zones_in_node(m, pgdat, extfrag_show_print); | ||
| 1166 | |||
| 1167 | return 0; | ||
| 1168 | } | ||
| 1169 | |||
| 1170 | static const struct seq_operations extfrag_op = { | ||
| 1171 | .start = frag_start, | ||
| 1172 | .next = frag_next, | ||
| 1173 | .stop = frag_stop, | ||
| 1174 | .show = extfrag_show, | ||
| 1175 | }; | ||
| 1176 | |||
| 1177 | static int extfrag_open(struct inode *inode, struct file *file) | ||
| 1178 | { | ||
| 1179 | return seq_open(file, &extfrag_op); | ||
| 1180 | } | ||
| 1181 | |||
| 1182 | static const struct file_operations extfrag_file_ops = { | ||
| 1183 | .open = extfrag_open, | ||
| 1184 | .read = seq_read, | ||
| 1185 | .llseek = seq_lseek, | ||
| 1186 | .release = seq_release, | ||
| 1187 | }; | ||
| 1188 | |||
| 1189 | static int __init extfrag_debug_init(void) | ||
| 1190 | { | ||
| 1191 | extfrag_debug_root = debugfs_create_dir("extfrag", NULL); | ||
| 1192 | if (!extfrag_debug_root) | ||
| 1193 | return -ENOMEM; | ||
| 1194 | |||
| 1195 | if (!debugfs_create_file("unusable_index", 0444, | ||
| 1196 | extfrag_debug_root, NULL, &unusable_file_ops)) | ||
| 1197 | return -ENOMEM; | ||
| 1198 | |||
| 1199 | if (!debugfs_create_file("extfrag_index", 0444, | ||
| 1200 | extfrag_debug_root, NULL, &extfrag_file_ops)) | ||
| 1201 | return -ENOMEM; | ||
| 1202 | |||
| 1203 | return 0; | ||
| 1204 | } | ||
| 1205 | |||
| 1206 | module_init(extfrag_debug_init); | ||
| 1207 | #endif | ||
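unusable_free_index() above reports, per order, how much of a zone's free memory cannot serve an allocation of that size. The arithmetic can be reproduced on its own; the zone figures used here are made up for illustration.

#include <stdio.h>
#include <stdint.h>

/* Userspace restatement of unusable_free_index() with explicit inputs. */
static int unusable_free_index(unsigned int order, uint64_t free_pages,
			       uint64_t free_blocks_suitable)
{
	if (free_pages == 0)
		return 1000;	/* no free memory: all of it counts as unusable */

	return (int)((free_pages - (free_blocks_suitable << order)) * 1000
		     / free_pages);
}

int main(void)
{
	/* 1000 free pages, of which only 20 order-4 (16-page) blocks survive:
	 * 320 pages are usable for an order-4 request, 680 are not -> 0.680. */
	int index = unusable_free_index(4, 1000, 20);

	printf("unusable_index(order=4) = %d.%03d\n", index / 1000, index % 1000);
	return 0;
}

With CONFIG_DEBUG_FS and CONFIG_COMPACTION enabled, the per-zone, per-order values computed this way are exported as /sys/kernel/debug/extfrag/unusable_index, and the fragmentation index from the earlier vmstat.c hunk as /sys/kernel/debug/extfrag/extfrag_index, one "Node N, zone X" row per zone (assuming debugfs is mounted at the usual /sys/kernel/debug).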
