Diffstat (limited to 'mm')

 -rw-r--r--  mm/Kconfig           |   18
 -rw-r--r--  mm/Makefile          |    3
 -rw-r--r--  mm/bootmem.c         |    2
 -rw-r--r--  mm/bounce.c          |    2
 -rw-r--r--  mm/fadvise.c         |    2
 -rw-r--r--  mm/filemap.c         |  297
 -rw-r--r--  mm/fremap.c          |   27
 -rw-r--r--  mm/highmem.c         |    5
 -rw-r--r--  mm/hugetlb.c         |  124
 -rw-r--r--  mm/internal.h        |  160
 -rw-r--r--  mm/memcontrol.c      |  465
 -rw-r--r--  mm/memory.c          |  125
 -rw-r--r--  mm/memory_hotplug.c  |   19
 -rw-r--r--  mm/mempolicy.c       |   29
 -rw-r--r--  mm/migrate.c         |  276
 -rw-r--r--  mm/mlock.c           |  439
 -rw-r--r--  mm/mmap.c            |   82
 -rw-r--r--  mm/mremap.c          |    8
 -rw-r--r--  mm/nommu.c           |   47
 -rw-r--r--  mm/oom_kill.c        |    3
 -rw-r--r--  mm/page-writeback.c  |   22
 -rw-r--r--  mm/page_alloc.c      |  155
 -rw-r--r--  mm/page_cgroup.c     |  256
 -rw-r--r--  mm/page_isolation.c  |    5
 -rw-r--r--  mm/pdflush.c         |    2
 -rw-r--r--  mm/readahead.c       |    4
 -rw-r--r--  mm/rmap.c            |  319
 -rw-r--r--  mm/shmem.c           |   20
 -rw-r--r--  mm/slab.c            |   52
 -rw-r--r--  mm/slub.c            |   29
 -rw-r--r--  mm/sparse-vmemmap.c  |    2
 -rw-r--r--  mm/swap.c            |  172
 -rw-r--r--  mm/swap_state.c      |   11
 -rw-r--r--  mm/swapfile.c        |   27
 -rw-r--r--  mm/tiny-shmem.c      |    1
 -rw-r--r--  mm/truncate.c        |    6
 -rw-r--r--  mm/vmalloc.c         | 1038
 -rw-r--r--  mm/vmscan.c          |  991
 -rw-r--r--  mm/vmstat.c          |  102

 39 files changed, 3940 insertions(+), 1407 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 0bd9c2dbb2a0..5b5790f8a816 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -101,7 +101,7 @@ config HAVE_MEMORY_PRESENT
 # with gcc 3.4 and later.
 #
 config SPARSEMEM_STATIC
-	def_bool n
+	bool
 
 #
 # Architecture platforms which require a two level mem_section in SPARSEMEM
@@ -113,7 +113,7 @@ config SPARSEMEM_EXTREME
 	depends on SPARSEMEM && !SPARSEMEM_STATIC
 
 config SPARSEMEM_VMEMMAP_ENABLE
-	def_bool n
+	bool
 
 config SPARSEMEM_VMEMMAP
 	bool "Sparse Memory virtual memmap"
@@ -187,6 +187,9 @@ config RESOURCES_64BIT
 	help
 	  This option allows memory and IO resources to be 64 bit.
 
+config PHYS_ADDR_T_64BIT
+	def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
+
 config ZONE_DMA_FLAG
 	int
 	default "0" if !ZONE_DMA
@@ -206,5 +209,16 @@ config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
 
+config UNEVICTABLE_LRU
+	bool "Add LRU list to track non-evictable pages"
+	default y
+	depends on MMU
+	help
+	  Keeps unevictable pages off of the active and inactive pageout
+	  lists, so kswapd will not waste CPU time or have its balancing
+	  algorithms thrown off by scanning these pages.  Selecting this
+	  will use one page flag and increase the code size a little,
+	  say Y unless you know what you are doing.
+
 config MMU_NOTIFIER
 	bool
diff --git a/mm/Makefile b/mm/Makefile
index da4ccf015aea..c06b45a1ff5f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,5 +33,4 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
-
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ad8eec6e44a8..ac5a891f142a 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -48,7 +48,7 @@ early_param("bootmem_debug", bootmem_debug_setup);
 	if (unlikely(bootmem_debug))			\
 		printk(KERN_INFO			\
 			"bootmem::%s " fmt,		\
-			__FUNCTION__, ## args);		\
+			__func__, ## args);		\
 })
 
 static unsigned long __init bootmap_bytes(unsigned long pages)
diff --git a/mm/bounce.c b/mm/bounce.c
index b6d2d0f1019b..06722c403058 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	/*
 	 * Data-less bio, nothing to bounce
 	 */
-	if (bio_empty_barrier(*bio_orig))
+	if (!bio_has_data(*bio_orig))
 		return;
 
 	/*
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 343cfdfebd9e..a1da969bd980 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2002, Linus Torvalds
  *
- * 11Jan2003	akpm@digeo.com
+ * 11Jan2003	Andrew Morton
  *		Initial version.
  */
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 876bc595d0f8..f3e5f8944d17 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
+#include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include "internal.h"
 
 /*
@@ -115,12 +116,12 @@ void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
-	mem_cgroup_uncharge_cache_page(page);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	BUG_ON(page_mapped(page));
+	mem_cgroup_uncharge_cache_page(page);
 
 	/*
 	 * Some filesystems seem to re-dirty the page even after
@@ -492,9 +493,24 @@ EXPORT_SYMBOL(add_to_page_cache_locked);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t offset, gfp_t gfp_mask)
 {
-	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-	if (ret == 0)
-		lru_cache_add(page);
+	int ret;
+
+	/*
+	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
+	 * before shmem_readpage has a chance to mark them as SwapBacked: they
+	 * need to go on the active_anon lru below, and mem_cgroup_cache_charge
+	 * (called in add_to_page_cache) needs to know where they're going too.
+	 */
+	if (mapping_cap_swap_backed(mapping))
+		SetPageSwapBacked(page);
+
+	ret = add_to_page_cache(page, mapping, offset, gfp_mask);
+	if (ret == 0) {
+		if (page_is_file_cache(page))
+			lru_cache_add_file(page);
+		else
+			lru_cache_add_active_anon(page);
+	}
 	return ret;
 }
 
@@ -557,17 +573,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
  * mechananism between PageLocked pages and PageWriteback pages is shared.
  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
  *
- * The first mb is necessary to safely close the critical section opened by the
- * test_and_set_bit() to lock the page; the second mb is necessary to enforce
- * ordering between the clear_bit and the read of the waitqueue (to avoid SMP
- * races with a parallel wait_on_page_locked()).
+ * The mb is necessary to enforce ordering between the clear_bit and the read
+ * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
  */
 void unlock_page(struct page *page)
 {
-	smp_mb__before_clear_bit();
-	if (!test_and_clear_bit(PG_locked, &page->flags))
-		BUG();
-	smp_mb__after_clear_bit();
+	VM_BUG_ON(!PageLocked(page));
+	clear_bit_unlock(PG_locked, &page->flags);
+	smp_mb__after_clear_bit();
 	wake_up_page(page, PG_locked);
 }
 EXPORT_SYMBOL(unlock_page);
@@ -1100,8 +1113,9 @@ page_ok:
 
 page_not_up_to_date:
 	/* Get exclusive access to the page ... */
-	if (lock_page_killable(page))
-		goto readpage_eio;
+	error = lock_page_killable(page);
+	if (unlikely(error))
+		goto readpage_error;
 
 page_not_up_to_date_locked:
 	/* Did it get truncated before we got the lock? */
@@ -1130,8 +1144,9 @@ readpage:
 	}
 
 	if (!PageUptodate(page)) {
-		if (lock_page_killable(page))
-			goto readpage_eio;
+		error = lock_page_killable(page);
+		if (unlikely(error))
+			goto readpage_error;
 		if (!PageUptodate(page)) {
 			if (page->mapping == NULL) {
 				/*
@@ -1143,15 +1158,14 @@ readpage:
 			}
 			unlock_page(page);
 			shrink_readahead_size_eio(filp, ra);
-			goto readpage_eio;
+			error = -EIO;
+			goto readpage_error;
 		}
 		unlock_page(page);
 	}
 
 	goto page_ok;
 
-readpage_eio:
-	error = -EIO;
 readpage_error:
 	/* UHHUH! A synchronous read error occurred. Report it */
 	desc->error = error;
@@ -1186,8 +1200,7 @@ out:
 	ra->prev_pos |= prev_offset;
 
 	*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
-	if (filp)
-		file_accessed(filp);
+	file_accessed(filp);
 }
 
 int file_read_actor(read_descriptor_t *desc, struct page *page,
@@ -2016,48 +2029,8 @@ int pagecache_write_begin(struct file *file, struct address_space *mapping,
 {
 	const struct address_space_operations *aops = mapping->a_ops;
 
-	if (aops->write_begin) {
-		return aops->write_begin(file, mapping, pos, len, flags,
+	return aops->write_begin(file, mapping, pos, len, flags,
 							pagep, fsdata);
-	} else {
-		int ret;
-		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-		struct inode *inode = mapping->host;
-		struct page *page;
-again:
-		page = __grab_cache_page(mapping, index);
-		*pagep = page;
-		if (!page)
-			return -ENOMEM;
-
-		if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
-			/*
-			 * There is no way to resolve a short write situation
-			 * for a !Uptodate page (except by double copying in
-			 * the caller done by generic_perform_write_2copy).
-			 *
-			 * Instead, we have to bring it uptodate here.
-			 */
-			ret = aops->readpage(file, page);
-			page_cache_release(page);
-			if (ret) {
-				if (ret == AOP_TRUNCATED_PAGE)
-					goto again;
-				return ret;
-			}
-			goto again;
-		}
-
-		ret = aops->prepare_write(file, page, offset, offset+len);
-		if (ret) {
-			unlock_page(page);
-			page_cache_release(page);
-			if (pos + len > inode->i_size)
-				vmtruncate(inode, inode->i_size);
-		}
-		return ret;
-	}
 }
 EXPORT_SYMBOL(pagecache_write_begin);
 
@@ -2066,32 +2039,9 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
 		struct page *page, void *fsdata)
 {
 	const struct address_space_operations *aops = mapping->a_ops;
-	int ret;
-
-	if (aops->write_end) {
-		mark_page_accessed(page);
-		ret = aops->write_end(file, mapping, pos, len, copied,
-					page, fsdata);
-	} else {
-		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-		struct inode *inode = mapping->host;
-
-		flush_dcache_page(page);
-		ret = aops->commit_write(file, page, offset, offset+len);
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
 
-		if (ret < 0) {
-			if (pos + len > inode->i_size)
-				vmtruncate(inode, inode->i_size);
-		} else if (ret > 0)
-			ret = min_t(size_t, copied, ret);
-		else
-			ret = copied;
-	}
-
-	return ret;
+	mark_page_accessed(page);
+	return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
 }
 EXPORT_SYMBOL(pagecache_write_end);
 
@@ -2213,174 +2163,6 @@ repeat:
 }
 EXPORT_SYMBOL(__grab_cache_page);
 
-static ssize_t generic_perform_write_2copy(struct file *file,
-				struct iov_iter *i, loff_t pos)
-{
-	struct address_space *mapping = file->f_mapping;
-	const struct address_space_operations *a_ops = mapping->a_ops;
-	struct inode *inode = mapping->host;
-	long status = 0;
-	ssize_t written = 0;
-
-	do {
-		struct page *src_page;
-		struct page *page;
-		pgoff_t index;		/* Pagecache index for current page */
-		unsigned long offset;	/* Offset into pagecache page */
-		unsigned long bytes;	/* Bytes to write to page */
-		size_t copied;		/* Bytes copied from user */
-
-		offset = (pos & (PAGE_CACHE_SIZE - 1));
-		index = pos >> PAGE_CACHE_SHIFT;
-		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
-						iov_iter_count(i));
-
-		/*
-		 * a non-NULL src_page indicates that we're doing the
-		 * copy via get_user_pages and kmap.
-		 */
-		src_page = NULL;
-
-		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
-		 *
-		 * Not only is this an optimisation, but it is also required
-		 * to check that the address is actually valid, when atomic
-		 * usercopies are used, below.
-		 */
-		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
-			status = -EFAULT;
-			break;
-		}
-
-		page = __grab_cache_page(mapping, index);
-		if (!page) {
-			status = -ENOMEM;
-			break;
-		}
-
-		/*
-		 * non-uptodate pages cannot cope with short copies, and we
-		 * cannot take a pagefault with the destination page locked.
-		 * So pin the source page to copy it.
-		 */
-		if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
-			unlock_page(page);
-
-			src_page = alloc_page(GFP_KERNEL);
-			if (!src_page) {
-				page_cache_release(page);
-				status = -ENOMEM;
-				break;
-			}
-
-			/*
-			 * Cannot get_user_pages with a page locked for the
-			 * same reason as we can't take a page fault with a
-			 * page locked (as explained below).
-			 */
-			copied = iov_iter_copy_from_user(src_page, i,
-								offset, bytes);
-			if (unlikely(copied == 0)) {
-				status = -EFAULT;
-				page_cache_release(page);
-				page_cache_release(src_page);
-				break;
-			}
-			bytes = copied;
-
-			lock_page(page);
-			/*
-			 * Can't handle the page going uptodate here, because
-			 * that means we would use non-atomic usercopies, which
-			 * zero out the tail of the page, which can cause
-			 * zeroes to become transiently visible. We could just
-			 * use a non-zeroing copy, but the APIs aren't too
-			 * consistent.
-			 */
-			if (unlikely(!page->mapping || PageUptodate(page))) {
-				unlock_page(page);
-				page_cache_release(page);
-				page_cache_release(src_page);
-				continue;
-			}
-		}
-
-		status = a_ops->prepare_write(file, page, offset, offset+bytes);
-		if (unlikely(status))
-			goto fs_write_aop_error;
-
-		if (!src_page) {
-			/*
-			 * Must not enter the pagefault handler here, because
-			 * we hold the page lock, so we might recursively
-			 * deadlock on the same lock, or get an ABBA deadlock
-			 * against a different lock, or against the mmap_sem
-			 * (which nests outside the page lock).  So increment
-			 * preempt count, and use _atomic usercopies.
-			 *
-			 * The page is uptodate so we are OK to encounter a
-			 * short copy: if unmodified parts of the page are
-			 * marked dirty and written out to disk, it doesn't
-			 * really matter.
-			 */
-			pagefault_disable();
-			copied = iov_iter_copy_from_user_atomic(page, i,
-								offset, bytes);
-			pagefault_enable();
-		} else {
-			void *src, *dst;
-			src = kmap_atomic(src_page, KM_USER0);
-			dst = kmap_atomic(page, KM_USER1);
-			memcpy(dst + offset, src + offset, bytes);
-			kunmap_atomic(dst, KM_USER1);
-			kunmap_atomic(src, KM_USER0);
-			copied = bytes;
-		}
-		flush_dcache_page(page);
-
-		status = a_ops->commit_write(file, page, offset, offset+bytes);
-		if (unlikely(status < 0))
-			goto fs_write_aop_error;
-		if (unlikely(status > 0)) /* filesystem did partial write */
-			copied = min_t(size_t, copied, status);
-
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
-		if (src_page)
-			page_cache_release(src_page);
-
-		iov_iter_advance(i, copied);
-		pos += copied;
-		written += copied;
-
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-		continue;
-
-fs_write_aop_error:
-		unlock_page(page);
-		page_cache_release(page);
-		if (src_page)
-			page_cache_release(src_page);
-
-		/*
-		 * prepare_write() may have instantiated a few blocks
-		 * outside i_size.  Trim these off again. Don't need
-		 * i_size_read because we hold i_mutex.
-		 */
-		if (pos + bytes > inode->i_size)
-			vmtruncate(inode, inode->i_size);
-		break;
-	} while (iov_iter_count(i));
-
-	return written ? written : status;
-}
-
 static ssize_t generic_perform_write(struct file *file,
 				struct iov_iter *i, loff_t pos)
 {
@@ -2481,10 +2263,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	struct iov_iter i;
 
 	iov_iter_init(&i, iov, nr_segs, count, written);
-	if (a_ops->write_begin)
-		status = generic_perform_write(file, &i, pos);
-	else
-		status = generic_perform_write_2copy(file, &i, pos);
+	status = generic_perform_write(file, &i, pos);
 
 	if (likely(status >= 0)) {
 		written += status;
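Note on the unlock_page() hunk above: it replaces an open-coded smp_mb__before_clear_bit() + test_and_clear_bit() with clear_bit_unlock(), i.e. the PG_locked bit is now cleared with release semantics, pairing with the acquiring test_and_set on the lock side. The following is only a rough user-space C11 sketch of that acquire/release bit-lock contract, not kernel code; the names page_trylock, page_unlock and PG_LOCKED_BIT are made up for the demo.

#include <stdatomic.h>
#include <stdio.h>

#define PG_LOCKED_BIT 0UL

static atomic_ulong page_flags;

/* try to take the "page lock" bit; acquire ordering on success (0) */
static int page_trylock(void)
{
	unsigned long old = atomic_fetch_or_explicit(&page_flags,
					1UL << PG_LOCKED_BIT,
					memory_order_acquire);
	return (old & (1UL << PG_LOCKED_BIT)) ? -1 : 0;
}

/* drop the bit with release ordering, so writes done while holding the
 * "lock" are visible to the next acquirer -- the property clear_bit_unlock()
 * provides in the kernel */
static void page_unlock(void)
{
	atomic_fetch_and_explicit(&page_flags, ~(1UL << PG_LOCKED_BIT),
				  memory_order_release);
}

int main(void)
{
	if (page_trylock() == 0) {
		/* critical section: e.g. update per-page state */
		page_unlock();
		puts("lock/unlock ordered by acquire/release");
	}
	return 0;
}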
diff --git a/mm/fremap.c b/mm/fremap.c
index 7881638e4a12..7d12ca70ef7b 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -21,6 +21,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include "internal.h"
+
 static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, pte_t *ptep)
 {
@@ -215,15 +217,31 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 		spin_unlock(&mapping->i_mmap_lock);
 	}
 
+	if (vma->vm_flags & VM_LOCKED) {
+		/*
+		 * drop PG_Mlocked flag for over-mapped range
+		 */
+		unsigned int saved_flags = vma->vm_flags;
+		munlock_vma_pages_range(vma, start, start + size);
+		vma->vm_flags = saved_flags;
+	}
+
 	mmu_notifier_invalidate_range_start(mm, start, start + size);
 	err = populate_range(mm, vma, start, size, pgoff);
 	mmu_notifier_invalidate_range_end(mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
-		if (unlikely(has_write_lock)) {
-			downgrade_write(&mm->mmap_sem);
-			has_write_lock = 0;
+		if (vma->vm_flags & VM_LOCKED) {
+			/*
+			 * might be mapping previously unmapped range of file
+			 */
+			mlock_vma_pages_range(vma, start, start + size);
+		} else {
+			if (unlikely(has_write_lock)) {
+				downgrade_write(&mm->mmap_sem);
+				has_write_lock = 0;
+			}
+			make_pages_present(start, start+size);
 		}
-		make_pages_present(start, start+size);
 	}
 
 	/*
@@ -240,4 +258,3 @@ out:
 
 	return err;
 }
-
diff --git a/mm/highmem.c b/mm/highmem.c
index e16e1523b688..b36b83b920ff 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -70,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
 static void flush_all_zero_pkmaps(void)
 {
 	int i;
+	int need_flush = 0;
 
 	flush_cache_kmaps();
 
@@ -101,8 +102,10 @@ static void flush_all_zero_pkmaps(void)
 					  &pkmap_page_table[i]);
 
 		set_page_address(page, NULL);
+		need_flush = 1;
 	}
-	flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+	if (need_flush)
+		flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
 }
 
 /**
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 67a71191136e..6058b53dcb89 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/seq_file.h>
 #include <linux/sysctl.h>
 #include <linux/highmem.h>
 #include <linux/mmu_notifier.h>
@@ -262,7 +263,7 @@ struct resv_map {
 	struct list_head regions;
 };
 
-struct resv_map *resv_map_alloc(void)
+static struct resv_map *resv_map_alloc(void)
 {
 	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
 	if (!resv_map)
@@ -274,7 +275,7 @@ struct resv_map *resv_map_alloc(void)
 	return resv_map;
 }
 
-void resv_map_release(struct kref *ref)
+static void resv_map_release(struct kref *ref)
 {
 	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
 
@@ -289,7 +290,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_SHARED))
 		return (struct resv_map *)(get_vma_private_data(vma) &
 							~HPAGE_RESV_MASK);
-	return 0;
+	return NULL;
 }
 
 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
@@ -353,11 +354,26 @@ static int vma_has_reserves(struct vm_area_struct *vma)
 	return 0;
 }
 
+static void clear_gigantic_page(struct page *page,
+			unsigned long addr, unsigned long sz)
+{
+	int i;
+	struct page *p = page;
+
+	might_sleep();
+	for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
+		cond_resched();
+		clear_user_highpage(p, addr + i * PAGE_SIZE);
+	}
+}
 static void clear_huge_page(struct page *page,
 			unsigned long addr, unsigned long sz)
 {
 	int i;
 
+	if (unlikely(sz > MAX_ORDER_NR_PAGES))
+		return clear_gigantic_page(page, addr, sz);
+
 	might_sleep();
 	for (i = 0; i < sz/PAGE_SIZE; i++) {
 		cond_resched();
@@ -365,12 +381,32 @@ static void clear_huge_page(struct page *page,
 	}
 }
 
+static void copy_gigantic_page(struct page *dst, struct page *src,
+			   unsigned long addr, struct vm_area_struct *vma)
+{
+	int i;
+	struct hstate *h = hstate_vma(vma);
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page(h); ) {
+		cond_resched();
+		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);
+		src = mem_map_next(src, src_base, i);
+	}
+}
 static void copy_huge_page(struct page *dst, struct page *src,
 			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 	struct hstate *h = hstate_vma(vma);
 
+	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
+		return copy_gigantic_page(dst, src, addr, vma);
+
 	might_sleep();
 	for (i = 0; i < pages_per_huge_page(h); i++) {
 		cond_resched();
@@ -455,6 +491,8 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 {
 	int i;
 
+	VM_BUG_ON(h->order >= MAX_ORDER);
+
 	h->nr_huge_pages--;
 	h->nr_huge_pages_node[page_to_nid(page)]--;
 	for (i = 0; i < pages_per_huge_page(h); i++) {
@@ -969,6 +1007,14 @@ found:
 	return 1;
 }
 
+static void prep_compound_huge_page(struct page *page, int order)
+{
+	if (unlikely(order > (MAX_ORDER - 1)))
+		prep_compound_gigantic_page(page, order);
+	else
+		prep_compound_page(page, order);
+}
+
 /* Put bootmem huge pages into the standard lists after mem_map is up */
 static void __init gather_bootmem_prealloc(void)
 {
@@ -979,7 +1025,7 @@ static void __init gather_bootmem_prealloc(void)
 		struct hstate *h = m->hstate;
 		__ClearPageReserved(page);
 		WARN_ON(page_count(page) != 1);
-		prep_compound_page(page, h->order);
+		prep_compound_huge_page(page, h->order);
 		prep_new_huge_page(h, page, page_to_nid(page));
 	}
 }
@@ -1455,15 +1501,15 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 
 #endif /* CONFIG_SYSCTL */
 
-int hugetlb_report_meminfo(char *buf)
+void hugetlb_report_meminfo(struct seq_file *m)
 {
 	struct hstate *h = &default_hstate;
-	return sprintf(buf,
+	seq_printf(m,
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free: %5lu\n"
 			"HugePages_Rsvd: %5lu\n"
 			"HugePages_Surp: %5lu\n"
-			"Hugepagesize: %5lu kB\n",
+			"Hugepagesize: %8lu kB\n",
 			h->nr_huge_pages,
 			h->free_huge_pages,
 			h->resv_huge_pages,
@@ -1747,11 +1793,10 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
  * from other VMAs and let the children be SIGKILLed if they are faulting the
  * same region.
  */
-int unmap_ref_private(struct mm_struct *mm,
-					struct vm_area_struct *vma,
-					struct page *page,
-					unsigned long address)
+static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
+				struct page *page, unsigned long address)
 {
+	struct hstate *h = hstate_vma(vma);
 	struct vm_area_struct *iter_vma;
 	struct address_space *mapping;
 	struct prio_tree_iter iter;
@@ -1761,7 +1806,7 @@ int unmap_ref_private(struct mm_struct *mm,
 	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
 	 * from page cache lookup which is in HPAGE_SIZE units.
 	 */
-	address = address & huge_page_mask(hstate_vma(vma));
+	address = address & huge_page_mask(h);
 	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
 		+ (vma->vm_pgoff >> PAGE_SHIFT);
 	mapping = (struct address_space *)page_private(page);
@@ -1780,7 +1825,7 @@
 		 */
 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
 			unmap_hugepage_range(iter_vma,
-				address, address + HPAGE_SIZE,
+				address, address + huge_page_size(h),
 				page);
 	}
 
@@ -2008,7 +2053,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	entry = huge_ptep_get(ptep);
 	if (huge_pte_none(entry)) {
 		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
-		goto out_unlock;
+		goto out_mutex;
 	}
 
 	ret = 0;
@@ -2024,7 +2069,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (write_access && !pte_write(entry)) {
 		if (vma_needs_reservation(h, vma, address) < 0) {
 			ret = VM_FAULT_OOM;
-			goto out_unlock;
+			goto out_mutex;
 		}
 
 		if (!(vma->vm_flags & VM_SHARED))
@@ -2034,10 +2079,23 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
-	if (likely(pte_same(entry, huge_ptep_get(ptep))))
-		if (write_access && !pte_write(entry))
+	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+		goto out_page_table_lock;
+
+
+	if (write_access) {
+		if (!pte_write(entry)) {
 			ret = hugetlb_cow(mm, vma, address, ptep, entry,
 							pagecache_page);
+			goto out_page_table_lock;
+		}
+		entry = pte_mkdirty(entry);
+	}
+	entry = pte_mkyoung(entry);
+	if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access))
+		update_mmu_cache(vma, address, entry);
+
+out_page_table_lock:
 	spin_unlock(&mm->page_table_lock);
 
 	if (pagecache_page) {
@@ -2045,7 +2103,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		put_page(pagecache_page);
 	}
 
-out_unlock:
+out_mutex:
 	mutex_unlock(&hugetlb_instantiation_mutex);
 
 	return ret;
@@ -2060,6 +2118,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
+static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
+{
+	if (!ptep || write || shared)
+		return 0;
+	else
+		return huge_pte_none(huge_ptep_get(ptep));
+}
+
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i,
@@ -2069,6 +2135,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long vaddr = *position;
 	int remainder = *length;
 	struct hstate *h = hstate_vma(vma);
+	int zeropage_ok = 0;
+	int shared = vma->vm_flags & VM_SHARED;
 
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
@@ -2081,8 +2149,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * first, for the page indexing below to work.
 		 */
 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+		if (huge_zeropage_ok(pte, write, shared))
+			zeropage_ok = 1;
 
-		if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
+		if (!pte ||
+		    (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
 		    (write && !pte_write(huge_ptep_get(pte)))) {
 			int ret;
 
@@ -2102,8 +2173,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page = pte_page(huge_ptep_get(pte));
 same_page:
 		if (pages) {
-			get_page(page);
-			pages[i] = page + pfn_offset;
+			if (zeropage_ok)
+				pages[i] = ZERO_PAGE(0);
+			else
+				pages[i] = mem_map_offset(page, pfn_offset);
+			get_page(pages[i]);
 		}
 
 		if (vmas)
diff --git a/mm/internal.h b/mm/internal.h
index 1f43f7416972..13333bc2eb68 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -17,6 +17,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
 
 extern void prep_compound_page(struct page *page, unsigned long order);
+extern void prep_compound_gigantic_page(struct page *page, unsigned long order);
 
 static inline void set_page_count(struct page *page, int v)
 {
@@ -39,6 +40,15 @@ static inline void __put_page(struct page *page)
 	atomic_dec(&page->_count);
 }
 
+/*
+ * in mm/vmscan.c:
+ */
+extern int isolate_lru_page(struct page *page);
+extern void putback_lru_page(struct page *page);
+
+/*
+ * in mm/page_alloc.c
+ */
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
@@ -52,6 +62,148 @@ static inline unsigned long page_order(struct page *page)
 	return page_private(page);
 }
 
+extern long mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end);
+extern void munlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end);
+static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
+{
+	munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
+}
+
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * unevictable_migrate_page() called only from migrate_page_copy() to
+ * migrate unevictable flag to new page.
+ * Note that the old page has been isolated from the LRU lists at this
+ * point so we don't need to worry about LRU statistics.
+ */
+static inline void unevictable_migrate_page(struct page *new, struct page *old)
+{
+	if (TestClearPageUnevictable(old))
+		SetPageUnevictable(new);
+}
+#else
+static inline void unevictable_migrate_page(struct page *new, struct page *old)
+{
+}
+#endif
+
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * Called only in fault path via page_evictable() for a new page
+ * to determine if it's being mapped into a LOCKED vma.
+ * If so, mark page as mlocked.
+ */
+static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
+{
+	VM_BUG_ON(PageLRU(page));
+
+	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
+		return 0;
+
+	if (!TestSetPageMlocked(page)) {
+		inc_zone_page_state(page, NR_MLOCK);
+		count_vm_event(UNEVICTABLE_PGMLOCKED);
+	}
+	return 1;
+}
+
+/*
+ * must be called with vma's mmap_sem held for read, and page locked.
+ */
+extern void mlock_vma_page(struct page *page);
+
+/*
+ * Clear the page's PageMlocked().  This can be useful in a situation where
+ * we want to unconditionally remove a page from the pagecache -- e.g.,
+ * on truncation or freeing.
+ *
+ * It is legal to call this function for any page, mlocked or not.
+ * If called for a page that is still mapped by mlocked vmas, all we do
+ * is revert to lazy LRU behaviour -- semantics are not broken.
+ */
+extern void __clear_page_mlock(struct page *page);
+static inline void clear_page_mlock(struct page *page)
+{
+	if (unlikely(TestClearPageMlocked(page)))
+		__clear_page_mlock(page);
+}
+
+/*
+ * mlock_migrate_page - called only from migrate_page_copy() to
+ * migrate the Mlocked page flag; update statistics.
+ */
+static inline void mlock_migrate_page(struct page *newpage, struct page *page)
+{
+	if (TestClearPageMlocked(page)) {
+		unsigned long flags;
+
+		local_irq_save(flags);
+		__dec_zone_page_state(page, NR_MLOCK);
+		SetPageMlocked(newpage);
+		__inc_zone_page_state(newpage, NR_MLOCK);
+		local_irq_restore(flags);
+	}
+}
+
+/*
+ * free_page_mlock() -- clean up attempts to free and mlocked() page.
+ * Page should not be on lru, so no need to fix that up.
+ * free_pages_check() will verify...
+ */
+static inline void free_page_mlock(struct page *page)
+{
+	if (unlikely(TestClearPageMlocked(page))) {
+		unsigned long flags;
+
+		local_irq_save(flags);
+		__dec_zone_page_state(page, NR_MLOCK);
+		__count_vm_event(UNEVICTABLE_MLOCKFREED);
+		local_irq_restore(flags);
+	}
+}
+
+#else /* CONFIG_UNEVICTABLE_LRU */
+static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
+{
+	return 0;
+}
+static inline void clear_page_mlock(struct page *page) { }
+static inline void mlock_vma_page(struct page *page) { }
+static inline void mlock_migrate_page(struct page *new, struct page *old) { }
+static inline void free_page_mlock(struct page *page) { }
+
+#endif /* CONFIG_UNEVICTABLE_LRU */
+
+/*
+ * Return the mem_map entry representing the 'offset' subpage within
+ * the maximally aligned gigantic page 'base'.  Handle any discontiguity
+ * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
+ */
+static inline struct page *mem_map_offset(struct page *base, int offset)
+{
+	if (unlikely(offset >= MAX_ORDER_NR_PAGES))
+		return pfn_to_page(page_to_pfn(base) + offset);
+	return base + offset;
+}
+
+/*
+ * Iterator over all subpages withing the maximally aligned gigantic
+ * page 'base'.  Handle any discontiguity in the mem_map.
+ */
+static inline struct page *mem_map_next(struct page *iter,
+						struct page *base, int offset)
+{
+	if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
+		unsigned long pfn = page_to_pfn(base) + offset;
+		if (!pfn_valid(pfn))
+			return NULL;
+		return pfn_to_page(pfn);
+	}
+	return iter + 1;
+}
+
 /*
  * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
  * so all functions starting at paging_init should be marked __init
@@ -120,4 +272,12 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 }
 #endif /* CONFIG_SPARSEMEM */
 
+#define GUP_FLAGS_WRITE			0x1
+#define GUP_FLAGS_FORCE			0x2
+#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long start, int len, int flags,
+		     struct page **pages, struct vm_area_struct **vmas);
+
 #endif
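The mem_map_offset()/mem_map_next() helpers added to mm/internal.h above exist because the struct page array is only guaranteed contiguous within MAX_ORDER_NR_PAGES-sized blocks, so walks over the subpages of a gigantic huge page (as in clear_gigantic_page()/copy_gigantic_page() in the mm/hugetlb.c hunk) must re-resolve the pointer from the pfn whenever they cross such a boundary. The following is a stand-alone user-space sketch of the same idea only; the chunked "mem_map", the tiny demo sizes, and the toy pfn_to_page()/page_to_pfn() implementations are all assumptions made for illustration, not the kernel's.

#include <stdio.h>
#include <stdlib.h>

#define MAX_ORDER_NR_PAGES 4	/* tiny chunk size for the demo */
#define NCHUNKS 3

struct page { unsigned long pfn; };

/* separately allocated chunks stand in for a discontiguous mem_map */
static struct page *chunks[NCHUNKS];

static struct page *pfn_to_page(unsigned long pfn)
{
	return &chunks[pfn / MAX_ORDER_NR_PAGES][pfn % MAX_ORDER_NR_PAGES];
}

static unsigned long page_to_pfn(struct page *p)
{
	return p->pfn;
}

/* same shape as the kernel helper: stay with 'iter + 1' inside a chunk,
 * re-resolve from the pfn when 'offset' lands on a chunk boundary */
static struct page *mem_map_next(struct page *iter, struct page *base,
				 int offset)
{
	if ((offset % MAX_ORDER_NR_PAGES) == 0)
		return pfn_to_page(page_to_pfn(base) + offset);
	return iter + 1;
}

int main(void)
{
	int i, n = MAX_ORDER_NR_PAGES * NCHUNKS;
	struct page *base, *p;

	for (i = 0; i < NCHUNKS; i++) {
		int j;
		chunks[i] = malloc(sizeof(struct page) * MAX_ORDER_NR_PAGES);
		for (j = 0; j < MAX_ORDER_NR_PAGES; j++)
			chunks[i][j].pfn = i * MAX_ORDER_NR_PAGES + j;
	}

	/* walk all subpages of the "gigantic page" starting at pfn 0 */
	base = pfn_to_page(0);
	p = base;
	for (i = 0; i < n; i++) {
		printf("subpage %d -> pfn %lu\n", i, page_to_pfn(p));
		if (i + 1 < n)
			p = mem_map_next(p, base, i + 1);
	}

	for (i = 0; i < NCHUNKS; i++)
		free(chunks[i]);
	return 0;
}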
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 36896f3eb7f5..866dcc7eeb0c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -32,11 +32,12 @@ | |||
| 32 | #include <linux/fs.h> | 32 | #include <linux/fs.h> |
| 33 | #include <linux/seq_file.h> | 33 | #include <linux/seq_file.h> |
| 34 | #include <linux/vmalloc.h> | 34 | #include <linux/vmalloc.h> |
| 35 | #include <linux/mm_inline.h> | ||
| 36 | #include <linux/page_cgroup.h> | ||
| 35 | 37 | ||
| 36 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
| 37 | 39 | ||
| 38 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 40 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
| 39 | static struct kmem_cache *page_cgroup_cache __read_mostly; | ||
| 40 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 41 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
| 41 | 42 | ||
| 42 | /* | 43 | /* |
| @@ -65,11 +66,10 @@ struct mem_cgroup_stat { | |||
| 65 | /* | 66 | /* |
| 66 | * For accounting under irq disable, no need for increment preempt count. | 67 | * For accounting under irq disable, no need for increment preempt count. |
| 67 | */ | 68 | */ |
| 68 | static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat, | 69 | static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, |
| 69 | enum mem_cgroup_stat_index idx, int val) | 70 | enum mem_cgroup_stat_index idx, int val) |
| 70 | { | 71 | { |
| 71 | int cpu = smp_processor_id(); | 72 | stat->count[idx] += val; |
| 72 | stat->cpustat[cpu].count[idx] += val; | ||
| 73 | } | 73 | } |
| 74 | 74 | ||
| 75 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | 75 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, |
| @@ -85,22 +85,13 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | |||
| 85 | /* | 85 | /* |
| 86 | * per-zone information in memory controller. | 86 | * per-zone information in memory controller. |
| 87 | */ | 87 | */ |
| 88 | |||
| 89 | enum mem_cgroup_zstat_index { | ||
| 90 | MEM_CGROUP_ZSTAT_ACTIVE, | ||
| 91 | MEM_CGROUP_ZSTAT_INACTIVE, | ||
| 92 | |||
| 93 | NR_MEM_CGROUP_ZSTAT, | ||
| 94 | }; | ||
| 95 | |||
| 96 | struct mem_cgroup_per_zone { | 88 | struct mem_cgroup_per_zone { |
| 97 | /* | 89 | /* |
| 98 | * spin_lock to protect the per cgroup LRU | 90 | * spin_lock to protect the per cgroup LRU |
| 99 | */ | 91 | */ |
| 100 | spinlock_t lru_lock; | 92 | spinlock_t lru_lock; |
| 101 | struct list_head active_list; | 93 | struct list_head lists[NR_LRU_LISTS]; |
| 102 | struct list_head inactive_list; | 94 | unsigned long count[NR_LRU_LISTS]; |
| 103 | unsigned long count[NR_MEM_CGROUP_ZSTAT]; | ||
| 104 | }; | 95 | }; |
| 105 | /* Macro for accessing counter */ | 96 | /* Macro for accessing counter */ |
| 106 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | 97 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) |
| @@ -144,69 +135,52 @@ struct mem_cgroup { | |||
| 144 | }; | 135 | }; |
| 145 | static struct mem_cgroup init_mem_cgroup; | 136 | static struct mem_cgroup init_mem_cgroup; |
| 146 | 137 | ||
| 147 | /* | ||
| 148 | * We use the lower bit of the page->page_cgroup pointer as a bit spin | ||
| 149 | * lock. We need to ensure that page->page_cgroup is at least two | ||
| 150 | * byte aligned (based on comments from Nick Piggin). But since | ||
| 151 | * bit_spin_lock doesn't actually set that lock bit in a non-debug | ||
| 152 | * uniprocessor kernel, we should avoid setting it here too. | ||
| 153 | */ | ||
| 154 | #define PAGE_CGROUP_LOCK_BIT 0x0 | ||
| 155 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) | ||
| 156 | #define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) | ||
| 157 | #else | ||
| 158 | #define PAGE_CGROUP_LOCK 0x0 | ||
| 159 | #endif | ||
| 160 | |||
| 161 | /* | ||
| 162 | * A page_cgroup page is associated with every page descriptor. The | ||
| 163 | * page_cgroup helps us identify information about the cgroup | ||
| 164 | */ | ||
| 165 | struct page_cgroup { | ||
| 166 | struct list_head lru; /* per cgroup LRU list */ | ||
| 167 | struct page *page; | ||
| 168 | struct mem_cgroup *mem_cgroup; | ||
| 169 | int flags; | ||
| 170 | }; | ||
| 171 | #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ | ||
| 172 | #define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ | ||
| 173 | |||
| 174 | static int page_cgroup_nid(struct page_cgroup *pc) | ||
| 175 | { | ||
| 176 | return page_to_nid(pc->page); | ||
| 177 | } | ||
| 178 | |||
| 179 | static enum zone_type page_cgroup_zid(struct page_cgroup *pc) | ||
| 180 | { | ||
| 181 | return page_zonenum(pc->page); | ||
| 182 | } | ||
| 183 | |||
| 184 | enum charge_type { | 138 | enum charge_type { |
| 185 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 139 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
| 186 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 140 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
| 141 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ | ||
| 187 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | 142 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ |
| 143 | NR_CHARGE_TYPE, | ||
| 144 | }; | ||
| 145 | |||
| 146 | /* only for here (for easy reading.) */ | ||
| 147 | #define PCGF_CACHE (1UL << PCG_CACHE) | ||
| 148 | #define PCGF_USED (1UL << PCG_USED) | ||
| 149 | #define PCGF_ACTIVE (1UL << PCG_ACTIVE) | ||
| 150 | #define PCGF_LOCK (1UL << PCG_LOCK) | ||
| 151 | #define PCGF_FILE (1UL << PCG_FILE) | ||
| 152 | static const unsigned long | ||
| 153 | pcg_default_flags[NR_CHARGE_TYPE] = { | ||
| 154 | PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ | ||
| 155 | PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ | ||
| 156 | PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ | ||
| 157 | 0, /* FORCE */ | ||
| 188 | }; | 158 | }; |
| 189 | 159 | ||
| 190 | /* | 160 | /* |
| 191 | * Always modified under lru lock. Then, not necessary to preempt_disable() | 161 | * Always modified under lru lock. Then, not necessary to preempt_disable() |
| 192 | */ | 162 | */ |
| 193 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, | 163 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
| 194 | bool charge) | 164 | struct page_cgroup *pc, |
| 165 | bool charge) | ||
| 195 | { | 166 | { |
| 196 | int val = (charge)? 1 : -1; | 167 | int val = (charge)? 1 : -1; |
| 197 | struct mem_cgroup_stat *stat = &mem->stat; | 168 | struct mem_cgroup_stat *stat = &mem->stat; |
| 169 | struct mem_cgroup_stat_cpu *cpustat; | ||
| 198 | 170 | ||
| 199 | VM_BUG_ON(!irqs_disabled()); | 171 | VM_BUG_ON(!irqs_disabled()); |
| 200 | if (flags & PAGE_CGROUP_FLAG_CACHE) | 172 | |
| 201 | __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val); | 173 | cpustat = &stat->cpustat[smp_processor_id()]; |
| 174 | if (PageCgroupCache(pc)) | ||
| 175 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); | ||
| 202 | else | 176 | else |
| 203 | __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); | 177 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); |
| 204 | 178 | ||
| 205 | if (charge) | 179 | if (charge) |
| 206 | __mem_cgroup_stat_add_safe(stat, | 180 | __mem_cgroup_stat_add_safe(cpustat, |
| 207 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); | 181 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); |
| 208 | else | 182 | else |
| 209 | __mem_cgroup_stat_add_safe(stat, | 183 | __mem_cgroup_stat_add_safe(cpustat, |
| 210 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 184 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); |
| 211 | } | 185 | } |
| 212 | 186 | ||
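Annotation: besides adding MEM_CGROUP_CHARGE_TYPE_SHMEM, the hunk above introduces a pcg_default_flags[] table so every charge type starts from a fixed set of page_cgroup flag bits, and mem_cgroup_charge_statistics() now indexes a per-CPU cpustat slot instead of the shared counters. Below is a compilable sketch of the default-flags-per-type idea; only the PCGF_* spellings come from the diff, while the bit positions and the shortened enum names are placeholders.

	/* Sketch of the "default flags per charge type" table the hunk
	 * introduces.  Bit positions and type names are illustrative. */
	#include <stdio.h>

	enum charge_type { CHARGE_CACHE, CHARGE_MAPPED, CHARGE_SHMEM,
			   CHARGE_FORCE, NR_CHARGE_TYPE };

	#define PCGF_CACHE  (1UL << 0)
	#define PCGF_USED   (1UL << 1)
	#define PCGF_ACTIVE (1UL << 2)
	#define PCGF_LOCK   (1UL << 3)
	#define PCGF_FILE   (1UL << 4)

	static const unsigned long pcg_default_flags[NR_CHARGE_TYPE] = {
		PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK,		/* file cache */
		PCGF_ACTIVE | PCGF_USED | PCGF_LOCK,			/* anon */
		PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK,	/* shmem */
		0,							/* force */
	};

	int main(void)
	{
		for (int t = 0; t < NR_CHARGE_TYPE; t++)
			printf("type %d -> flags 0x%02lx\n", t, pcg_default_flags[t]);
		return 0;
	}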
| @@ -227,7 +201,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc) | |||
| 227 | } | 201 | } |
| 228 | 202 | ||
| 229 | static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, | 203 | static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, |
| 230 | enum mem_cgroup_zstat_index idx) | 204 | enum lru_list idx) |
| 231 | { | 205 | { |
| 232 | int nid, zid; | 206 | int nid, zid; |
| 233 | struct mem_cgroup_per_zone *mz; | 207 | struct mem_cgroup_per_zone *mz; |
| @@ -262,85 +236,77 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
| 262 | struct mem_cgroup, css); | 236 | struct mem_cgroup, css); |
| 263 | } | 237 | } |
| 264 | 238 | ||
| 265 | static inline int page_cgroup_locked(struct page *page) | ||
| 266 | { | ||
| 267 | return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
| 268 | } | ||
| 269 | |||
| 270 | static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) | ||
| 271 | { | ||
| 272 | VM_BUG_ON(!page_cgroup_locked(page)); | ||
| 273 | page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK); | ||
| 274 | } | ||
| 275 | |||
| 276 | struct page_cgroup *page_get_page_cgroup(struct page *page) | ||
| 277 | { | ||
| 278 | return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK); | ||
| 279 | } | ||
| 280 | |||
| 281 | static void lock_page_cgroup(struct page *page) | ||
| 282 | { | ||
| 283 | bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
| 284 | } | ||
| 285 | |||
| 286 | static int try_lock_page_cgroup(struct page *page) | ||
| 287 | { | ||
| 288 | return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
| 289 | } | ||
| 290 | |||
| 291 | static void unlock_page_cgroup(struct page *page) | ||
| 292 | { | ||
| 293 | bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
| 294 | } | ||
| 295 | |||
| 296 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, | 239 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, |
| 297 | struct page_cgroup *pc) | 240 | struct page_cgroup *pc) |
| 298 | { | 241 | { |
| 299 | int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | 242 | int lru = LRU_BASE; |
| 243 | |||
| 244 | if (PageCgroupUnevictable(pc)) | ||
| 245 | lru = LRU_UNEVICTABLE; | ||
| 246 | else { | ||
| 247 | if (PageCgroupActive(pc)) | ||
| 248 | lru += LRU_ACTIVE; | ||
| 249 | if (PageCgroupFile(pc)) | ||
| 250 | lru += LRU_FILE; | ||
| 251 | } | ||
| 300 | 252 | ||
| 301 | if (from) | 253 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; |
| 302 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; | ||
| 303 | else | ||
| 304 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; | ||
| 305 | 254 | ||
| 306 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); | 255 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); |
| 307 | list_del(&pc->lru); | 256 | list_del(&pc->lru); |
| 308 | } | 257 | } |
| 309 | 258 | ||
| 310 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, | 259 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, |
| 311 | struct page_cgroup *pc) | 260 | struct page_cgroup *pc) |
| 312 | { | 261 | { |
| 313 | int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | 262 | int lru = LRU_BASE; |
| 314 | 263 | ||
| 315 | if (!to) { | 264 | if (PageCgroupUnevictable(pc)) |
| 316 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; | 265 | lru = LRU_UNEVICTABLE; |
| 317 | list_add(&pc->lru, &mz->inactive_list); | 266 | else { |
| 318 | } else { | 267 | if (PageCgroupActive(pc)) |
| 319 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; | 268 | lru += LRU_ACTIVE; |
| 320 | list_add(&pc->lru, &mz->active_list); | 269 | if (PageCgroupFile(pc)) |
| 270 | lru += LRU_FILE; | ||
| 321 | } | 271 | } |
| 322 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); | 272 | |
| 273 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | ||
| 274 | list_add(&pc->lru, &mz->lists[lru]); | ||
| 275 | |||
| 276 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); | ||
| 323 | } | 277 | } |
| 324 | 278 | ||
| 325 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) | 279 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) |
| 326 | { | 280 | { |
| 327 | int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | ||
| 328 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | 281 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); |
| 282 | int active = PageCgroupActive(pc); | ||
| 283 | int file = PageCgroupFile(pc); | ||
| 284 | int unevictable = PageCgroupUnevictable(pc); | ||
| 285 | enum lru_list from = unevictable ? LRU_UNEVICTABLE : | ||
| 286 | (LRU_FILE * !!file + !!active); | ||
| 329 | 287 | ||
| 330 | if (from) | 288 | if (lru == from) |
| 331 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; | 289 | return; |
| 332 | else | ||
| 333 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; | ||
| 334 | 290 | ||
| 335 | if (active) { | 291 | MEM_CGROUP_ZSTAT(mz, from) -= 1; |
| 336 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; | 292 | /* |
| 337 | pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; | 293 | * However this is done under mz->lru_lock, another flags, which |
| 338 | list_move(&pc->lru, &mz->active_list); | 294 | * are not related to LRU, will be modified from out-of-lock. |
| 295 | * We have to use atomic set/clear flags. | ||
| 296 | */ | ||
| 297 | if (is_unevictable_lru(lru)) { | ||
| 298 | ClearPageCgroupActive(pc); | ||
| 299 | SetPageCgroupUnevictable(pc); | ||
| 339 | } else { | 300 | } else { |
| 340 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; | 301 | if (is_active_lru(lru)) |
| 341 | pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; | 302 | SetPageCgroupActive(pc); |
| 342 | list_move(&pc->lru, &mz->inactive_list); | 303 | else |
| 304 | ClearPageCgroupActive(pc); | ||
| 305 | ClearPageCgroupUnevictable(pc); | ||
| 343 | } | 306 | } |
| 307 | |||
| 308 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | ||
| 309 | list_move(&pc->lru, &mz->lists[lru]); | ||
| 344 | } | 310 | } |
| 345 | 311 | ||
| 346 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | 312 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) |
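Annotation: __mem_cgroup_move_lists() now receives the target enum lru_list, derives the source list from the page_cgroup flags, returns early when source and target match, and uses atomic set/clear helpers because flag bits unrelated to the LRU may be touched outside mz->lru_lock. A hedged userspace model of that flow follows; the flag bits, types and counters here are invented for illustration.

	/* Minimal model of the reworked move_lists flow: derive the current
	 * list from per-page flags, bail out early when nothing changes, then
	 * fix the counters and flags.  Everything here is illustrative. */
	#include <stdatomic.h>
	#include <stdio.h>

	enum lru { INACTIVE, ACTIVE, UNEVICTABLE, NR_LISTS };

	#define F_ACTIVE      (1u << 0)
	#define F_UNEVICTABLE (1u << 1)

	struct pc { atomic_uint flags; };

	static enum lru current_lru(struct pc *pc)
	{
		unsigned int f = atomic_load(&pc->flags);

		if (f & F_UNEVICTABLE)
			return UNEVICTABLE;
		return (f & F_ACTIVE) ? ACTIVE : INACTIVE;
	}

	static void move_lists(struct pc *pc, unsigned long count[NR_LISTS], enum lru to)
	{
		enum lru from = current_lru(pc);

		if (from == to)			/* same early exit as the patch */
			return;

		count[from]--;
		if (to == UNEVICTABLE) {
			atomic_fetch_and(&pc->flags, ~F_ACTIVE);
			atomic_fetch_or(&pc->flags, F_UNEVICTABLE);
		} else {
			if (to == ACTIVE)
				atomic_fetch_or(&pc->flags, F_ACTIVE);
			else
				atomic_fetch_and(&pc->flags, ~F_ACTIVE);
			atomic_fetch_and(&pc->flags, ~F_UNEVICTABLE);
		}
		count[to]++;
	}

	int main(void)
	{
		unsigned long count[NR_LISTS] = { 1, 0, 0 };
		struct pc pc;

		atomic_init(&pc.flags, 0);
		move_lists(&pc, count, UNEVICTABLE);
		printf("inactive=%lu active=%lu unevictable=%lu\n",
		       count[INACTIVE], count[ACTIVE], count[UNEVICTABLE]);
		return 0;
	}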
| @@ -356,7 +322,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
| 356 | /* | 322 | /* |
| 357 | * This routine assumes that the appropriate zone's lru lock is already held | 323 | * This routine assumes that the appropriate zone's lru lock is already held |
| 358 | */ | 324 | */ |
| 359 | void mem_cgroup_move_lists(struct page *page, bool active) | 325 | void mem_cgroup_move_lists(struct page *page, enum lru_list lru) |
| 360 | { | 326 | { |
| 361 | struct page_cgroup *pc; | 327 | struct page_cgroup *pc; |
| 362 | struct mem_cgroup_per_zone *mz; | 328 | struct mem_cgroup_per_zone *mz; |
| @@ -372,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, bool active) | |||
| 372 | * safely get to page_cgroup without it, so just try_lock it: | 338 | * safely get to page_cgroup without it, so just try_lock it: |
| 373 | * mem_cgroup_isolate_pages allows for page left on wrong list. | 339 | * mem_cgroup_isolate_pages allows for page left on wrong list. |
| 374 | */ | 340 | */ |
| 375 | if (!try_lock_page_cgroup(page)) | 341 | pc = lookup_page_cgroup(page); |
| 342 | if (!trylock_page_cgroup(pc)) | ||
| 376 | return; | 343 | return; |
| 377 | 344 | if (pc && PageCgroupUsed(pc)) { | |
| 378 | pc = page_get_page_cgroup(page); | ||
| 379 | if (pc) { | ||
| 380 | mz = page_cgroup_zoneinfo(pc); | 345 | mz = page_cgroup_zoneinfo(pc); |
| 381 | spin_lock_irqsave(&mz->lru_lock, flags); | 346 | spin_lock_irqsave(&mz->lru_lock, flags); |
| 382 | __mem_cgroup_move_lists(pc, active); | 347 | __mem_cgroup_move_lists(pc, lru); |
| 383 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 348 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
| 384 | } | 349 | } |
| 385 | unlock_page_cgroup(page); | 350 | unlock_page_cgroup(pc); |
| 386 | } | 351 | } |
| 387 | 352 | ||
| 388 | /* | 353 | /* |
| @@ -403,21 +368,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) | |||
| 403 | } | 368 | } |
| 404 | 369 | ||
| 405 | /* | 370 | /* |
| 406 | * This function is called from vmscan.c. In page reclaiming loop. balance | ||
| 407 | * between active and inactive list is calculated. For memory controller | ||
| 408 | * page reclaiming, we should use using mem_cgroup's imbalance rather than | ||
| 409 | * zone's global lru imbalance. | ||
| 410 | */ | ||
| 411 | long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) | ||
| 412 | { | ||
| 413 | unsigned long active, inactive; | ||
| 414 | /* active and inactive are the number of pages. 'long' is ok.*/ | ||
| 415 | active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE); | ||
| 416 | inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE); | ||
| 417 | return (long) (active / (inactive + 1)); | ||
| 418 | } | ||
| 419 | |||
| 420 | /* | ||
| 421 | * prev_priority control...this will be used in memory reclaim path. | 371 | * prev_priority control...this will be used in memory reclaim path. |
| 422 | */ | 372 | */ |
| 423 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) | 373 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) |
| @@ -444,28 +394,17 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) | |||
| 444 | * (see include/linux/mmzone.h) | 394 | * (see include/linux/mmzone.h) |
| 445 | */ | 395 | */ |
| 446 | 396 | ||
| 447 | long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, | 397 | long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, |
| 448 | struct zone *zone, int priority) | 398 | int priority, enum lru_list lru) |
| 449 | { | 399 | { |
| 450 | long nr_active; | 400 | long nr_pages; |
| 451 | int nid = zone->zone_pgdat->node_id; | 401 | int nid = zone->zone_pgdat->node_id; |
| 452 | int zid = zone_idx(zone); | 402 | int zid = zone_idx(zone); |
| 453 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); | 403 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); |
| 454 | 404 | ||
| 455 | nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE); | 405 | nr_pages = MEM_CGROUP_ZSTAT(mz, lru); |
| 456 | return (nr_active >> priority); | ||
| 457 | } | ||
| 458 | 406 | ||
| 459 | long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, | 407 | return (nr_pages >> priority); |
| 460 | struct zone *zone, int priority) | ||
| 461 | { | ||
| 462 | long nr_inactive; | ||
| 463 | int nid = zone->zone_pgdat->node_id; | ||
| 464 | int zid = zone_idx(zone); | ||
| 465 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
| 466 | |||
| 467 | nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); | ||
| 468 | return (nr_inactive >> priority); | ||
| 469 | } | 408 | } |
| 470 | 409 | ||
| 471 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | 410 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, |
| @@ -473,7 +412,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
| 473 | unsigned long *scanned, int order, | 412 | unsigned long *scanned, int order, |
| 474 | int mode, struct zone *z, | 413 | int mode, struct zone *z, |
| 475 | struct mem_cgroup *mem_cont, | 414 | struct mem_cgroup *mem_cont, |
| 476 | int active) | 415 | int active, int file) |
| 477 | { | 416 | { |
| 478 | unsigned long nr_taken = 0; | 417 | unsigned long nr_taken = 0; |
| 479 | struct page *page; | 418 | struct page *page; |
| @@ -484,38 +423,38 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
| 484 | int nid = z->zone_pgdat->node_id; | 423 | int nid = z->zone_pgdat->node_id; |
| 485 | int zid = zone_idx(z); | 424 | int zid = zone_idx(z); |
| 486 | struct mem_cgroup_per_zone *mz; | 425 | struct mem_cgroup_per_zone *mz; |
| 426 | int lru = LRU_FILE * !!file + !!active; | ||
| 487 | 427 | ||
| 488 | BUG_ON(!mem_cont); | 428 | BUG_ON(!mem_cont); |
| 489 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | 429 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); |
| 490 | if (active) | 430 | src = &mz->lists[lru]; |
| 491 | src = &mz->active_list; | ||
| 492 | else | ||
| 493 | src = &mz->inactive_list; | ||
| 494 | |||
| 495 | 431 | ||
| 496 | spin_lock(&mz->lru_lock); | 432 | spin_lock(&mz->lru_lock); |
| 497 | scan = 0; | 433 | scan = 0; |
| 498 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { | 434 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { |
| 499 | if (scan >= nr_to_scan) | 435 | if (scan >= nr_to_scan) |
| 500 | break; | 436 | break; |
| 437 | if (unlikely(!PageCgroupUsed(pc))) | ||
| 438 | continue; | ||
| 501 | page = pc->page; | 439 | page = pc->page; |
| 502 | 440 | ||
| 503 | if (unlikely(!PageLRU(page))) | 441 | if (unlikely(!PageLRU(page))) |
| 504 | continue; | 442 | continue; |
| 505 | 443 | ||
| 506 | if (PageActive(page) && !active) { | 444 | /* |
| 507 | __mem_cgroup_move_lists(pc, true); | 445 | * TODO: play better with lumpy reclaim, grabbing anything. |
| 508 | continue; | 446 | */ |
| 509 | } | 447 | if (PageUnevictable(page) || |
| 510 | if (!PageActive(page) && active) { | 448 | (PageActive(page) && !active) || |
| 511 | __mem_cgroup_move_lists(pc, false); | 449 | (!PageActive(page) && active)) { |
| 450 | __mem_cgroup_move_lists(pc, page_lru(page)); | ||
| 512 | continue; | 451 | continue; |
| 513 | } | 452 | } |
| 514 | 453 | ||
| 515 | scan++; | 454 | scan++; |
| 516 | list_move(&pc->lru, &pc_list); | 455 | list_move(&pc->lru, &pc_list); |
| 517 | 456 | ||
| 518 | if (__isolate_lru_page(page, mode) == 0) { | 457 | if (__isolate_lru_page(page, mode, file) == 0) { |
| 519 | list_move(&page->lru, dst); | 458 | list_move(&page->lru, dst); |
| 520 | nr_taken++; | 459 | nr_taken++; |
| 521 | } | 460 | } |
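Annotation: mem_cgroup_isolate_pages() now selects its source list from the (active, file) pair, skips page_cgroups that are no longer in use, and re-files pages found on the wrong list (including unevictable ones) instead of isolating them. The sketch below models only the skip/re-file/take decisions of that scan loop; the data structures are stand-ins, not kernel types.

	/* Rough model of the reworked isolation loop: walk one LRU list from
	 * the tail, skip stale entries, leave wrongly-filed pages behind, and
	 * stop after nr_to_scan candidates. */
	#include <stdbool.h>
	#include <stdio.h>

	struct fake_page { bool used, on_lru, active, unevictable; };

	static unsigned long isolate(struct fake_page *list, int n,
				     unsigned long nr_to_scan, bool want_active,
				     unsigned long *scanned)
	{
		unsigned long scan = 0, taken = 0;

		for (int i = n - 1; i >= 0 && scan < nr_to_scan; i--) {
			struct fake_page *p = &list[i];

			if (!p->used)			/* PageCgroupUsed() check */
				continue;
			if (!p->on_lru)			/* PageLRU() check */
				continue;
			if (p->unevictable || p->active != want_active)
				continue;		/* would be re-filed, not taken */
			scan++;
			taken++;			/* __isolate_lru_page() succeeded */
		}
		*scanned = scan;
		return taken;
	}

	int main(void)
	{
		struct fake_page lru[4] = {
			{ true,  true,  false, false },
			{ true,  true,  true,  false },	/* wrong list: skipped */
			{ true,  true,  false, true  },	/* unevictable: skipped */
			{ false, false, false, false },	/* stale: skipped */
		};
		unsigned long scanned;
		unsigned long taken = isolate(lru, 4, 32, false, &scanned);

		printf("scanned=%lu taken=%lu\n", scanned, taken);
		return 0;
	}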
| @@ -540,26 +479,27 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
| 540 | { | 479 | { |
| 541 | struct mem_cgroup *mem; | 480 | struct mem_cgroup *mem; |
| 542 | struct page_cgroup *pc; | 481 | struct page_cgroup *pc; |
| 543 | unsigned long flags; | ||
| 544 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 482 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
| 545 | struct mem_cgroup_per_zone *mz; | 483 | struct mem_cgroup_per_zone *mz; |
| 484 | unsigned long flags; | ||
| 546 | 485 | ||
| 547 | pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); | 486 | pc = lookup_page_cgroup(page); |
| 548 | if (unlikely(pc == NULL)) | 487 | /* can happen at boot */ |
| 549 | goto err; | 488 | if (unlikely(!pc)) |
| 550 | 489 | return 0; | |
| 490 | prefetchw(pc); | ||
| 551 | /* | 491 | /* |
| 552 | * We always charge the cgroup the mm_struct belongs to. | 492 | * We always charge the cgroup the mm_struct belongs to. |
| 553 | * The mm_struct's mem_cgroup changes on task migration if the | 493 | * The mm_struct's mem_cgroup changes on task migration if the |
| 554 | * thread group leader migrates. It's possible that mm is not | 494 | * thread group leader migrates. It's possible that mm is not |
| 555 | * set, if so charge the init_mm (happens for pagecache usage). | 495 | * set, if so charge the init_mm (happens for pagecache usage). |
| 556 | */ | 496 | */ |
| 497 | |||
| 557 | if (likely(!memcg)) { | 498 | if (likely(!memcg)) { |
| 558 | rcu_read_lock(); | 499 | rcu_read_lock(); |
| 559 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 500 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
| 560 | if (unlikely(!mem)) { | 501 | if (unlikely(!mem)) { |
| 561 | rcu_read_unlock(); | 502 | rcu_read_unlock(); |
| 562 | kmem_cache_free(page_cgroup_cache, pc); | ||
| 563 | return 0; | 503 | return 0; |
| 564 | } | 504 | } |
| 565 | /* | 505 | /* |
| @@ -572,7 +512,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
| 572 | css_get(&memcg->css); | 512 | css_get(&memcg->css); |
| 573 | } | 513 | } |
| 574 | 514 | ||
| 575 | while (res_counter_charge(&mem->res, PAGE_SIZE)) { | 515 | while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { |
| 576 | if (!(gfp_mask & __GFP_WAIT)) | 516 | if (!(gfp_mask & __GFP_WAIT)) |
| 577 | goto out; | 517 | goto out; |
| 578 | 518 | ||
| @@ -595,39 +535,33 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
| 595 | } | 535 | } |
| 596 | } | 536 | } |
| 597 | 537 | ||
| 598 | pc->mem_cgroup = mem; | ||
| 599 | pc->page = page; | ||
| 600 | /* | ||
| 601 | * If a page is accounted as a page cache, insert to inactive list. | ||
| 602 | * If anon, insert to active list. | ||
| 603 | */ | ||
| 604 | if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) | ||
| 605 | pc->flags = PAGE_CGROUP_FLAG_CACHE; | ||
| 606 | else | ||
| 607 | pc->flags = PAGE_CGROUP_FLAG_ACTIVE; | ||
| 608 | 538 | ||
| 609 | lock_page_cgroup(page); | 539 | lock_page_cgroup(pc); |
| 610 | if (unlikely(page_get_page_cgroup(page))) { | 540 | if (unlikely(PageCgroupUsed(pc))) { |
| 611 | unlock_page_cgroup(page); | 541 | unlock_page_cgroup(pc); |
| 612 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 542 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
| 613 | css_put(&mem->css); | 543 | css_put(&mem->css); |
| 614 | kmem_cache_free(page_cgroup_cache, pc); | 544 | |
| 615 | goto done; | 545 | goto done; |
| 616 | } | 546 | } |
| 617 | page_assign_page_cgroup(page, pc); | 547 | pc->mem_cgroup = mem; |
| 548 | /* | ||
| 549 | * If a page is accounted as a page cache, insert to inactive list. | ||
| 550 | * If anon, insert to active list. | ||
| 551 | */ | ||
| 552 | pc->flags = pcg_default_flags[ctype]; | ||
| 618 | 553 | ||
| 619 | mz = page_cgroup_zoneinfo(pc); | 554 | mz = page_cgroup_zoneinfo(pc); |
| 555 | |||
| 620 | spin_lock_irqsave(&mz->lru_lock, flags); | 556 | spin_lock_irqsave(&mz->lru_lock, flags); |
| 621 | __mem_cgroup_add_list(mz, pc); | 557 | __mem_cgroup_add_list(mz, pc); |
| 622 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 558 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
| 559 | unlock_page_cgroup(pc); | ||
| 623 | 560 | ||
| 624 | unlock_page_cgroup(page); | ||
| 625 | done: | 561 | done: |
| 626 | return 0; | 562 | return 0; |
| 627 | out: | 563 | out: |
| 628 | css_put(&mem->css); | 564 | css_put(&mem->css); |
| 629 | kmem_cache_free(page_cgroup_cache, pc); | ||
| 630 | err: | ||
| 631 | return -ENOMEM; | 565 | return -ENOMEM; |
| 632 | } | 566 | } |
| 633 | 567 | ||
| @@ -635,7 +569,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) | |||
| 635 | { | 569 | { |
| 636 | if (mem_cgroup_subsys.disabled) | 570 | if (mem_cgroup_subsys.disabled) |
| 637 | return 0; | 571 | return 0; |
| 638 | 572 | if (PageCompound(page)) | |
| 573 | return 0; | ||
| 639 | /* | 574 | /* |
| 640 | * If already mapped, we don't have to account. | 575 | * If already mapped, we don't have to account. |
| 641 | * If page cache, page->mapping has address_space. | 576 | * If page cache, page->mapping has address_space. |
| @@ -656,7 +591,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
| 656 | { | 591 | { |
| 657 | if (mem_cgroup_subsys.disabled) | 592 | if (mem_cgroup_subsys.disabled) |
| 658 | return 0; | 593 | return 0; |
| 659 | 594 | if (PageCompound(page)) | |
| 595 | return 0; | ||
| 660 | /* | 596 | /* |
| 661 | * Corner case handling. This is called from add_to_page_cache() | 597 | * Corner case handling. This is called from add_to_page_cache() |
| 662 | * in usual. But some FS (shmem) precharges this page before calling it | 598 | * in usual. But some FS (shmem) precharges this page before calling it |
| @@ -669,22 +605,27 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
| 669 | if (!(gfp_mask & __GFP_WAIT)) { | 605 | if (!(gfp_mask & __GFP_WAIT)) { |
| 670 | struct page_cgroup *pc; | 606 | struct page_cgroup *pc; |
| 671 | 607 | ||
| 672 | lock_page_cgroup(page); | 608 | |
| 673 | pc = page_get_page_cgroup(page); | 609 | pc = lookup_page_cgroup(page); |
| 674 | if (pc) { | 610 | if (!pc) |
| 675 | VM_BUG_ON(pc->page != page); | 611 | return 0; |
| 676 | VM_BUG_ON(!pc->mem_cgroup); | 612 | lock_page_cgroup(pc); |
| 677 | unlock_page_cgroup(page); | 613 | if (PageCgroupUsed(pc)) { |
| 614 | unlock_page_cgroup(pc); | ||
| 678 | return 0; | 615 | return 0; |
| 679 | } | 616 | } |
| 680 | unlock_page_cgroup(page); | 617 | unlock_page_cgroup(pc); |
| 681 | } | 618 | } |
| 682 | 619 | ||
| 683 | if (unlikely(!mm)) | 620 | if (unlikely(!mm)) |
| 684 | mm = &init_mm; | 621 | mm = &init_mm; |
| 685 | 622 | ||
| 686 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 623 | if (page_is_file_cache(page)) |
| 624 | return mem_cgroup_charge_common(page, mm, gfp_mask, | ||
| 687 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); | 625 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); |
| 626 | else | ||
| 627 | return mem_cgroup_charge_common(page, mm, gfp_mask, | ||
| 628 | MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); | ||
| 688 | } | 629 | } |
| 689 | 630 | ||
| 690 | /* | 631 | /* |
| @@ -704,44 +645,46 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
| 704 | /* | 645 | /* |
| 705 | * Check if our page_cgroup is valid | 646 | * Check if our page_cgroup is valid |
| 706 | */ | 647 | */ |
| 707 | lock_page_cgroup(page); | 648 | pc = lookup_page_cgroup(page); |
| 708 | pc = page_get_page_cgroup(page); | 649 | if (unlikely(!pc || !PageCgroupUsed(pc))) |
| 709 | if (unlikely(!pc)) | 650 | return; |
| 710 | goto unlock; | ||
| 711 | |||
| 712 | VM_BUG_ON(pc->page != page); | ||
| 713 | 651 | ||
| 714 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 652 | lock_page_cgroup(pc); |
| 715 | && ((pc->flags & PAGE_CGROUP_FLAG_CACHE) | 653 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) |
| 716 | || page_mapped(page))) | 654 | || !PageCgroupUsed(pc)) { |
| 717 | goto unlock; | 655 | /* This happens at race in zap_pte_range() and do_swap_page()*/ |
| 656 | unlock_page_cgroup(pc); | ||
| 657 | return; | ||
| 658 | } | ||
| 659 | ClearPageCgroupUsed(pc); | ||
| 660 | mem = pc->mem_cgroup; | ||
| 718 | 661 | ||
| 719 | mz = page_cgroup_zoneinfo(pc); | 662 | mz = page_cgroup_zoneinfo(pc); |
| 720 | spin_lock_irqsave(&mz->lru_lock, flags); | 663 | spin_lock_irqsave(&mz->lru_lock, flags); |
| 721 | __mem_cgroup_remove_list(mz, pc); | 664 | __mem_cgroup_remove_list(mz, pc); |
| 722 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 665 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
| 666 | unlock_page_cgroup(pc); | ||
| 723 | 667 | ||
| 724 | page_assign_page_cgroup(page, NULL); | ||
| 725 | unlock_page_cgroup(page); | ||
| 726 | |||
| 727 | mem = pc->mem_cgroup; | ||
| 728 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 668 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
| 729 | css_put(&mem->css); | 669 | css_put(&mem->css); |
| 730 | 670 | ||
| 731 | kmem_cache_free(page_cgroup_cache, pc); | ||
| 732 | return; | 671 | return; |
| 733 | unlock: | ||
| 734 | unlock_page_cgroup(page); | ||
| 735 | } | 672 | } |
| 736 | 673 | ||
| 737 | void mem_cgroup_uncharge_page(struct page *page) | 674 | void mem_cgroup_uncharge_page(struct page *page) |
| 738 | { | 675 | { |
| 676 | /* early check. */ | ||
| 677 | if (page_mapped(page)) | ||
| 678 | return; | ||
| 679 | if (page->mapping && !PageAnon(page)) | ||
| 680 | return; | ||
| 739 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 681 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); |
| 740 | } | 682 | } |
| 741 | 683 | ||
| 742 | void mem_cgroup_uncharge_cache_page(struct page *page) | 684 | void mem_cgroup_uncharge_cache_page(struct page *page) |
| 743 | { | 685 | { |
| 744 | VM_BUG_ON(page_mapped(page)); | 686 | VM_BUG_ON(page_mapped(page)); |
| 687 | VM_BUG_ON(page->mapping); | ||
| 745 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 688 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); |
| 746 | } | 689 | } |
| 747 | 690 | ||
| @@ -758,15 +701,19 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) | |||
| 758 | if (mem_cgroup_subsys.disabled) | 701 | if (mem_cgroup_subsys.disabled) |
| 759 | return 0; | 702 | return 0; |
| 760 | 703 | ||
| 761 | lock_page_cgroup(page); | 704 | pc = lookup_page_cgroup(page); |
| 762 | pc = page_get_page_cgroup(page); | 705 | lock_page_cgroup(pc); |
| 763 | if (pc) { | 706 | if (PageCgroupUsed(pc)) { |
| 764 | mem = pc->mem_cgroup; | 707 | mem = pc->mem_cgroup; |
| 765 | css_get(&mem->css); | 708 | css_get(&mem->css); |
| 766 | if (pc->flags & PAGE_CGROUP_FLAG_CACHE) | 709 | if (PageCgroupCache(pc)) { |
| 767 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | 710 | if (page_is_file_cache(page)) |
| 711 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
| 712 | else | ||
| 713 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
| 714 | } | ||
| 768 | } | 715 | } |
| 769 | unlock_page_cgroup(page); | 716 | unlock_page_cgroup(pc); |
| 770 | if (mem) { | 717 | if (mem) { |
| 771 | ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, | 718 | ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, |
| 772 | ctype, mem); | 719 | ctype, mem); |
| @@ -791,7 +738,7 @@ void mem_cgroup_end_migration(struct page *newpage) | |||
| 791 | */ | 738 | */ |
| 792 | if (!newpage->mapping) | 739 | if (!newpage->mapping) |
| 793 | __mem_cgroup_uncharge_common(newpage, | 740 | __mem_cgroup_uncharge_common(newpage, |
| 794 | MEM_CGROUP_CHARGE_TYPE_FORCE); | 741 | MEM_CGROUP_CHARGE_TYPE_FORCE); |
| 795 | else if (PageAnon(newpage)) | 742 | else if (PageAnon(newpage)) |
| 796 | mem_cgroup_uncharge_page(newpage); | 743 | mem_cgroup_uncharge_page(newpage); |
| 797 | } | 744 | } |
| @@ -863,7 +810,7 @@ int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) | |||
| 863 | #define FORCE_UNCHARGE_BATCH (128) | 810 | #define FORCE_UNCHARGE_BATCH (128) |
| 864 | static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | 811 | static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, |
| 865 | struct mem_cgroup_per_zone *mz, | 812 | struct mem_cgroup_per_zone *mz, |
| 866 | int active) | 813 | enum lru_list lru) |
| 867 | { | 814 | { |
| 868 | struct page_cgroup *pc; | 815 | struct page_cgroup *pc; |
| 869 | struct page *page; | 816 | struct page *page; |
| @@ -871,15 +818,14 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
| 871 | unsigned long flags; | 818 | unsigned long flags; |
| 872 | struct list_head *list; | 819 | struct list_head *list; |
| 873 | 820 | ||
| 874 | if (active) | 821 | list = &mz->lists[lru]; |
| 875 | list = &mz->active_list; | ||
| 876 | else | ||
| 877 | list = &mz->inactive_list; | ||
| 878 | 822 | ||
| 879 | spin_lock_irqsave(&mz->lru_lock, flags); | 823 | spin_lock_irqsave(&mz->lru_lock, flags); |
| 880 | while (!list_empty(list)) { | 824 | while (!list_empty(list)) { |
| 881 | pc = list_entry(list->prev, struct page_cgroup, lru); | 825 | pc = list_entry(list->prev, struct page_cgroup, lru); |
| 882 | page = pc->page; | 826 | page = pc->page; |
| 827 | if (!PageCgroupUsed(pc)) | ||
| 828 | break; | ||
| 883 | get_page(page); | 829 | get_page(page); |
| 884 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 830 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
| 885 | /* | 831 | /* |
| @@ -894,8 +840,10 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
| 894 | count = FORCE_UNCHARGE_BATCH; | 840 | count = FORCE_UNCHARGE_BATCH; |
| 895 | cond_resched(); | 841 | cond_resched(); |
| 896 | } | 842 | } |
| 897 | } else | 843 | } else { |
| 898 | cond_resched(); | 844 | spin_lock_irqsave(&mz->lru_lock, flags); |
| 845 | break; | ||
| 846 | } | ||
| 899 | spin_lock_irqsave(&mz->lru_lock, flags); | 847 | spin_lock_irqsave(&mz->lru_lock, flags); |
| 900 | } | 848 | } |
| 901 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 849 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
| @@ -919,15 +867,17 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem) | |||
| 919 | while (mem->res.usage > 0) { | 867 | while (mem->res.usage > 0) { |
| 920 | if (atomic_read(&mem->css.cgroup->count) > 0) | 868 | if (atomic_read(&mem->css.cgroup->count) > 0) |
| 921 | goto out; | 869 | goto out; |
| 870 | /* This is for making all *used* pages to be on LRU. */ | ||
| 871 | lru_add_drain_all(); | ||
| 922 | for_each_node_state(node, N_POSSIBLE) | 872 | for_each_node_state(node, N_POSSIBLE) |
| 923 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 873 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
| 924 | struct mem_cgroup_per_zone *mz; | 874 | struct mem_cgroup_per_zone *mz; |
| 875 | enum lru_list l; | ||
| 925 | mz = mem_cgroup_zoneinfo(mem, node, zid); | 876 | mz = mem_cgroup_zoneinfo(mem, node, zid); |
| 926 | /* drop all page_cgroup in active_list */ | 877 | for_each_lru(l) |
| 927 | mem_cgroup_force_empty_list(mem, mz, 1); | 878 | mem_cgroup_force_empty_list(mem, mz, l); |
| 928 | /* drop all page_cgroup in inactive_list */ | ||
| 929 | mem_cgroup_force_empty_list(mem, mz, 0); | ||
| 930 | } | 879 | } |
| 880 | cond_resched(); | ||
| 931 | } | 881 | } |
| 932 | ret = 0; | 882 | ret = 0; |
| 933 | out: | 883 | out: |
| @@ -1012,14 +962,27 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
| 1012 | } | 962 | } |
| 1013 | /* showing # of active pages */ | 963 | /* showing # of active pages */ |
| 1014 | { | 964 | { |
| 1015 | unsigned long active, inactive; | 965 | unsigned long active_anon, inactive_anon; |
| 1016 | 966 | unsigned long active_file, inactive_file; | |
| 1017 | inactive = mem_cgroup_get_all_zonestat(mem_cont, | 967 | unsigned long unevictable; |
| 1018 | MEM_CGROUP_ZSTAT_INACTIVE); | 968 | |
| 1019 | active = mem_cgroup_get_all_zonestat(mem_cont, | 969 | inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, |
| 1020 | MEM_CGROUP_ZSTAT_ACTIVE); | 970 | LRU_INACTIVE_ANON); |
| 1021 | cb->fill(cb, "active", (active) * PAGE_SIZE); | 971 | active_anon = mem_cgroup_get_all_zonestat(mem_cont, |
| 1022 | cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); | 972 | LRU_ACTIVE_ANON); |
| 973 | inactive_file = mem_cgroup_get_all_zonestat(mem_cont, | ||
| 974 | LRU_INACTIVE_FILE); | ||
| 975 | active_file = mem_cgroup_get_all_zonestat(mem_cont, | ||
| 976 | LRU_ACTIVE_FILE); | ||
| 977 | unevictable = mem_cgroup_get_all_zonestat(mem_cont, | ||
| 978 | LRU_UNEVICTABLE); | ||
| 979 | |||
| 980 | cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); | ||
| 981 | cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); | ||
| 982 | cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); | ||
| 983 | cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); | ||
| 984 | cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); | ||
| 985 | |||
| 1023 | } | 986 | } |
| 1024 | return 0; | 987 | return 0; |
| 1025 | } | 988 | } |
| @@ -1062,6 +1025,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
| 1062 | { | 1025 | { |
| 1063 | struct mem_cgroup_per_node *pn; | 1026 | struct mem_cgroup_per_node *pn; |
| 1064 | struct mem_cgroup_per_zone *mz; | 1027 | struct mem_cgroup_per_zone *mz; |
| 1028 | enum lru_list l; | ||
| 1065 | int zone, tmp = node; | 1029 | int zone, tmp = node; |
| 1066 | /* | 1030 | /* |
| 1067 | * This routine is called against possible nodes. | 1031 | * This routine is called against possible nodes. |
| @@ -1082,9 +1046,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
| 1082 | 1046 | ||
| 1083 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 1047 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
| 1084 | mz = &pn->zoneinfo[zone]; | 1048 | mz = &pn->zoneinfo[zone]; |
| 1085 | INIT_LIST_HEAD(&mz->active_list); | ||
| 1086 | INIT_LIST_HEAD(&mz->inactive_list); | ||
| 1087 | spin_lock_init(&mz->lru_lock); | 1049 | spin_lock_init(&mz->lru_lock); |
| 1050 | for_each_lru(l) | ||
| 1051 | INIT_LIST_HEAD(&mz->lists[l]); | ||
| 1088 | } | 1052 | } |
| 1089 | return 0; | 1053 | return 0; |
| 1090 | } | 1054 | } |
| @@ -1125,7 +1089,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 1125 | 1089 | ||
| 1126 | if (unlikely((cont->parent) == NULL)) { | 1090 | if (unlikely((cont->parent) == NULL)) { |
| 1127 | mem = &init_mem_cgroup; | 1091 | mem = &init_mem_cgroup; |
| 1128 | page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC); | ||
| 1129 | } else { | 1092 | } else { |
| 1130 | mem = mem_cgroup_alloc(); | 1093 | mem = mem_cgroup_alloc(); |
| 1131 | if (!mem) | 1094 | if (!mem) |
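Annotation: across the mm/memcontrol.c changes above, the per-charge kmem_cache allocation and the bit spinlock hidden in page->page_cgroup are gone; lookup_page_cgroup() returns a descriptor preallocated elsewhere in this patch, and a lock bit inside pc->flags (PCG_LOCK) serializes charge/uncharge against each other. A userspace approximation of that lookup-plus-flag-lock scheme, with invented sizes, bit numbers and a trylock-only locking helper:

	/* Userspace model of the lookup_page_cgroup()/lock_page_cgroup()
	 * scheme that replaces the old page->page_cgroup pointer + bit
	 * spinlock: one statically sized descriptor table indexed by page
	 * frame number, with the lock bit kept in the descriptor's flags. */
	#include <stdatomic.h>
	#include <stdio.h>

	#define NR_FRAMES 16
	#define PCG_LOCK  (1u << 0)
	#define PCG_USED  (1u << 1)

	struct page_cgroup_model { atomic_uint flags; };

	static struct page_cgroup_model table[NR_FRAMES];

	static struct page_cgroup_model *lookup(unsigned long pfn)
	{
		return pfn < NR_FRAMES ? &table[pfn] : NULL;
	}

	static int trylock_pc(struct page_cgroup_model *pc)
	{
		return !(atomic_fetch_or(&pc->flags, PCG_LOCK) & PCG_LOCK);
	}

	static void unlock_pc(struct page_cgroup_model *pc)
	{
		atomic_fetch_and(&pc->flags, ~PCG_LOCK);
	}

	int main(void)
	{
		struct page_cgroup_model *pc = lookup(3);

		if (pc && trylock_pc(pc)) {
			atomic_fetch_or(&pc->flags, PCG_USED);	/* "charge" it */
			unlock_pc(pc);
		}
		printf("pfn 3 flags: 0x%x\n", atomic_load(&lookup(3)->flags));
		return 0;
	}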
diff --git a/mm/memory.c b/mm/memory.c index 1002f473f497..164951c47305 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -1129,12 +1129,17 @@ static inline int use_zero_page(struct vm_area_struct *vma) | |||
| 1129 | return !vma->vm_ops || !vma->vm_ops->fault; | 1129 | return !vma->vm_ops || !vma->vm_ops->fault; |
| 1130 | } | 1130 | } |
| 1131 | 1131 | ||
| 1132 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1132 | |
| 1133 | unsigned long start, int len, int write, int force, | 1133 | |
| 1134 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
| 1135 | unsigned long start, int len, int flags, | ||
| 1134 | struct page **pages, struct vm_area_struct **vmas) | 1136 | struct page **pages, struct vm_area_struct **vmas) |
| 1135 | { | 1137 | { |
| 1136 | int i; | 1138 | int i; |
| 1137 | unsigned int vm_flags; | 1139 | unsigned int vm_flags = 0; |
| 1140 | int write = !!(flags & GUP_FLAGS_WRITE); | ||
| 1141 | int force = !!(flags & GUP_FLAGS_FORCE); | ||
| 1142 | int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); | ||
| 1138 | 1143 | ||
| 1139 | if (len <= 0) | 1144 | if (len <= 0) |
| 1140 | return 0; | 1145 | return 0; |
| @@ -1158,7 +1163,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1158 | pud_t *pud; | 1163 | pud_t *pud; |
| 1159 | pmd_t *pmd; | 1164 | pmd_t *pmd; |
| 1160 | pte_t *pte; | 1165 | pte_t *pte; |
| 1161 | if (write) /* user gate pages are read-only */ | 1166 | |
| 1167 | /* user gate pages are read-only */ | ||
| 1168 | if (!ignore && write) | ||
| 1162 | return i ? : -EFAULT; | 1169 | return i ? : -EFAULT; |
| 1163 | if (pg > TASK_SIZE) | 1170 | if (pg > TASK_SIZE) |
| 1164 | pgd = pgd_offset_k(pg); | 1171 | pgd = pgd_offset_k(pg); |
| @@ -1190,8 +1197,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1190 | continue; | 1197 | continue; |
| 1191 | } | 1198 | } |
| 1192 | 1199 | ||
| 1193 | if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) | 1200 | if (!vma || |
| 1194 | || !(vm_flags & vma->vm_flags)) | 1201 | (vma->vm_flags & (VM_IO | VM_PFNMAP)) || |
| 1202 | (!ignore && !(vm_flags & vma->vm_flags))) | ||
| 1195 | return i ? : -EFAULT; | 1203 | return i ? : -EFAULT; |
| 1196 | 1204 | ||
| 1197 | if (is_vm_hugetlb_page(vma)) { | 1205 | if (is_vm_hugetlb_page(vma)) { |
| @@ -1266,6 +1274,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1266 | } while (len); | 1274 | } while (len); |
| 1267 | return i; | 1275 | return i; |
| 1268 | } | 1276 | } |
| 1277 | |||
| 1278 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
| 1279 | unsigned long start, int len, int write, int force, | ||
| 1280 | struct page **pages, struct vm_area_struct **vmas) | ||
| 1281 | { | ||
| 1282 | int flags = 0; | ||
| 1283 | |||
| 1284 | if (write) | ||
| 1285 | flags |= GUP_FLAGS_WRITE; | ||
| 1286 | if (force) | ||
| 1287 | flags |= GUP_FLAGS_FORCE; | ||
| 1288 | |||
| 1289 | return __get_user_pages(tsk, mm, | ||
| 1290 | start, len, flags, | ||
| 1291 | pages, vmas); | ||
| 1292 | } | ||
| 1293 | |||
| 1269 | EXPORT_SYMBOL(get_user_pages); | 1294 | EXPORT_SYMBOL(get_user_pages); |
| 1270 | 1295 | ||
| 1271 | pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, | 1296 | pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, |
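Annotation: get_user_pages() becomes a thin wrapper that folds its old write/force booleans into a flags word for the new __get_user_pages(), presumably so other callers in this series can pass GUP_FLAGS_IGNORE_VMA_PERMISSIONS. A stripped-down illustration of that wrapper pattern; only the flag names appear in the diff, the values and the helper below are assumptions.

	/* Boolean-arguments-to-flags wrapper, as adopted by the hunk above. */
	#include <stdio.h>

	#define GUP_FLAGS_WRITE                  0x1
	#define GUP_FLAGS_FORCE                  0x2
	#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4

	static int __gup_model(int flags)	/* stands in for __get_user_pages() */
	{
		printf("write=%d force=%d ignore=%d\n",
		       !!(flags & GUP_FLAGS_WRITE),
		       !!(flags & GUP_FLAGS_FORCE),
		       !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS));
		return 0;
	}

	static int gup_model(int write, int force)	/* stands in for get_user_pages() */
	{
		int flags = 0;

		if (write)
			flags |= GUP_FLAGS_WRITE;
		if (force)
			flags |= GUP_FLAGS_FORCE;
		return __gup_model(flags);
	}

	int main(void)
	{
		return gup_model(1, 0);
	}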
| @@ -1296,18 +1321,14 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 1296 | pte_t *pte; | 1321 | pte_t *pte; |
| 1297 | spinlock_t *ptl; | 1322 | spinlock_t *ptl; |
| 1298 | 1323 | ||
| 1299 | retval = mem_cgroup_charge(page, mm, GFP_KERNEL); | ||
| 1300 | if (retval) | ||
| 1301 | goto out; | ||
| 1302 | |||
| 1303 | retval = -EINVAL; | 1324 | retval = -EINVAL; |
| 1304 | if (PageAnon(page)) | 1325 | if (PageAnon(page)) |
| 1305 | goto out_uncharge; | 1326 | goto out; |
| 1306 | retval = -ENOMEM; | 1327 | retval = -ENOMEM; |
| 1307 | flush_dcache_page(page); | 1328 | flush_dcache_page(page); |
| 1308 | pte = get_locked_pte(mm, addr, &ptl); | 1329 | pte = get_locked_pte(mm, addr, &ptl); |
| 1309 | if (!pte) | 1330 | if (!pte) |
| 1310 | goto out_uncharge; | 1331 | goto out; |
| 1311 | retval = -EBUSY; | 1332 | retval = -EBUSY; |
| 1312 | if (!pte_none(*pte)) | 1333 | if (!pte_none(*pte)) |
| 1313 | goto out_unlock; | 1334 | goto out_unlock; |
| @@ -1323,8 +1344,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 1323 | return retval; | 1344 | return retval; |
| 1324 | out_unlock: | 1345 | out_unlock: |
| 1325 | pte_unmap_unlock(pte, ptl); | 1346 | pte_unmap_unlock(pte, ptl); |
| 1326 | out_uncharge: | ||
| 1327 | mem_cgroup_uncharge_page(page); | ||
| 1328 | out: | 1347 | out: |
| 1329 | return retval; | 1348 | return retval; |
| 1330 | } | 1349 | } |
| @@ -1858,6 +1877,15 @@ gotten: | |||
| 1858 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 1877 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
| 1859 | if (!new_page) | 1878 | if (!new_page) |
| 1860 | goto oom; | 1879 | goto oom; |
| 1880 | /* | ||
| 1881 | * Don't let another task, with possibly unlocked vma, | ||
| 1882 | * keep the mlocked page. | ||
| 1883 | */ | ||
| 1884 | if (vma->vm_flags & VM_LOCKED) { | ||
| 1885 | lock_page(old_page); /* for LRU manipulation */ | ||
| 1886 | clear_page_mlock(old_page); | ||
| 1887 | unlock_page(old_page); | ||
| 1888 | } | ||
| 1861 | cow_user_page(new_page, old_page, address, vma); | 1889 | cow_user_page(new_page, old_page, address, vma); |
| 1862 | __SetPageUptodate(new_page); | 1890 | __SetPageUptodate(new_page); |
| 1863 | 1891 | ||
| @@ -1886,11 +1914,13 @@ gotten: | |||
| 1886 | * thread doing COW. | 1914 | * thread doing COW. |
| 1887 | */ | 1915 | */ |
| 1888 | ptep_clear_flush_notify(vma, address, page_table); | 1916 | ptep_clear_flush_notify(vma, address, page_table); |
| 1889 | set_pte_at(mm, address, page_table, entry); | 1917 | SetPageSwapBacked(new_page); |
| 1890 | update_mmu_cache(vma, address, entry); | 1918 | lru_cache_add_active_or_unevictable(new_page, vma); |
| 1891 | lru_cache_add_active(new_page); | ||
| 1892 | page_add_new_anon_rmap(new_page, vma, address); | 1919 | page_add_new_anon_rmap(new_page, vma, address); |
| 1893 | 1920 | ||
| 1921 | //TODO: is this safe? do_anonymous_page() does it this way. | ||
| 1922 | set_pte_at(mm, address, page_table, entry); | ||
| 1923 | update_mmu_cache(vma, address, entry); | ||
| 1894 | if (old_page) { | 1924 | if (old_page) { |
| 1895 | /* | 1925 | /* |
| 1896 | * Only after switching the pte to the new page may | 1926 | * Only after switching the pte to the new page may |
| @@ -2288,16 +2318,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2288 | count_vm_event(PGMAJFAULT); | 2318 | count_vm_event(PGMAJFAULT); |
| 2289 | } | 2319 | } |
| 2290 | 2320 | ||
| 2321 | mark_page_accessed(page); | ||
| 2322 | |||
| 2323 | lock_page(page); | ||
| 2324 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | ||
| 2325 | |||
| 2291 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { | 2326 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { |
| 2292 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | ||
| 2293 | ret = VM_FAULT_OOM; | 2327 | ret = VM_FAULT_OOM; |
| 2328 | unlock_page(page); | ||
| 2294 | goto out; | 2329 | goto out; |
| 2295 | } | 2330 | } |
| 2296 | 2331 | ||
| 2297 | mark_page_accessed(page); | ||
| 2298 | lock_page(page); | ||
| 2299 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | ||
| 2300 | |||
| 2301 | /* | 2332 | /* |
| 2302 | * Back out if somebody else already faulted in this pte. | 2333 | * Back out if somebody else already faulted in this pte. |
| 2303 | */ | 2334 | */ |
| @@ -2324,7 +2355,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2324 | page_add_anon_rmap(page, vma, address); | 2355 | page_add_anon_rmap(page, vma, address); |
| 2325 | 2356 | ||
| 2326 | swap_free(entry); | 2357 | swap_free(entry); |
| 2327 | if (vm_swap_full()) | 2358 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) |
| 2328 | remove_exclusive_swap_page(page); | 2359 | remove_exclusive_swap_page(page); |
| 2329 | unlock_page(page); | 2360 | unlock_page(page); |
| 2330 | 2361 | ||
| @@ -2382,7 +2413,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2382 | if (!pte_none(*page_table)) | 2413 | if (!pte_none(*page_table)) |
| 2383 | goto release; | 2414 | goto release; |
| 2384 | inc_mm_counter(mm, anon_rss); | 2415 | inc_mm_counter(mm, anon_rss); |
| 2385 | lru_cache_add_active(page); | 2416 | SetPageSwapBacked(page); |
| 2417 | lru_cache_add_active_or_unevictable(page, vma); | ||
| 2386 | page_add_new_anon_rmap(page, vma, address); | 2418 | page_add_new_anon_rmap(page, vma, address); |
| 2387 | set_pte_at(mm, address, page_table, entry); | 2419 | set_pte_at(mm, address, page_table, entry); |
| 2388 | 2420 | ||
| @@ -2423,6 +2455,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2423 | struct page *page; | 2455 | struct page *page; |
| 2424 | pte_t entry; | 2456 | pte_t entry; |
| 2425 | int anon = 0; | 2457 | int anon = 0; |
| 2458 | int charged = 0; | ||
| 2426 | struct page *dirty_page = NULL; | 2459 | struct page *dirty_page = NULL; |
| 2427 | struct vm_fault vmf; | 2460 | struct vm_fault vmf; |
| 2428 | int ret; | 2461 | int ret; |
| @@ -2463,6 +2496,18 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2463 | ret = VM_FAULT_OOM; | 2496 | ret = VM_FAULT_OOM; |
| 2464 | goto out; | 2497 | goto out; |
| 2465 | } | 2498 | } |
| 2499 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { | ||
| 2500 | ret = VM_FAULT_OOM; | ||
| 2501 | page_cache_release(page); | ||
| 2502 | goto out; | ||
| 2503 | } | ||
| 2504 | charged = 1; | ||
| 2505 | /* | ||
| 2506 | * Don't let another task, with possibly unlocked vma, | ||
| 2507 | * keep the mlocked page. | ||
| 2508 | */ | ||
| 2509 | if (vma->vm_flags & VM_LOCKED) | ||
| 2510 | clear_page_mlock(vmf.page); | ||
| 2466 | copy_user_highpage(page, vmf.page, address, vma); | 2511 | copy_user_highpage(page, vmf.page, address, vma); |
| 2467 | __SetPageUptodate(page); | 2512 | __SetPageUptodate(page); |
| 2468 | } else { | 2513 | } else { |
| @@ -2497,11 +2542,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2497 | 2542 | ||
| 2498 | } | 2543 | } |
| 2499 | 2544 | ||
| 2500 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { | ||
| 2501 | ret = VM_FAULT_OOM; | ||
| 2502 | goto out; | ||
| 2503 | } | ||
| 2504 | |||
| 2505 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2545 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
| 2506 | 2546 | ||
| 2507 | /* | 2547 | /* |
| @@ -2520,11 +2560,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2520 | entry = mk_pte(page, vma->vm_page_prot); | 2560 | entry = mk_pte(page, vma->vm_page_prot); |
| 2521 | if (flags & FAULT_FLAG_WRITE) | 2561 | if (flags & FAULT_FLAG_WRITE) |
| 2522 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2562 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 2523 | set_pte_at(mm, address, page_table, entry); | ||
| 2524 | if (anon) { | 2563 | if (anon) { |
| 2525 | inc_mm_counter(mm, anon_rss); | 2564 | inc_mm_counter(mm, anon_rss); |
| 2526 | lru_cache_add_active(page); | 2565 | SetPageSwapBacked(page); |
| 2527 | page_add_new_anon_rmap(page, vma, address); | 2566 | lru_cache_add_active_or_unevictable(page, vma); |
| 2567 | page_add_new_anon_rmap(page, vma, address); | ||
| 2528 | } else { | 2568 | } else { |
| 2529 | inc_mm_counter(mm, file_rss); | 2569 | inc_mm_counter(mm, file_rss); |
| 2530 | page_add_file_rmap(page); | 2570 | page_add_file_rmap(page); |
| @@ -2533,11 +2573,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2533 | get_page(dirty_page); | 2573 | get_page(dirty_page); |
| 2534 | } | 2574 | } |
| 2535 | } | 2575 | } |
| 2576 | //TODO: is this safe? do_anonymous_page() does it this way. | ||
| 2577 | set_pte_at(mm, address, page_table, entry); | ||
| 2536 | 2578 | ||
| 2537 | /* no need to invalidate: a not-present page won't be cached */ | 2579 | /* no need to invalidate: a not-present page won't be cached */ |
| 2538 | update_mmu_cache(vma, address, entry); | 2580 | update_mmu_cache(vma, address, entry); |
| 2539 | } else { | 2581 | } else { |
| 2540 | mem_cgroup_uncharge_page(page); | 2582 | if (charged) |
| 2583 | mem_cgroup_uncharge_page(page); | ||
| 2541 | if (anon) | 2584 | if (anon) |
| 2542 | page_cache_release(page); | 2585 | page_cache_release(page); |
| 2543 | else | 2586 | else |
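Annotation: __do_fault() now charges the freshly allocated COW copy as soon as it is created and records that in a local charged flag, so the failure path above only uncharges when this call actually took the charge. A generic, hedged sketch of that allocate/charge/conditionally-undo shape; every helper below is a fake.

	#include <stdio.h>
	#include <stdlib.h>

	static int fake_charge(void *p)    { (void)p; return 0; }	/* 0 = success */
	static void fake_uncharge(void *p) { (void)p; }

	static int fault_model(int need_copy, int install_fails)
	{
		void *page = NULL;
		int charged = 0;

		if (need_copy) {
			page = malloc(64);
			if (!page)
				return -1;
			if (fake_charge(page)) {
				free(page);
				return -1;
			}
			charged = 1;		/* remember we took the charge */
		}

		if (install_fails) {		/* pte changed under us, etc. */
			if (charged)
				fake_uncharge(page);
			free(page);
			return -1;
		}
		free(page);			/* real code would map it instead */
		return 0;
	}

	int main(void)
	{
		printf("%d %d\n", fault_model(1, 0), fault_model(1, 1));
		return 0;
	}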
| @@ -2772,19 +2815,9 @@ int make_pages_present(unsigned long addr, unsigned long end) | |||
| 2772 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; | 2815 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; |
| 2773 | ret = get_user_pages(current, current->mm, addr, | 2816 | ret = get_user_pages(current, current->mm, addr, |
| 2774 | len, write, 0, NULL, NULL); | 2817 | len, write, 0, NULL, NULL); |
| 2775 | if (ret < 0) { | 2818 | if (ret < 0) |
| 2776 | /* | ||
| 2777 | SUS require strange return value to mlock | ||
| 2778 | - invalid addr generate to ENOMEM. | ||
| 2779 | - out of memory should generate EAGAIN. | ||
| 2780 | */ | ||
| 2781 | if (ret == -EFAULT) | ||
| 2782 | ret = -ENOMEM; | ||
| 2783 | else if (ret == -ENOMEM) | ||
| 2784 | ret = -EAGAIN; | ||
| 2785 | return ret; | 2819 | return ret; |
| 2786 | } | 2820 | return ret == len ? 0 : -EFAULT; |
| 2787 | return ret == len ? 0 : -ENOMEM; | ||
| 2788 | } | 2821 | } |
| 2789 | 2822 | ||
| 2790 | #if !defined(__HAVE_ARCH_GATE_AREA) | 2823 | #if !defined(__HAVE_ARCH_GATE_AREA) |
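Annotation: make_pages_present() stops translating get_user_pages() failures into the SUS-mandated mlock errno values; it now passes the raw error through and turns a short page count into -EFAULT, with the translation presumably handled by the reworked mlock code elsewhere in this series. A minimal model of the new return mapping:

	#include <errno.h>
	#include <stdio.h>

	static int make_present_model(int gup_ret, int len)
	{
		if (gup_ret < 0)
			return gup_ret;			/* pass the error through */
		return gup_ret == len ? 0 : -EFAULT;	/* short count -> -EFAULT */
	}

	int main(void)
	{
		printf("%d %d %d\n",
		       make_present_model(-EFAULT, 4),	/* was remapped to -ENOMEM before */
		       make_present_model(4, 4),
		       make_present_model(3, 4));
		return 0;
	}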
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 89fee2dcb039..6837a1014372 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -26,6 +26,7 @@ | |||
| 26 | #include <linux/delay.h> | 26 | #include <linux/delay.h> |
| 27 | #include <linux/migrate.h> | 27 | #include <linux/migrate.h> |
| 28 | #include <linux/page-isolation.h> | 28 | #include <linux/page-isolation.h> |
| 29 | #include <linux/pfn.h> | ||
| 29 | 30 | ||
| 30 | #include <asm/tlbflush.h> | 31 | #include <asm/tlbflush.h> |
| 31 | 32 | ||
| @@ -323,11 +324,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
| 323 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); | 324 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); |
| 324 | BUG_ON(nr_pages % PAGES_PER_SECTION); | 325 | BUG_ON(nr_pages % PAGES_PER_SECTION); |
| 325 | 326 | ||
| 326 | release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); | ||
| 327 | |||
| 328 | sections_to_remove = nr_pages / PAGES_PER_SECTION; | 327 | sections_to_remove = nr_pages / PAGES_PER_SECTION; |
| 329 | for (i = 0; i < sections_to_remove; i++) { | 328 | for (i = 0; i < sections_to_remove; i++) { |
| 330 | unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; | 329 | unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; |
| 330 | release_mem_region(pfn << PAGE_SHIFT, | ||
| 331 | PAGES_PER_SECTION << PAGE_SHIFT); | ||
| 331 | ret = __remove_section(zone, __pfn_to_section(pfn)); | 332 | ret = __remove_section(zone, __pfn_to_section(pfn)); |
| 332 | if (ret) | 333 | if (ret) |
| 333 | break; | 334 | break; |
| @@ -657,8 +658,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
| 657 | * We can skip free pages. And we can only deal with pages on | 658 | * We can skip free pages. And we can only deal with pages on |
| 658 | * LRU. | 659 | * LRU. |
| 659 | */ | 660 | */ |
| 660 | ret = isolate_lru_page(page, &source); | 661 | ret = isolate_lru_page(page); |
| 661 | if (!ret) { /* Success */ | 662 | if (!ret) { /* Success */ |
| 663 | list_add_tail(&page->lru, &source); | ||
| 662 | move_pages--; | 664 | move_pages--; |
| 663 | } else { | 665 | } else { |
| 664 | /* Becasue we don't have big zone->lock. we should | 666 | /* Becasue we don't have big zone->lock. we should |
| @@ -849,10 +851,19 @@ failed_removal: | |||
| 849 | 851 | ||
| 850 | return ret; | 852 | return ret; |
| 851 | } | 853 | } |
| 854 | |||
| 855 | int remove_memory(u64 start, u64 size) | ||
| 856 | { | ||
| 857 | unsigned long start_pfn, end_pfn; | ||
| 858 | |||
| 859 | start_pfn = PFN_DOWN(start); | ||
| 860 | end_pfn = start_pfn + PFN_DOWN(size); | ||
| 861 | return offline_pages(start_pfn, end_pfn, 120 * HZ); | ||
| 862 | } | ||
| 852 | #else | 863 | #else |
| 853 | int remove_memory(u64 start, u64 size) | 864 | int remove_memory(u64 start, u64 size) |
| 854 | { | 865 | { |
| 855 | return -EINVAL; | 866 | return -EINVAL; |
| 856 | } | 867 | } |
| 857 | EXPORT_SYMBOL_GPL(remove_memory); | ||
| 858 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 868 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
| 869 | EXPORT_SYMBOL_GPL(remove_memory); | ||
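Annotation: the hotremove path gains a real remove_memory() that converts the byte range to page frame numbers with PFN_DOWN() and hands it to offline_pages() with a 120*HZ timeout, while __remove_pages() now releases the resource region one section at a time inside the loop. The PFN arithmetic, modelled with an assumed 4 KiB page size:

	#include <stdio.h>

	#define PAGE_SHIFT 12
	#define PFN_DOWN(x) ((unsigned long long)(x) >> PAGE_SHIFT)

	int main(void)
	{
		unsigned long long start = 0x40000000ULL;	/* 1 GiB */
		unsigned long long size  = 0x08000000ULL;	/* 128 MiB, section-aligned chunk */
		unsigned long long start_pfn = PFN_DOWN(start);
		unsigned long long end_pfn   = start_pfn + PFN_DOWN(size);

		printf("offline pfns [%llu, %llu)\n", start_pfn, end_pfn);
		return 0;
	}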
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 83369058ec13..e9493b1c1117 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -93,6 +93,8 @@ | |||
| 93 | #include <asm/tlbflush.h> | 93 | #include <asm/tlbflush.h> |
| 94 | #include <asm/uaccess.h> | 94 | #include <asm/uaccess.h> |
| 95 | 95 | ||
| 96 | #include "internal.h" | ||
| 97 | |||
| 96 | /* Internal flags */ | 98 | /* Internal flags */ |
| 97 | #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ | 99 | #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ |
| 98 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ | 100 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ |
| @@ -487,12 +489,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
| 487 | int err; | 489 | int err; |
| 488 | struct vm_area_struct *first, *vma, *prev; | 490 | struct vm_area_struct *first, *vma, *prev; |
| 489 | 491 | ||
| 490 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { | ||
| 491 | |||
| 492 | err = migrate_prep(); | ||
| 493 | if (err) | ||
| 494 | return ERR_PTR(err); | ||
| 495 | } | ||
| 496 | 492 | ||
| 497 | first = find_vma(mm, start); | 493 | first = find_vma(mm, start); |
| 498 | if (!first) | 494 | if (!first) |
| @@ -762,8 +758,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
| 762 | /* | 758 | /* |
| 763 | * Avoid migrating a page that is shared with others. | 759 | * Avoid migrating a page that is shared with others. |
| 764 | */ | 760 | */ |
| 765 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) | 761 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { |
| 766 | isolate_lru_page(page, pagelist); | 762 | if (!isolate_lru_page(page)) { |
| 763 | list_add_tail(&page->lru, pagelist); | ||
| 764 | } | ||
| 765 | } | ||
| 767 | } | 766 | } |
| 768 | 767 | ||
| 769 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) | 768 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) |
| @@ -804,9 +803,13 @@ int do_migrate_pages(struct mm_struct *mm, | |||
| 804 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | 803 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) |
| 805 | { | 804 | { |
| 806 | int busy = 0; | 805 | int busy = 0; |
| 807 | int err = 0; | 806 | int err; |
| 808 | nodemask_t tmp; | 807 | nodemask_t tmp; |
| 809 | 808 | ||
| 809 | err = migrate_prep(); | ||
| 810 | if (err) | ||
| 811 | return err; | ||
| 812 | |||
| 810 | down_read(&mm->mmap_sem); | 813 | down_read(&mm->mmap_sem); |
| 811 | 814 | ||
| 812 | err = migrate_vmas(mm, from_nodes, to_nodes, flags); | 815 | err = migrate_vmas(mm, from_nodes, to_nodes, flags); |
| @@ -969,6 +972,12 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
| 969 | start, start + len, mode, mode_flags, | 972 | start, start + len, mode, mode_flags, |
| 970 | nmask ? nodes_addr(*nmask)[0] : -1); | 973 | nmask ? nodes_addr(*nmask)[0] : -1); |
| 971 | 974 | ||
| 975 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { | ||
| 976 | |||
| 977 | err = migrate_prep(); | ||
| 978 | if (err) | ||
| 979 | return err; | ||
| 980 | } | ||
| 972 | down_write(&mm->mmap_sem); | 981 | down_write(&mm->mmap_sem); |
| 973 | vma = check_range(mm, start, end, nmask, | 982 | vma = check_range(mm, start, end, nmask, |
| 974 | flags | MPOL_MF_INVERT, &pagelist); | 983 | flags | MPOL_MF_INVERT, &pagelist); |
| @@ -2197,7 +2206,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty) | |||
| 2197 | if (PageSwapCache(page)) | 2206 | if (PageSwapCache(page)) |
| 2198 | md->swapcache++; | 2207 | md->swapcache++; |
| 2199 | 2208 | ||
| 2200 | if (PageActive(page)) | 2209 | if (PageActive(page) || PageUnevictable(page)) |
| 2201 | md->active++; | 2210 | md->active++; |
| 2202 | 2211 | ||
| 2203 | if (PageWriteback(page)) | 2212 | if (PageWriteback(page)) |
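Annotation: isolate_lru_page() no longer takes a target list (the following mm/migrate.c hunks remove the old list-taking copy), so callers such as migrate_page_add() and the hotplug migration loop now test its return value and queue the page themselves; migrate_prep() likewise moves out of check_range() into the mbind/migrate_pages entry points. A caller-side sketch of the new isolate-then-queue pattern, with stand-in types and a fake isolation helper:

	#include <stdio.h>

	struct node { struct node *next; int id; };

	static int fake_isolate(struct node *n)	/* 0 on success, nonzero otherwise */
	{
		return n->id % 2 ? -16 : 0;	/* -16 stands in for -EBUSY */
	}

	static void collect(struct node *n, struct node **list)
	{
		if (!fake_isolate(n)) {		/* only queue pages we actually isolated */
			n->next = *list;
			*list = n;
		}
	}

	int main(void)
	{
		struct node pages[3] = { { 0, 0 }, { 0, 1 }, { 0, 2 } };
		struct node *isolated = NULL;

		for (int i = 0; i < 3; i++)
			collect(&pages[i], &isolated);
		for (struct node *n = isolated; n; n = n->next)
			printf("isolated page %d\n", n->id);
		return 0;
	}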
diff --git a/mm/migrate.c b/mm/migrate.c index 2a80136b23bb..385db89f0c33 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -37,36 +37,6 @@ | |||
| 37 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 37 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
| 38 | 38 | ||
| 39 | /* | 39 | /* |
| 40 | * Isolate one page from the LRU lists. If successful put it onto | ||
| 41 | * the indicated list with elevated page count. | ||
| 42 | * | ||
| 43 | * Result: | ||
| 44 | * -EBUSY: page not on LRU list | ||
| 45 | * 0: page removed from LRU list and added to the specified list. | ||
| 46 | */ | ||
| 47 | int isolate_lru_page(struct page *page, struct list_head *pagelist) | ||
| 48 | { | ||
| 49 | int ret = -EBUSY; | ||
| 50 | |||
| 51 | if (PageLRU(page)) { | ||
| 52 | struct zone *zone = page_zone(page); | ||
| 53 | |||
| 54 | spin_lock_irq(&zone->lru_lock); | ||
| 55 | if (PageLRU(page) && get_page_unless_zero(page)) { | ||
| 56 | ret = 0; | ||
| 57 | ClearPageLRU(page); | ||
| 58 | if (PageActive(page)) | ||
| 59 | del_page_from_active_list(zone, page); | ||
| 60 | else | ||
| 61 | del_page_from_inactive_list(zone, page); | ||
| 62 | list_add_tail(&page->lru, pagelist); | ||
| 63 | } | ||
| 64 | spin_unlock_irq(&zone->lru_lock); | ||
| 65 | } | ||
| 66 | return ret; | ||
| 67 | } | ||
| 68 | |||
| 69 | /* | ||
| 70 | * migrate_prep() needs to be called before we start compiling a list of pages | 40 | * migrate_prep() needs to be called before we start compiling a list of pages |
| 71 | * to be migrated using isolate_lru_page(). | 41 | * to be migrated using isolate_lru_page(). |
| 72 | */ | 42 | */ |
| @@ -83,23 +53,9 @@ int migrate_prep(void) | |||
| 83 | return 0; | 53 | return 0; |
| 84 | } | 54 | } |
| 85 | 55 | ||
| 86 | static inline void move_to_lru(struct page *page) | ||
| 87 | { | ||
| 88 | if (PageActive(page)) { | ||
| 89 | /* | ||
| 90 | * lru_cache_add_active checks that | ||
| 91 | * the PG_active bit is off. | ||
| 92 | */ | ||
| 93 | ClearPageActive(page); | ||
| 94 | lru_cache_add_active(page); | ||
| 95 | } else { | ||
| 96 | lru_cache_add(page); | ||
| 97 | } | ||
| 98 | put_page(page); | ||
| 99 | } | ||
| 100 | |||
| 101 | /* | 56 | /* |
| 102 | * Add isolated pages on the list back to the LRU. | 57 | * Add isolated pages on the list back to the LRU under page lock |
| 58 | * to avoid leaking evictable pages back onto the unevictable list. | ||
| 103 | * | 59 | * |
| 104 | * returns the number of pages put back. | 60 | * returns the number of pages put back. |
| 105 | */ | 61 | */ |
| @@ -111,7 +67,7 @@ int putback_lru_pages(struct list_head *l) | |||
| 111 | 67 | ||
| 112 | list_for_each_entry_safe(page, page2, l, lru) { | 68 | list_for_each_entry_safe(page, page2, l, lru) { |
| 113 | list_del(&page->lru); | 69 | list_del(&page->lru); |
| 114 | move_to_lru(page); | 70 | putback_lru_page(page); |
| 115 | count++; | 71 | count++; |
| 116 | } | 72 | } |
| 117 | return count; | 73 | return count; |
| @@ -374,8 +330,6 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
| 374 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | 330 | __inc_zone_page_state(newpage, NR_FILE_PAGES); |
| 375 | 331 | ||
| 376 | spin_unlock_irq(&mapping->tree_lock); | 332 | spin_unlock_irq(&mapping->tree_lock); |
| 377 | if (!PageSwapCache(newpage)) | ||
| 378 | mem_cgroup_uncharge_cache_page(page); | ||
| 379 | 333 | ||
| 380 | return 0; | 334 | return 0; |
| 381 | } | 335 | } |
| @@ -385,6 +339,8 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
| 385 | */ | 339 | */ |
| 386 | static void migrate_page_copy(struct page *newpage, struct page *page) | 340 | static void migrate_page_copy(struct page *newpage, struct page *page) |
| 387 | { | 341 | { |
| 342 | int anon; | ||
| 343 | |||
| 388 | copy_highpage(newpage, page); | 344 | copy_highpage(newpage, page); |
| 389 | 345 | ||
| 390 | if (PageError(page)) | 346 | if (PageError(page)) |
| @@ -393,8 +349,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
| 393 | SetPageReferenced(newpage); | 349 | SetPageReferenced(newpage); |
| 394 | if (PageUptodate(page)) | 350 | if (PageUptodate(page)) |
| 395 | SetPageUptodate(newpage); | 351 | SetPageUptodate(newpage); |
| 396 | if (PageActive(page)) | 352 | if (TestClearPageActive(page)) { |
| 353 | VM_BUG_ON(PageUnevictable(page)); | ||
| 397 | SetPageActive(newpage); | 354 | SetPageActive(newpage); |
| 355 | } else | ||
| 356 | unevictable_migrate_page(newpage, page); | ||
| 398 | if (PageChecked(page)) | 357 | if (PageChecked(page)) |
| 399 | SetPageChecked(newpage); | 358 | SetPageChecked(newpage); |
| 400 | if (PageMappedToDisk(page)) | 359 | if (PageMappedToDisk(page)) |
| @@ -412,14 +371,20 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
| 412 | __set_page_dirty_nobuffers(newpage); | 371 | __set_page_dirty_nobuffers(newpage); |
| 413 | } | 372 | } |
| 414 | 373 | ||
| 374 | mlock_migrate_page(newpage, page); | ||
| 375 | |||
| 415 | #ifdef CONFIG_SWAP | 376 | #ifdef CONFIG_SWAP |
| 416 | ClearPageSwapCache(page); | 377 | ClearPageSwapCache(page); |
| 417 | #endif | 378 | #endif |
| 418 | ClearPageActive(page); | ||
| 419 | ClearPagePrivate(page); | 379 | ClearPagePrivate(page); |
| 420 | set_page_private(page, 0); | 380 | set_page_private(page, 0); |
| 381 | /* page->mapping contains a flag for PageAnon() */ | ||
| 382 | anon = PageAnon(page); | ||
| 421 | page->mapping = NULL; | 383 | page->mapping = NULL; |
| 422 | 384 | ||
| 385 | if (!anon) /* This page was removed from radix-tree. */ | ||
| 386 | mem_cgroup_uncharge_cache_page(page); | ||
| 387 | |||
| 423 | /* | 388 | /* |
| 424 | * If any waiters have accumulated on the new page then | 389 | * If any waiters have accumulated on the new page then |
| 425 | * wake them up. | 390 | * wake them up. |
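The unevictable_migrate_page() call added to migrate_page_copy() above is assumed to be the small helper this series places in mm/internal.h; roughly (a sketch, not the verbatim helper):

/* carry PG_unevictable over to the replacement page during migration */
static inline void unevictable_migrate_page(struct page *new, struct page *old)
{
	if (TestClearPageUnevictable(old))
		SetPageUnevictable(new);
}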
| @@ -594,6 +559,10 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
| 594 | * | 559 | * |
| 595 | * The new page will have replaced the old page if this function | 560 | * The new page will have replaced the old page if this function |
| 596 | * is successful. | 561 | * is successful. |
| 562 | * | ||
| 563 | * Return value: | ||
| 564 | * < 0 - error code | ||
| 565 | * == 0 - success | ||
| 597 | */ | 566 | */ |
| 598 | static int move_to_new_page(struct page *newpage, struct page *page) | 567 | static int move_to_new_page(struct page *newpage, struct page *page) |
| 599 | { | 568 | { |
| @@ -611,6 +580,8 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
| 611 | /* Prepare mapping for the new page.*/ | 580 | /* Prepare mapping for the new page.*/ |
| 612 | newpage->index = page->index; | 581 | newpage->index = page->index; |
| 613 | newpage->mapping = page->mapping; | 582 | newpage->mapping = page->mapping; |
| 583 | if (PageSwapBacked(page)) | ||
| 584 | SetPageSwapBacked(newpage); | ||
| 614 | 585 | ||
| 615 | mapping = page_mapping(page); | 586 | mapping = page_mapping(page); |
| 616 | if (!mapping) | 587 | if (!mapping) |
| @@ -654,9 +625,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 654 | if (!newpage) | 625 | if (!newpage) |
| 655 | return -ENOMEM; | 626 | return -ENOMEM; |
| 656 | 627 | ||
| 657 | if (page_count(page) == 1) | 628 | if (page_count(page) == 1) { |
| 658 | /* page was freed from under us. So we are done. */ | 629 | /* page was freed from under us. So we are done. */ |
| 659 | goto move_newpage; | 630 | goto move_newpage; |
| 631 | } | ||
| 660 | 632 | ||
| 661 | charge = mem_cgroup_prepare_migration(page, newpage); | 633 | charge = mem_cgroup_prepare_migration(page, newpage); |
| 662 | if (charge == -ENOMEM) { | 634 | if (charge == -ENOMEM) { |
| @@ -730,7 +702,6 @@ rcu_unlock: | |||
| 730 | rcu_read_unlock(); | 702 | rcu_read_unlock(); |
| 731 | 703 | ||
| 732 | unlock: | 704 | unlock: |
| 733 | |||
| 734 | unlock_page(page); | 705 | unlock_page(page); |
| 735 | 706 | ||
| 736 | if (rc != -EAGAIN) { | 707 | if (rc != -EAGAIN) { |
| @@ -741,17 +712,19 @@ unlock: | |||
| 741 | * restored. | 712 | * restored. |
| 742 | */ | 713 | */ |
| 743 | list_del(&page->lru); | 714 | list_del(&page->lru); |
| 744 | move_to_lru(page); | 715 | putback_lru_page(page); |
| 745 | } | 716 | } |
| 746 | 717 | ||
| 747 | move_newpage: | 718 | move_newpage: |
| 748 | if (!charge) | 719 | if (!charge) |
| 749 | mem_cgroup_end_migration(newpage); | 720 | mem_cgroup_end_migration(newpage); |
| 721 | |||
| 750 | /* | 722 | /* |
| 751 | * Move the new page to the LRU. If migration was not successful | 723 | * Move the new page to the LRU. If migration was not successful |
| 752 | * then this will free the page. | 724 | * then this will free the page. |
| 753 | */ | 725 | */ |
| 754 | move_to_lru(newpage); | 726 | putback_lru_page(newpage); |
| 727 | |||
| 755 | if (result) { | 728 | if (result) { |
| 756 | if (rc) | 729 | if (rc) |
| 757 | *result = rc; | 730 | *result = rc; |
| @@ -858,20 +831,22 @@ static struct page *new_page_node(struct page *p, unsigned long private, | |||
| 858 | * Move a set of pages as indicated in the pm array. The addr | 831 | * Move a set of pages as indicated in the pm array. The addr |
| 859 | * field must be set to the virtual address of the page to be moved | 832 | * field must be set to the virtual address of the page to be moved |
| 860 | * and the node number must contain a valid target node. | 833 | * and the node number must contain a valid target node. |
| 834 | * The pm array ends with node = MAX_NUMNODES. | ||
| 861 | */ | 835 | */ |
| 862 | static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, | 836 | static int do_move_page_to_node_array(struct mm_struct *mm, |
| 863 | int migrate_all) | 837 | struct page_to_node *pm, |
| 838 | int migrate_all) | ||
| 864 | { | 839 | { |
| 865 | int err; | 840 | int err; |
| 866 | struct page_to_node *pp; | 841 | struct page_to_node *pp; |
| 867 | LIST_HEAD(pagelist); | 842 | LIST_HEAD(pagelist); |
| 868 | 843 | ||
| 844 | migrate_prep(); | ||
| 869 | down_read(&mm->mmap_sem); | 845 | down_read(&mm->mmap_sem); |
| 870 | 846 | ||
| 871 | /* | 847 | /* |
| 872 | * Build a list of pages to migrate | 848 | * Build a list of pages to migrate |
| 873 | */ | 849 | */ |
| 874 | migrate_prep(); | ||
| 875 | for (pp = pm; pp->node != MAX_NUMNODES; pp++) { | 850 | for (pp = pm; pp->node != MAX_NUMNODES; pp++) { |
| 876 | struct vm_area_struct *vma; | 851 | struct vm_area_struct *vma; |
| 877 | struct page *page; | 852 | struct page *page; |
| @@ -914,7 +889,9 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, | |||
| 914 | !migrate_all) | 889 | !migrate_all) |
| 915 | goto put_and_set; | 890 | goto put_and_set; |
| 916 | 891 | ||
| 917 | err = isolate_lru_page(page, &pagelist); | 892 | err = isolate_lru_page(page); |
| 893 | if (!err) | ||
| 894 | list_add_tail(&page->lru, &pagelist); | ||
| 918 | put_and_set: | 895 | put_and_set: |
| 919 | /* | 896 | /* |
| 920 | * Either remove the duplicate refcount from | 897 | * Either remove the duplicate refcount from |
| @@ -926,36 +903,118 @@ set_status: | |||
| 926 | pp->status = err; | 903 | pp->status = err; |
| 927 | } | 904 | } |
| 928 | 905 | ||
| 906 | err = 0; | ||
| 929 | if (!list_empty(&pagelist)) | 907 | if (!list_empty(&pagelist)) |
| 930 | err = migrate_pages(&pagelist, new_page_node, | 908 | err = migrate_pages(&pagelist, new_page_node, |
| 931 | (unsigned long)pm); | 909 | (unsigned long)pm); |
| 932 | else | ||
| 933 | err = -ENOENT; | ||
| 934 | 910 | ||
| 935 | up_read(&mm->mmap_sem); | 911 | up_read(&mm->mmap_sem); |
| 936 | return err; | 912 | return err; |
| 937 | } | 913 | } |
| 938 | 914 | ||
| 939 | /* | 915 | /* |
| 940 | * Determine the nodes of a list of pages. The addr in the pm array | 916 | * Migrate an array of page addresses onto an array of nodes and fill |
| 941 | * must have been set to the virtual address of which we want to determine | 917 | * the corresponding status array. |
| 942 | * the node number. | ||
| 943 | */ | 918 | */ |
| 944 | static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) | 919 | static int do_pages_move(struct mm_struct *mm, struct task_struct *task, |
| 920 | unsigned long nr_pages, | ||
| 921 | const void __user * __user *pages, | ||
| 922 | const int __user *nodes, | ||
| 923 | int __user *status, int flags) | ||
| 945 | { | 924 | { |
| 925 | struct page_to_node *pm = NULL; | ||
| 926 | nodemask_t task_nodes; | ||
| 927 | int err = 0; | ||
| 928 | int i; | ||
| 929 | |||
| 930 | task_nodes = cpuset_mems_allowed(task); | ||
| 931 | |||
| 932 | /* Limit nr_pages so that the multiplication may not overflow */ | ||
| 933 | if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { | ||
| 934 | err = -E2BIG; | ||
| 935 | goto out; | ||
| 936 | } | ||
| 937 | |||
| 938 | pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); | ||
| 939 | if (!pm) { | ||
| 940 | err = -ENOMEM; | ||
| 941 | goto out; | ||
| 942 | } | ||
| 943 | |||
| 944 | /* | ||
| 945 | * Get parameters from user space and initialize the pm | ||
| 946 | * array. Return various errors if the user did something wrong. | ||
| 947 | */ | ||
| 948 | for (i = 0; i < nr_pages; i++) { | ||
| 949 | const void __user *p; | ||
| 950 | |||
| 951 | err = -EFAULT; | ||
| 952 | if (get_user(p, pages + i)) | ||
| 953 | goto out_pm; | ||
| 954 | |||
| 955 | pm[i].addr = (unsigned long)p; | ||
| 956 | if (nodes) { | ||
| 957 | int node; | ||
| 958 | |||
| 959 | if (get_user(node, nodes + i)) | ||
| 960 | goto out_pm; | ||
| 961 | |||
| 962 | err = -ENODEV; | ||
| 963 | if (!node_state(node, N_HIGH_MEMORY)) | ||
| 964 | goto out_pm; | ||
| 965 | |||
| 966 | err = -EACCES; | ||
| 967 | if (!node_isset(node, task_nodes)) | ||
| 968 | goto out_pm; | ||
| 969 | |||
| 970 | pm[i].node = node; | ||
| 971 | } else | ||
| 972 | pm[i].node = 0; /* anything to not match MAX_NUMNODES */ | ||
| 973 | } | ||
| 974 | /* End marker */ | ||
| 975 | pm[nr_pages].node = MAX_NUMNODES; | ||
| 976 | |||
| 977 | err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL); | ||
| 978 | if (err >= 0) | ||
| 979 | /* Return status information */ | ||
| 980 | for (i = 0; i < nr_pages; i++) | ||
| 981 | if (put_user(pm[i].status, status + i)) | ||
| 982 | err = -EFAULT; | ||
| 983 | |||
| 984 | out_pm: | ||
| 985 | vfree(pm); | ||
| 986 | out: | ||
| 987 | return err; | ||
| 988 | } | ||
| 989 | |||
| 990 | /* | ||
| 991 | * Determine the nodes of an array of pages and store them in a status array. | ||
| 992 | */ | ||
| 993 | static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, | ||
| 994 | const void __user * __user *pages, | ||
| 995 | int __user *status) | ||
| 996 | { | ||
| 997 | unsigned long i; | ||
| 998 | int err; | ||
| 999 | |||
| 946 | down_read(&mm->mmap_sem); | 1000 | down_read(&mm->mmap_sem); |
| 947 | 1001 | ||
| 948 | for ( ; pm->node != MAX_NUMNODES; pm++) { | 1002 | for (i = 0; i < nr_pages; i++) { |
| 1003 | const void __user *p; | ||
| 1004 | unsigned long addr; | ||
| 949 | struct vm_area_struct *vma; | 1005 | struct vm_area_struct *vma; |
| 950 | struct page *page; | 1006 | struct page *page; |
| 951 | int err; | ||
| 952 | 1007 | ||
| 953 | err = -EFAULT; | 1008 | err = -EFAULT; |
| 954 | vma = find_vma(mm, pm->addr); | 1009 | if (get_user(p, pages+i)) |
| 1010 | goto out; | ||
| 1011 | addr = (unsigned long) p; | ||
| 1012 | |||
| 1013 | vma = find_vma(mm, addr); | ||
| 955 | if (!vma) | 1014 | if (!vma) |
| 956 | goto set_status; | 1015 | goto set_status; |
| 957 | 1016 | ||
| 958 | page = follow_page(vma, pm->addr, 0); | 1017 | page = follow_page(vma, addr, 0); |
| 959 | 1018 | ||
| 960 | err = PTR_ERR(page); | 1019 | err = PTR_ERR(page); |
| 961 | if (IS_ERR(page)) | 1020 | if (IS_ERR(page)) |
| @@ -968,11 +1027,13 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) | |||
| 968 | 1027 | ||
| 969 | err = page_to_nid(page); | 1028 | err = page_to_nid(page); |
| 970 | set_status: | 1029 | set_status: |
| 971 | pm->status = err; | 1030 | put_user(err, status+i); |
| 972 | } | 1031 | } |
| 1032 | err = 0; | ||
| 973 | 1033 | ||
| 1034 | out: | ||
| 974 | up_read(&mm->mmap_sem); | 1035 | up_read(&mm->mmap_sem); |
| 975 | return 0; | 1036 | return err; |
| 976 | } | 1037 | } |
| 977 | 1038 | ||
| 978 | /* | 1039 | /* |
| @@ -984,12 +1045,9 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, | |||
| 984 | const int __user *nodes, | 1045 | const int __user *nodes, |
| 985 | int __user *status, int flags) | 1046 | int __user *status, int flags) |
| 986 | { | 1047 | { |
| 987 | int err = 0; | ||
| 988 | int i; | ||
| 989 | struct task_struct *task; | 1048 | struct task_struct *task; |
| 990 | nodemask_t task_nodes; | ||
| 991 | struct mm_struct *mm; | 1049 | struct mm_struct *mm; |
| 992 | struct page_to_node *pm = NULL; | 1050 | int err; |
| 993 | 1051 | ||
| 994 | /* Check flags */ | 1052 | /* Check flags */ |
| 995 | if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) | 1053 | if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) |
| @@ -1021,75 +1079,21 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, | |||
| 1021 | (current->uid != task->suid) && (current->uid != task->uid) && | 1079 | (current->uid != task->suid) && (current->uid != task->uid) && |
| 1022 | !capable(CAP_SYS_NICE)) { | 1080 | !capable(CAP_SYS_NICE)) { |
| 1023 | err = -EPERM; | 1081 | err = -EPERM; |
| 1024 | goto out2; | 1082 | goto out; |
| 1025 | } | 1083 | } |
| 1026 | 1084 | ||
| 1027 | err = security_task_movememory(task); | 1085 | err = security_task_movememory(task); |
| 1028 | if (err) | 1086 | if (err) |
| 1029 | goto out2; | 1087 | goto out; |
| 1030 | |||
| 1031 | |||
| 1032 | task_nodes = cpuset_mems_allowed(task); | ||
| 1033 | |||
| 1034 | /* Limit nr_pages so that the multiplication may not overflow */ | ||
| 1035 | if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { | ||
| 1036 | err = -E2BIG; | ||
| 1037 | goto out2; | ||
| 1038 | } | ||
| 1039 | |||
| 1040 | pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); | ||
| 1041 | if (!pm) { | ||
| 1042 | err = -ENOMEM; | ||
| 1043 | goto out2; | ||
| 1044 | } | ||
| 1045 | |||
| 1046 | /* | ||
| 1047 | * Get parameters from user space and initialize the pm | ||
| 1048 | * array. Return various errors if the user did something wrong. | ||
| 1049 | */ | ||
| 1050 | for (i = 0; i < nr_pages; i++) { | ||
| 1051 | const void __user *p; | ||
| 1052 | |||
| 1053 | err = -EFAULT; | ||
| 1054 | if (get_user(p, pages + i)) | ||
| 1055 | goto out; | ||
| 1056 | |||
| 1057 | pm[i].addr = (unsigned long)p; | ||
| 1058 | if (nodes) { | ||
| 1059 | int node; | ||
| 1060 | |||
| 1061 | if (get_user(node, nodes + i)) | ||
| 1062 | goto out; | ||
| 1063 | |||
| 1064 | err = -ENODEV; | ||
| 1065 | if (!node_state(node, N_HIGH_MEMORY)) | ||
| 1066 | goto out; | ||
| 1067 | |||
| 1068 | err = -EACCES; | ||
| 1069 | if (!node_isset(node, task_nodes)) | ||
| 1070 | goto out; | ||
| 1071 | 1088 | ||
| 1072 | pm[i].node = node; | 1089 | if (nodes) { |
| 1073 | } else | 1090 | err = do_pages_move(mm, task, nr_pages, pages, nodes, status, |
| 1074 | pm[i].node = 0; /* anything to not match MAX_NUMNODES */ | 1091 | flags); |
| 1092 | } else { | ||
| 1093 | err = do_pages_stat(mm, nr_pages, pages, status); | ||
| 1075 | } | 1094 | } |
| 1076 | /* End marker */ | ||
| 1077 | pm[nr_pages].node = MAX_NUMNODES; | ||
| 1078 | |||
| 1079 | if (nodes) | ||
| 1080 | err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); | ||
| 1081 | else | ||
| 1082 | err = do_pages_stat(mm, pm); | ||
| 1083 | |||
| 1084 | if (err >= 0) | ||
| 1085 | /* Return status information */ | ||
| 1086 | for (i = 0; i < nr_pages; i++) | ||
| 1087 | if (put_user(pm[i].status, status + i)) | ||
| 1088 | err = -EFAULT; | ||
| 1089 | 1095 | ||
| 1090 | out: | 1096 | out: |
| 1091 | vfree(pm); | ||
| 1092 | out2: | ||
| 1093 | mmput(mm); | 1097 | mmput(mm); |
| 1094 | return err; | 1098 | return err; |
| 1095 | } | 1099 | } |
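From user space the rework above is invisible; the array protocol of sys_move_pages() stays the same. An illustrative caller using the libnuma wrapper (move_pages() simply invokes the syscall; error handling trimmed):

#include <numaif.h>
#include <stdio.h>

static long query_then_move(void *addr)
{
	void *pages[1] = { addr };
	int nodes[1] = { 1 };		/* desired target node */
	int status[1];

	/* nodes == NULL: kernel takes the do_pages_stat() path and
	 * writes the page's current node into status[] */
	if (move_pages(0 /* self */, 1, pages, NULL, status, 0) == 0)
		printf("page currently on node %d\n", status[0]);

	/* nodes != NULL: kernel takes the do_pages_move() path */
	return move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE);
}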
diff --git a/mm/mlock.c b/mm/mlock.c index 01fbe93eff5c..a6da2aee940a 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
| @@ -8,10 +8,18 @@ | |||
| 8 | #include <linux/capability.h> | 8 | #include <linux/capability.h> |
| 9 | #include <linux/mman.h> | 9 | #include <linux/mman.h> |
| 10 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
| 11 | #include <linux/swap.h> | ||
| 12 | #include <linux/swapops.h> | ||
| 13 | #include <linux/pagemap.h> | ||
| 11 | #include <linux/mempolicy.h> | 14 | #include <linux/mempolicy.h> |
| 12 | #include <linux/syscalls.h> | 15 | #include <linux/syscalls.h> |
| 13 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
| 14 | #include <linux/module.h> | 17 | #include <linux/module.h> |
| 18 | #include <linux/rmap.h> | ||
| 19 | #include <linux/mmzone.h> | ||
| 20 | #include <linux/hugetlb.h> | ||
| 21 | |||
| 22 | #include "internal.h" | ||
| 15 | 23 | ||
| 16 | int can_do_mlock(void) | 24 | int can_do_mlock(void) |
| 17 | { | 25 | { |
| @@ -23,17 +31,373 @@ int can_do_mlock(void) | |||
| 23 | } | 31 | } |
| 24 | EXPORT_SYMBOL(can_do_mlock); | 32 | EXPORT_SYMBOL(can_do_mlock); |
| 25 | 33 | ||
| 34 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 35 | /* | ||
| 36 | * Mlocked pages are marked with the PageMlocked() flag for efficient testing | ||
| 37 | * in vmscan and, possibly, the fault path; and to support semi-accurate | ||
| 38 | * statistics. | ||
| 39 | * | ||
| 40 | * An mlocked page [PageMlocked(page)] is unevictable. As such, it will | ||
| 41 | * be placed on the LRU "unevictable" list, rather than the [in]active lists. | ||
| 42 | * The unevictable list is an LRU sibling list to the [in]active lists. | ||
| 43 | * PageUnevictable is set to indicate the unevictable state. | ||
| 44 | * | ||
| 45 | * When lazy mlocking via vmscan, it is important to ensure that the | ||
| 46 | * vma's VM_LOCKED status is not concurrently being modified, otherwise we | ||
| 47 | * may have mlocked a page that is being munlocked. So lazy mlock must take | ||
| 48 | * the mmap_sem for read, and verify that the vma really is locked | ||
| 49 | * (see mm/rmap.c). | ||
| 50 | */ | ||
| 51 | |||
| 52 | /* | ||
| 53 | * LRU accounting for clear_page_mlock() | ||
| 54 | */ | ||
| 55 | void __clear_page_mlock(struct page *page) | ||
| 56 | { | ||
| 57 | VM_BUG_ON(!PageLocked(page)); | ||
| 58 | |||
| 59 | if (!page->mapping) { /* truncated ? */ | ||
| 60 | return; | ||
| 61 | } | ||
| 62 | |||
| 63 | dec_zone_page_state(page, NR_MLOCK); | ||
| 64 | count_vm_event(UNEVICTABLE_PGCLEARED); | ||
| 65 | if (!isolate_lru_page(page)) { | ||
| 66 | putback_lru_page(page); | ||
| 67 | } else { | ||
| 68 | /* | ||
| 69 | * We lost the race; the page has already moved to the evictable list. | ||
| 70 | */ | ||
| 71 | if (PageUnevictable(page)) | ||
| 72 | count_vm_event(UNEVICTABLE_PGSTRANDED); | ||
| 73 | } | ||
| 74 | } | ||
| 75 | |||
| 76 | /* | ||
| 77 | * Mark page as mlocked if not already. | ||
| 78 | * If page on LRU, isolate and putback to move to unevictable list. | ||
| 79 | */ | ||
| 80 | void mlock_vma_page(struct page *page) | ||
| 81 | { | ||
| 82 | BUG_ON(!PageLocked(page)); | ||
| 83 | |||
| 84 | if (!TestSetPageMlocked(page)) { | ||
| 85 | inc_zone_page_state(page, NR_MLOCK); | ||
| 86 | count_vm_event(UNEVICTABLE_PGMLOCKED); | ||
| 87 | if (!isolate_lru_page(page)) | ||
| 88 | putback_lru_page(page); | ||
| 89 | } | ||
| 90 | } | ||
| 91 | |||
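To make the "lazy mlock via vmscan" rule from the comment block above concrete, here is the rough shape of the check the reclaim side performs (the real code lives in mm/rmap.c in this series; lazy_mlock() is a hypothetical name, and the page is assumed locked by the caller):

static int lazy_mlock(struct page *page, struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;

	/* VM_LOCKED may only be trusted while mmap_sem is held for read */
	if (down_read_trylock(&mm->mmap_sem)) {
		if (vma->vm_flags & VM_LOCKED)
			mlock_vma_page(page);	/* moves it to the unevictable list */
		up_read(&mm->mmap_sem);
		return 1;
	}
	return 0;	/* could not verify; leave the page alone for now */
}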
| 92 | /* | ||
| 93 | * called from munlock()/munmap() path with page supposedly on the LRU. | ||
| 94 | * | ||
| 95 | * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked | ||
| 96 | * [in try_to_munlock()] and then attempt to isolate the page. We must | ||
| 97 | * isolate the page to keep others from messing with its unevictable | ||
| 98 | * and mlocked state while trying to munlock. However, we pre-clear the | ||
| 99 | * mlocked state anyway as we might lose the isolation race and we might | ||
| 100 | * not get another chance to clear PageMlocked. If we successfully | ||
| 101 | * isolate the page and try_to_munlock() detects other VM_LOCKED vmas | ||
| 102 | * mapping the page, it will restore the PageMlocked state, unless the page | ||
| 103 | * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), | ||
| 104 | * perhaps redundantly. | ||
| 105 | * If we lose the isolation race, and the page is mapped by other VM_LOCKED | ||
| 106 | * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() | ||
| 107 | * either of which will restore the PageMlocked state by calling | ||
| 108 | * mlock_vma_page() above, if it can grab the vma's mmap sem. | ||
| 109 | */ | ||
| 110 | static void munlock_vma_page(struct page *page) | ||
| 111 | { | ||
| 112 | BUG_ON(!PageLocked(page)); | ||
| 113 | |||
| 114 | if (TestClearPageMlocked(page)) { | ||
| 115 | dec_zone_page_state(page, NR_MLOCK); | ||
| 116 | if (!isolate_lru_page(page)) { | ||
| 117 | int ret = try_to_munlock(page); | ||
| 118 | /* | ||
| 119 | * did try_to_munlock() succeed or punt? | ||
| 120 | */ | ||
| 121 | if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) | ||
| 122 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | ||
| 123 | |||
| 124 | putback_lru_page(page); | ||
| 125 | } else { | ||
| 126 | /* | ||
| 127 | * We lost the race. Let try_to_unmap() deal | ||
| 128 | * with it. At least we get the page state and | ||
| 129 | * mlock stats right. However, the page is still on | ||
| 130 | * the unevictable list. We'll fix that up when | ||
| 131 | * the page is eventually freed or we scan the | ||
| 132 | * unevictable list. | ||
| 133 | */ | ||
| 134 | if (PageUnevictable(page)) | ||
| 135 | count_vm_event(UNEVICTABLE_PGSTRANDED); | ||
| 136 | else | ||
| 137 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | ||
| 138 | } | ||
| 139 | } | ||
| 140 | } | ||
| 141 | |||
| 142 | /** | ||
| 143 | * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. | ||
| 144 | * @vma: target vma | ||
| 145 | * @start: start address | ||
| 146 | * @end: end address | ||
| 147 | * @mlock: 0 indicate munlock, otherwise mlock. | ||
| 148 | * | ||
| 149 | * If @mlock == 0, unlock an mlocked range; | ||
| 150 | * else mlock the range of pages. This takes care of making the pages present, | ||
| 151 | * too. | ||
| 152 | * | ||
| 153 | * return 0 on success, negative error code on error. | ||
| 154 | * | ||
| 155 | * vma->vm_mm->mmap_sem must be held for at least read. | ||
| 156 | */ | ||
| 157 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | ||
| 158 | unsigned long start, unsigned long end, | ||
| 159 | int mlock) | ||
| 160 | { | ||
| 161 | struct mm_struct *mm = vma->vm_mm; | ||
| 162 | unsigned long addr = start; | ||
| 163 | struct page *pages[16]; /* 16 gives a reasonable batch */ | ||
| 164 | int nr_pages = (end - start) / PAGE_SIZE; | ||
| 165 | int ret; | ||
| 166 | int gup_flags = 0; | ||
| 167 | |||
| 168 | VM_BUG_ON(start & ~PAGE_MASK); | ||
| 169 | VM_BUG_ON(end & ~PAGE_MASK); | ||
| 170 | VM_BUG_ON(start < vma->vm_start); | ||
| 171 | VM_BUG_ON(end > vma->vm_end); | ||
| 172 | VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && | ||
| 173 | (atomic_read(&mm->mm_users) != 0)); | ||
| 174 | |||
| 175 | /* | ||
| 176 | * mlock: don't populate pages if the vma has PROT_NONE permission. | ||
| 177 | * munlock: always munlock the pages, even where | ||
| 178 | * the vma has PROT_NONE permission. | ||
| 179 | */ | ||
| 180 | if (!mlock) | ||
| 181 | gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS; | ||
| 182 | |||
| 183 | if (vma->vm_flags & VM_WRITE) | ||
| 184 | gup_flags |= GUP_FLAGS_WRITE; | ||
| 185 | |||
| 186 | while (nr_pages > 0) { | ||
| 187 | int i; | ||
| 188 | |||
| 189 | cond_resched(); | ||
| 190 | |||
| 191 | /* | ||
| 192 | * get_user_pages makes pages present if we are | ||
| 193 | * setting mlock. and this extra reference count will | ||
| 194 | * disable migration of this page. However, page may | ||
| 195 | * still be truncated out from under us. | ||
| 196 | */ | ||
| 197 | ret = __get_user_pages(current, mm, addr, | ||
| 198 | min_t(int, nr_pages, ARRAY_SIZE(pages)), | ||
| 199 | gup_flags, pages, NULL); | ||
| 200 | /* | ||
| 201 | * This can happen for, e.g., VM_NONLINEAR regions before | ||
| 202 | * a page has been allocated and mapped at a given offset, | ||
| 203 | * or for addresses that map beyond end of a file. | ||
| 204 | * We'll mlock the pages if/when they get faulted in. | ||
| 205 | */ | ||
| 206 | if (ret < 0) | ||
| 207 | break; | ||
| 208 | if (ret == 0) { | ||
| 209 | /* | ||
| 210 | * We know the vma is there, so the only time | ||
| 211 | * we cannot get a single page should be an | ||
| 212 | * error (ret < 0) case. | ||
| 213 | */ | ||
| 214 | WARN_ON(1); | ||
| 215 | break; | ||
| 216 | } | ||
| 217 | |||
| 218 | lru_add_drain(); /* push cached pages to LRU */ | ||
| 219 | |||
| 220 | for (i = 0; i < ret; i++) { | ||
| 221 | struct page *page = pages[i]; | ||
| 222 | |||
| 223 | lock_page(page); | ||
| 224 | /* | ||
| 225 | * Because we lock the page here and migration is blocked | ||
| 226 | * by the elevated reference, we need only check for | ||
| 227 | * page truncation (file-cache only). | ||
| 228 | */ | ||
| 229 | if (page->mapping) { | ||
| 230 | if (mlock) | ||
| 231 | mlock_vma_page(page); | ||
| 232 | else | ||
| 233 | munlock_vma_page(page); | ||
| 234 | } | ||
| 235 | unlock_page(page); | ||
| 236 | put_page(page); /* ref from get_user_pages() */ | ||
| 237 | |||
| 238 | /* | ||
| 239 | * here we assume that get_user_pages() has given us | ||
| 240 | * a list of virtually contiguous pages. | ||
| 241 | */ | ||
| 242 | addr += PAGE_SIZE; /* for next get_user_pages() */ | ||
| 243 | nr_pages--; | ||
| 244 | } | ||
| 245 | ret = 0; | ||
| 246 | } | ||
| 247 | |||
| 248 | return ret; /* count entire vma as locked_vm */ | ||
| 249 | } | ||
| 250 | |||
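The GUP_FLAGS_* values used by __mlock_vma_pages_range() above are assumed to be plain bit flags added to mm/internal.h by this series; the exact values below are illustrative:

#define GUP_FLAGS_WRITE				0x1
#define GUP_FLAGS_FORCE				0x2
#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS	0x4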
| 251 | /* | ||
| 252 | * convert get_user_pages() return value to posix mlock() error | ||
| 253 | */ | ||
| 254 | static int __mlock_posix_error_return(long retval) | ||
| 255 | { | ||
| 256 | if (retval == -EFAULT) | ||
| 257 | retval = -ENOMEM; | ||
| 258 | else if (retval == -ENOMEM) | ||
| 259 | retval = -EAGAIN; | ||
| 260 | return retval; | ||
| 261 | } | ||
| 262 | |||
| 263 | #else /* CONFIG_UNEVICTABLE_LRU */ | ||
| 264 | |||
| 265 | /* | ||
| 266 | * Just make pages present if VM_LOCKED. No-op if unlocking. | ||
| 267 | */ | ||
| 268 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | ||
| 269 | unsigned long start, unsigned long end, | ||
| 270 | int mlock) | ||
| 271 | { | ||
| 272 | if (mlock && (vma->vm_flags & VM_LOCKED)) | ||
| 273 | return make_pages_present(start, end); | ||
| 274 | return 0; | ||
| 275 | } | ||
| 276 | |||
| 277 | static inline int __mlock_posix_error_return(long retval) | ||
| 278 | { | ||
| 279 | return 0; | ||
| 280 | } | ||
| 281 | |||
| 282 | #endif /* CONFIG_UNEVICTABLE_LRU */ | ||
| 283 | |||
| 284 | /** | ||
| 285 | * mlock_vma_pages_range() - mlock pages in specified vma range. | ||
| 286 | * @vma - the vma containing the specified address range | ||
| 287 | * @start - starting address in @vma to mlock | ||
| 288 | * @end - end address [+1] in @vma to mlock | ||
| 289 | * | ||
| 290 | * For mmap()/mremap()/expansion of mlocked vma. | ||
| 291 | * | ||
| 292 | * return 0 on success for "normal" vmas. | ||
| 293 | * | ||
| 294 | * return number of pages [> 0] to be removed from locked_vm on success | ||
| 295 | * of "special" vmas. | ||
| 296 | * | ||
| 297 | * return negative error if the vma spanning @start-@end disappears while | ||
| 298 | * mmap semaphore is dropped. Unlikely? | ||
| 299 | */ | ||
| 300 | long mlock_vma_pages_range(struct vm_area_struct *vma, | ||
| 301 | unsigned long start, unsigned long end) | ||
| 302 | { | ||
| 303 | struct mm_struct *mm = vma->vm_mm; | ||
| 304 | int nr_pages = (end - start) / PAGE_SIZE; | ||
| 305 | BUG_ON(!(vma->vm_flags & VM_LOCKED)); | ||
| 306 | |||
| 307 | /* | ||
| 308 | * filter unlockable vmas | ||
| 309 | */ | ||
| 310 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | ||
| 311 | goto no_mlock; | ||
| 312 | |||
| 313 | if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || | ||
| 314 | is_vm_hugetlb_page(vma) || | ||
| 315 | vma == get_gate_vma(current))) { | ||
| 316 | long error; | ||
| 317 | downgrade_write(&mm->mmap_sem); | ||
| 318 | |||
| 319 | error = __mlock_vma_pages_range(vma, start, end, 1); | ||
| 320 | |||
| 321 | up_read(&mm->mmap_sem); | ||
| 322 | /* vma can change or disappear */ | ||
| 323 | down_write(&mm->mmap_sem); | ||
| 324 | vma = find_vma(mm, start); | ||
| 325 | /* non-NULL vma must contain @start, but need to check @end */ | ||
| 326 | if (!vma || end > vma->vm_end) | ||
| 327 | return -ENOMEM; | ||
| 328 | |||
| 329 | return 0; /* hide other errors from mmap(), et al */ | ||
| 330 | } | ||
| 331 | |||
| 332 | /* | ||
| 333 | * User mapped kernel pages or huge pages: | ||
| 334 | * make these pages present to populate the ptes, but | ||
| 335 | * fall through to reset VM_LOCKED--no need to unlock, and | ||
| 336 | * return nr_pages so these don't get counted against the task's | ||
| 337 | * locked limit. Huge pages are already counted against the | ||
| 338 | * locked vm limit. | ||
| 339 | */ | ||
| 340 | make_pages_present(start, end); | ||
| 341 | |||
| 342 | no_mlock: | ||
| 343 | vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */ | ||
| 344 | return nr_pages; /* error or pages NOT mlocked */ | ||
| 345 | } | ||
| 346 | |||
| 347 | |||
| 348 | /* | ||
| 349 | * munlock_vma_pages_range() - munlock all pages in the vma range. | ||
| 350 | * @vma - vma containing range to be munlock()ed. | ||
| 351 | * @start - start address in @vma of the range | ||
| 352 | * @end - end of range in @vma. | ||
| 353 | * | ||
| 354 | * For mremap(), munmap() and exit(). | ||
| 355 | * | ||
| 356 | * Called with @vma VM_LOCKED. | ||
| 357 | * | ||
| 358 | * Returns with VM_LOCKED cleared. Callers must be prepared to | ||
| 359 | * deal with this. | ||
| 360 | * | ||
| 361 | * We don't save and restore VM_LOCKED here because pages are | ||
| 362 | * still on the LRU. In the unmap path, pages might be scanned by reclaim | ||
| 363 | * and re-mlocked by try_to_{munlock|unmap} before we unmap and | ||
| 364 | * free them. This will result in freeing mlocked pages. | ||
| 365 | */ | ||
| 366 | void munlock_vma_pages_range(struct vm_area_struct *vma, | ||
| 367 | unsigned long start, unsigned long end) | ||
| 368 | { | ||
| 369 | vma->vm_flags &= ~VM_LOCKED; | ||
| 370 | __mlock_vma_pages_range(vma, start, end, 0); | ||
| 371 | } | ||
| 372 | |||
| 373 | /* | ||
| 374 | * mlock_fixup - handle mlock[all]/munlock[all] requests. | ||
| 375 | * | ||
| 376 | * Filters out "special" vmas -- VM_LOCKED never gets set for these, and | ||
| 377 | * munlock is a no-op. However, for some special vmas, we go ahead and | ||
| 378 | * populate the ptes via make_pages_present(). | ||
| 379 | * | ||
| 380 | * For vmas that pass the filters, merge/split as appropriate. | ||
| 381 | */ | ||
| 26 | static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | 382 | static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, |
| 27 | unsigned long start, unsigned long end, unsigned int newflags) | 383 | unsigned long start, unsigned long end, unsigned int newflags) |
| 28 | { | 384 | { |
| 29 | struct mm_struct * mm = vma->vm_mm; | 385 | struct mm_struct *mm = vma->vm_mm; |
| 30 | pgoff_t pgoff; | 386 | pgoff_t pgoff; |
| 31 | int pages; | 387 | int nr_pages; |
| 32 | int ret = 0; | 388 | int ret = 0; |
| 33 | 389 | int lock = newflags & VM_LOCKED; | |
| 34 | if (newflags == vma->vm_flags) { | 390 | |
| 35 | *prev = vma; | 391 | if (newflags == vma->vm_flags || |
| 36 | goto out; | 392 | (vma->vm_flags & (VM_IO | VM_PFNMAP))) |
| 393 | goto out; /* don't set VM_LOCKED, don't count */ | ||
| 394 | |||
| 395 | if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || | ||
| 396 | is_vm_hugetlb_page(vma) || | ||
| 397 | vma == get_gate_vma(current)) { | ||
| 398 | if (lock) | ||
| 399 | make_pages_present(start, end); | ||
| 400 | goto out; /* don't set VM_LOCKED, don't count */ | ||
| 37 | } | 401 | } |
| 38 | 402 | ||
| 39 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | 403 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
| @@ -44,8 +408,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
| 44 | goto success; | 408 | goto success; |
| 45 | } | 409 | } |
| 46 | 410 | ||
| 47 | *prev = vma; | ||
| 48 | |||
| 49 | if (start != vma->vm_start) { | 411 | if (start != vma->vm_start) { |
| 50 | ret = split_vma(mm, vma, start, 1); | 412 | ret = split_vma(mm, vma, start, 1); |
| 51 | if (ret) | 413 | if (ret) |
| @@ -60,24 +422,61 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
| 60 | 422 | ||
| 61 | success: | 423 | success: |
| 62 | /* | 424 | /* |
| 425 | * Keep track of amount of locked VM. | ||
| 426 | */ | ||
| 427 | nr_pages = (end - start) >> PAGE_SHIFT; | ||
| 428 | if (!lock) | ||
| 429 | nr_pages = -nr_pages; | ||
| 430 | mm->locked_vm += nr_pages; | ||
| 431 | |||
| 432 | /* | ||
| 63 | * vm_flags is protected by the mmap_sem held in write mode. | 433 | * vm_flags is protected by the mmap_sem held in write mode. |
| 64 | * It's okay if try_to_unmap_one unmaps a page just after we | 434 | * It's okay if try_to_unmap_one unmaps a page just after we |
| 65 | * set VM_LOCKED, make_pages_present below will bring it back. | 435 | * set VM_LOCKED, __mlock_vma_pages_range will bring it back. |
| 66 | */ | 436 | */ |
| 67 | vma->vm_flags = newflags; | 437 | vma->vm_flags = newflags; |
| 68 | 438 | ||
| 69 | /* | 439 | if (lock) { |
| 70 | * Keep track of amount of locked VM. | 440 | /* |
| 71 | */ | 441 | * mmap_sem is currently held for write. Downgrade the write |
| 72 | pages = (end - start) >> PAGE_SHIFT; | 442 | * lock to a read lock so that other faults, mmap scans, ... |
| 73 | if (newflags & VM_LOCKED) { | 443 | * while we fault in all pages. |
| 74 | pages = -pages; | 444 | */ |
| 75 | if (!(newflags & VM_IO)) | 445 | downgrade_write(&mm->mmap_sem); |
| 76 | ret = make_pages_present(start, end); | 446 | |
| 447 | ret = __mlock_vma_pages_range(vma, start, end, 1); | ||
| 448 | |||
| 449 | /* | ||
| 450 | * Need to reacquire mmap sem in write mode, as our callers | ||
| 451 | * expect this. We have no support for atomically upgrading | ||
| 452 | * a sem to write, so we need to check for ranges while sem | ||
| 453 | * is unlocked. | ||
| 454 | */ | ||
| 455 | up_read(&mm->mmap_sem); | ||
| 456 | /* vma can change or disappear */ | ||
| 457 | down_write(&mm->mmap_sem); | ||
| 458 | *prev = find_vma(mm, start); | ||
| 459 | /* non-NULL *prev must contain @start, but need to check @end */ | ||
| 460 | if (!(*prev) || end > (*prev)->vm_end) | ||
| 461 | ret = -ENOMEM; | ||
| 462 | else if (ret > 0) { | ||
| 463 | mm->locked_vm -= ret; | ||
| 464 | ret = 0; | ||
| 465 | } else | ||
| 466 | ret = __mlock_posix_error_return(ret); /* translate if needed */ | ||
| 467 | } else { | ||
| 468 | /* | ||
| 469 | * TODO: for unlocking, pages will already be resident, so | ||
| 470 | * we don't need to wait for allocations/reclaim/pagein, ... | ||
| 471 | * However, unlocking a very large region can still take a | ||
| 472 | * while. Should we downgrade the semaphore for both lock | ||
| 473 | * AND unlock? | ||
| 474 | */ | ||
| 475 | __mlock_vma_pages_range(vma, start, end, 0); | ||
| 77 | } | 476 | } |
| 78 | 477 | ||
| 79 | mm->locked_vm -= pages; | ||
| 80 | out: | 478 | out: |
| 479 | *prev = vma; | ||
| 81 | return ret; | 480 | return ret; |
| 82 | } | 481 | } |
| 83 | 482 | ||
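The locking dance mlock_fixup() now performs is easy to lose in the diff; a condensed sketch of the pattern (hypothetical helper name, same primitives as above):

static long lock_range_dropping_to_read(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	long ret;

	downgrade_write(&mm->mmap_sem);		/* write -> read, atomically */
	ret = __mlock_vma_pages_range(vma, start, end, 1);
	up_read(&mm->mmap_sem);

	down_write(&mm->mmap_sem);		/* callers expect the write lock back */
	vma = find_vma(mm, start);		/* the vma may have changed or gone */
	if (!vma || end > vma->vm_end)
		return -ENOMEM;
	return ret;
}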
| @@ -139,6 +538,8 @@ asmlinkage long sys_mlock(unsigned long start, size_t len) | |||
| 139 | if (!can_do_mlock()) | 538 | if (!can_do_mlock()) |
| 140 | return -EPERM; | 539 | return -EPERM; |
| 141 | 540 | ||
| 541 | lru_add_drain_all(); /* flush pagevec */ | ||
| 542 | |||
| 142 | down_write(¤t->mm->mmap_sem); | 543 | down_write(¤t->mm->mmap_sem); |
| 143 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); | 544 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); |
| 144 | start &= PAGE_MASK; | 545 | start &= PAGE_MASK; |
| @@ -205,6 +606,8 @@ asmlinkage long sys_mlockall(int flags) | |||
| 205 | if (!can_do_mlock()) | 606 | if (!can_do_mlock()) |
| 206 | goto out; | 607 | goto out; |
| 207 | 608 | ||
| 609 | lru_add_drain_all(); /* flush pagevec */ | ||
| 610 | |||
| 208 | down_write(¤t->mm->mmap_sem); | 611 | down_write(¤t->mm->mmap_sem); |
| 209 | 612 | ||
| 210 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 613 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; |
| @@ -175,7 +175,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
| 175 | 175 | ||
| 176 | /* Don't let a single process grow too big: | 176 | /* Don't let a single process grow too big: |
| 177 | leave 3% of the size of this process for other processes */ | 177 | leave 3% of the size of this process for other processes */ |
| 178 | allowed -= mm->total_vm / 32; | 178 | if (mm) |
| 179 | allowed -= mm->total_vm / 32; | ||
| 179 | 180 | ||
| 180 | /* | 181 | /* |
| 181 | * cast `allowed' as a signed long because vm_committed_space | 182 | * cast `allowed' as a signed long because vm_committed_space |
| @@ -410,7 +411,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 410 | rb_insert_color(&vma->vm_rb, &mm->mm_rb); | 411 | rb_insert_color(&vma->vm_rb, &mm->mm_rb); |
| 411 | } | 412 | } |
| 412 | 413 | ||
| 413 | static inline void __vma_link_file(struct vm_area_struct *vma) | 414 | static void __vma_link_file(struct vm_area_struct *vma) |
| 414 | { | 415 | { |
| 415 | struct file * file; | 416 | struct file * file; |
| 416 | 417 | ||
| @@ -662,8 +663,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 662 | * If the vma has a ->close operation then the driver probably needs to release | 663 | * If the vma has a ->close operation then the driver probably needs to release |
| 663 | * per-vma resources, so we don't attempt to merge those. | 664 | * per-vma resources, so we don't attempt to merge those. |
| 664 | */ | 665 | */ |
| 665 | #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) | ||
| 666 | |||
| 667 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 666 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
| 668 | struct file *file, unsigned long vm_flags) | 667 | struct file *file, unsigned long vm_flags) |
| 669 | { | 668 | { |
| @@ -972,6 +971,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
| 972 | return -EPERM; | 971 | return -EPERM; |
| 973 | vm_flags |= VM_LOCKED; | 972 | vm_flags |= VM_LOCKED; |
| 974 | } | 973 | } |
| 974 | |||
| 975 | /* mlock MCL_FUTURE? */ | 975 | /* mlock MCL_FUTURE? */ |
| 976 | if (vm_flags & VM_LOCKED) { | 976 | if (vm_flags & VM_LOCKED) { |
| 977 | unsigned long locked, lock_limit; | 977 | unsigned long locked, lock_limit; |
| @@ -1139,10 +1139,12 @@ munmap_back: | |||
| 1139 | * The VM_SHARED test is necessary because shmem_zero_setup | 1139 | * The VM_SHARED test is necessary because shmem_zero_setup |
| 1140 | * will create the file object for a shared anonymous map below. | 1140 | * will create the file object for a shared anonymous map below. |
| 1141 | */ | 1141 | */ |
| 1142 | if (!file && !(vm_flags & VM_SHARED) && | 1142 | if (!file && !(vm_flags & VM_SHARED)) { |
| 1143 | vma_merge(mm, prev, addr, addr + len, vm_flags, | 1143 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, |
| 1144 | NULL, NULL, pgoff, NULL)) | 1144 | NULL, NULL, pgoff, NULL); |
| 1145 | goto out; | 1145 | if (vma) |
| 1146 | goto out; | ||
| 1147 | } | ||
| 1146 | 1148 | ||
| 1147 | /* | 1149 | /* |
| 1148 | * Determine the object being mapped and call the appropriate | 1150 | * Determine the object being mapped and call the appropriate |
| @@ -1224,10 +1226,14 @@ out: | |||
| 1224 | mm->total_vm += len >> PAGE_SHIFT; | 1226 | mm->total_vm += len >> PAGE_SHIFT; |
| 1225 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1227 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
| 1226 | if (vm_flags & VM_LOCKED) { | 1228 | if (vm_flags & VM_LOCKED) { |
| 1227 | mm->locked_vm += len >> PAGE_SHIFT; | 1229 | /* |
| 1228 | make_pages_present(addr, addr + len); | 1230 | * makes pages present; downgrades, drops, reacquires mmap_sem |
| 1229 | } | 1231 | */ |
| 1230 | if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | 1232 | long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); |
| 1233 | if (nr_pages < 0) | ||
| 1234 | return nr_pages; /* vma gone! */ | ||
| 1235 | mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; | ||
| 1236 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | ||
| 1231 | make_pages_present(addr, addr + len); | 1237 | make_pages_present(addr, addr + len); |
| 1232 | return addr; | 1238 | return addr; |
| 1233 | 1239 | ||
| @@ -1586,7 +1592,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un | |||
| 1586 | * vma is the last one with address > vma->vm_end. Have to extend vma. | 1592 | * vma is the last one with address > vma->vm_end. Have to extend vma. |
| 1587 | */ | 1593 | */ |
| 1588 | #ifndef CONFIG_IA64 | 1594 | #ifndef CONFIG_IA64 |
| 1589 | static inline | 1595 | static |
| 1590 | #endif | 1596 | #endif |
| 1591 | int expand_upwards(struct vm_area_struct *vma, unsigned long address) | 1597 | int expand_upwards(struct vm_area_struct *vma, unsigned long address) |
| 1592 | { | 1598 | { |
| @@ -1636,7 +1642,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
| 1636 | /* | 1642 | /* |
| 1637 | * vma is the first one with address < vma->vm_start. Have to extend vma. | 1643 | * vma is the first one with address < vma->vm_start. Have to extend vma. |
| 1638 | */ | 1644 | */ |
| 1639 | static inline int expand_downwards(struct vm_area_struct *vma, | 1645 | static int expand_downwards(struct vm_area_struct *vma, |
| 1640 | unsigned long address) | 1646 | unsigned long address) |
| 1641 | { | 1647 | { |
| 1642 | int error; | 1648 | int error; |
| @@ -1700,8 +1706,10 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
| 1700 | return vma; | 1706 | return vma; |
| 1701 | if (!prev || expand_stack(prev, addr)) | 1707 | if (!prev || expand_stack(prev, addr)) |
| 1702 | return NULL; | 1708 | return NULL; |
| 1703 | if (prev->vm_flags & VM_LOCKED) | 1709 | if (prev->vm_flags & VM_LOCKED) { |
| 1704 | make_pages_present(addr, prev->vm_end); | 1710 | if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) |
| 1711 | return NULL; /* vma gone! */ | ||
| 1712 | } | ||
| 1705 | return prev; | 1713 | return prev; |
| 1706 | } | 1714 | } |
| 1707 | #else | 1715 | #else |
| @@ -1727,8 +1735,10 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
| 1727 | start = vma->vm_start; | 1735 | start = vma->vm_start; |
| 1728 | if (expand_stack(vma, addr)) | 1736 | if (expand_stack(vma, addr)) |
| 1729 | return NULL; | 1737 | return NULL; |
| 1730 | if (vma->vm_flags & VM_LOCKED) | 1738 | if (vma->vm_flags & VM_LOCKED) { |
| 1731 | make_pages_present(addr, start); | 1739 | if (mlock_vma_pages_range(vma, addr, start) < 0) |
| 1740 | return NULL; /* vma gone! */ | ||
| 1741 | } | ||
| 1732 | return vma; | 1742 | return vma; |
| 1733 | } | 1743 | } |
| 1734 | #endif | 1744 | #endif |
| @@ -1747,8 +1757,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) | |||
| 1747 | long nrpages = vma_pages(vma); | 1757 | long nrpages = vma_pages(vma); |
| 1748 | 1758 | ||
| 1749 | mm->total_vm -= nrpages; | 1759 | mm->total_vm -= nrpages; |
| 1750 | if (vma->vm_flags & VM_LOCKED) | ||
| 1751 | mm->locked_vm -= nrpages; | ||
| 1752 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); | 1760 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); |
| 1753 | vma = remove_vma(vma); | 1761 | vma = remove_vma(vma); |
| 1754 | } while (vma); | 1762 | } while (vma); |
| @@ -1914,6 +1922,20 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
| 1914 | vma = prev? prev->vm_next: mm->mmap; | 1922 | vma = prev? prev->vm_next: mm->mmap; |
| 1915 | 1923 | ||
| 1916 | /* | 1924 | /* |
| 1925 | * unlock any mlock()ed ranges before detaching vmas | ||
| 1926 | */ | ||
| 1927 | if (mm->locked_vm) { | ||
| 1928 | struct vm_area_struct *tmp = vma; | ||
| 1929 | while (tmp && tmp->vm_start < end) { | ||
| 1930 | if (tmp->vm_flags & VM_LOCKED) { | ||
| 1931 | mm->locked_vm -= vma_pages(tmp); | ||
| 1932 | munlock_vma_pages_all(tmp); | ||
| 1933 | } | ||
| 1934 | tmp = tmp->vm_next; | ||
| 1935 | } | ||
| 1936 | } | ||
| 1937 | |||
| 1938 | /* | ||
| 1917 | * Remove the vma's, and unmap the actual pages | 1939 | * Remove the vma's, and unmap the actual pages |
| 1918 | */ | 1940 | */ |
| 1919 | detach_vmas_to_be_unmapped(mm, vma, prev, end); | 1941 | detach_vmas_to_be_unmapped(mm, vma, prev, end); |
| @@ -2025,8 +2047,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
| 2025 | return -ENOMEM; | 2047 | return -ENOMEM; |
| 2026 | 2048 | ||
| 2027 | /* Can we just expand an old private anonymous mapping? */ | 2049 | /* Can we just expand an old private anonymous mapping? */ |
| 2028 | if (vma_merge(mm, prev, addr, addr + len, flags, | 2050 | vma = vma_merge(mm, prev, addr, addr + len, flags, |
| 2029 | NULL, NULL, pgoff, NULL)) | 2051 | NULL, NULL, pgoff, NULL); |
| 2052 | if (vma) | ||
| 2030 | goto out; | 2053 | goto out; |
| 2031 | 2054 | ||
| 2032 | /* | 2055 | /* |
| @@ -2048,8 +2071,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
| 2048 | out: | 2071 | out: |
| 2049 | mm->total_vm += len >> PAGE_SHIFT; | 2072 | mm->total_vm += len >> PAGE_SHIFT; |
| 2050 | if (flags & VM_LOCKED) { | 2073 | if (flags & VM_LOCKED) { |
| 2051 | mm->locked_vm += len >> PAGE_SHIFT; | 2074 | if (!mlock_vma_pages_range(vma, addr, addr + len)) |
| 2052 | make_pages_present(addr, addr + len); | 2075 | mm->locked_vm += (len >> PAGE_SHIFT); |
| 2053 | } | 2076 | } |
| 2054 | return addr; | 2077 | return addr; |
| 2055 | } | 2078 | } |
| @@ -2060,7 +2083,7 @@ EXPORT_SYMBOL(do_brk); | |||
| 2060 | void exit_mmap(struct mm_struct *mm) | 2083 | void exit_mmap(struct mm_struct *mm) |
| 2061 | { | 2084 | { |
| 2062 | struct mmu_gather *tlb; | 2085 | struct mmu_gather *tlb; |
| 2063 | struct vm_area_struct *vma = mm->mmap; | 2086 | struct vm_area_struct *vma; |
| 2064 | unsigned long nr_accounted = 0; | 2087 | unsigned long nr_accounted = 0; |
| 2065 | unsigned long end; | 2088 | unsigned long end; |
| 2066 | 2089 | ||
| @@ -2068,6 +2091,15 @@ void exit_mmap(struct mm_struct *mm) | |||
| 2068 | arch_exit_mmap(mm); | 2091 | arch_exit_mmap(mm); |
| 2069 | mmu_notifier_release(mm); | 2092 | mmu_notifier_release(mm); |
| 2070 | 2093 | ||
| 2094 | if (mm->locked_vm) { | ||
| 2095 | vma = mm->mmap; | ||
| 2096 | while (vma) { | ||
| 2097 | if (vma->vm_flags & VM_LOCKED) | ||
| 2098 | munlock_vma_pages_all(vma); | ||
| 2099 | vma = vma->vm_next; | ||
| 2100 | } | ||
| 2101 | } | ||
| 2102 | vma = mm->mmap; | ||
| 2071 | lru_add_drain(); | 2103 | lru_add_drain(); |
| 2072 | flush_cache_mm(mm); | 2104 | flush_cache_mm(mm); |
| 2073 | tlb = tlb_gather_mmu(mm, 1); | 2105 | tlb = tlb_gather_mmu(mm, 1); |
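munlock_vma_pages_all(), called from the do_munmap() and exit_mmap() hunks above, is assumed to be the trivial wrapper this series adds to mm/internal.h, roughly:

static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
{
	munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
}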
diff --git a/mm/mremap.c b/mm/mremap.c index 1a7743923c8c..58a2908f42f5 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -24,6 +24,8 @@ | |||
| 24 | #include <asm/cacheflush.h> | 24 | #include <asm/cacheflush.h> |
| 25 | #include <asm/tlbflush.h> | 25 | #include <asm/tlbflush.h> |
| 26 | 26 | ||
| 27 | #include "internal.h" | ||
| 28 | |||
| 27 | static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) | 29 | static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) |
| 28 | { | 30 | { |
| 29 | pgd_t *pgd; | 31 | pgd_t *pgd; |
| @@ -238,8 +240,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
| 238 | if (vm_flags & VM_LOCKED) { | 240 | if (vm_flags & VM_LOCKED) { |
| 239 | mm->locked_vm += new_len >> PAGE_SHIFT; | 241 | mm->locked_vm += new_len >> PAGE_SHIFT; |
| 240 | if (new_len > old_len) | 242 | if (new_len > old_len) |
| 241 | make_pages_present(new_addr + old_len, | 243 | mlock_vma_pages_range(new_vma, new_addr + old_len, |
| 242 | new_addr + new_len); | 244 | new_addr + new_len); |
| 243 | } | 245 | } |
| 244 | 246 | ||
| 245 | return new_addr; | 247 | return new_addr; |
| @@ -379,7 +381,7 @@ unsigned long do_mremap(unsigned long addr, | |||
| 379 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); | 381 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
| 380 | if (vma->vm_flags & VM_LOCKED) { | 382 | if (vma->vm_flags & VM_LOCKED) { |
| 381 | mm->locked_vm += pages; | 383 | mm->locked_vm += pages; |
| 382 | make_pages_present(addr + old_len, | 384 | mlock_vma_pages_range(vma, addr + old_len, |
| 383 | addr + new_len); | 385 | addr + new_len); |
| 384 | } | 386 | } |
| 385 | ret = addr; | 387 | ret = addr; |
diff --git a/mm/nommu.c b/mm/nommu.c index ed75bc962fbe..7695dc850785 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -34,6 +34,8 @@ | |||
| 34 | #include <asm/tlb.h> | 34 | #include <asm/tlb.h> |
| 35 | #include <asm/tlbflush.h> | 35 | #include <asm/tlbflush.h> |
| 36 | 36 | ||
| 37 | #include "internal.h" | ||
| 38 | |||
| 37 | void *high_memory; | 39 | void *high_memory; |
| 38 | struct page *mem_map; | 40 | struct page *mem_map; |
| 39 | unsigned long max_mapnr; | 41 | unsigned long max_mapnr; |
| @@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp) | |||
| 128 | return PAGE_SIZE << compound_order(page); | 130 | return PAGE_SIZE << compound_order(page); |
| 129 | } | 131 | } |
| 130 | 132 | ||
| 131 | /* | 133 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 132 | * get a list of pages in an address range belonging to the specified process | 134 | unsigned long start, int len, int flags, |
| 133 | * and indicate the VMA that covers each page | 135 | struct page **pages, struct vm_area_struct **vmas) |
| 134 | * - this is potentially dodgy as we may end incrementing the page count of a | ||
| 135 | * slab page or a secondary page from a compound page | ||
| 136 | * - don't permit access to VMAs that don't support it, such as I/O mappings | ||
| 137 | */ | ||
| 138 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
| 139 | unsigned long start, int len, int write, int force, | ||
| 140 | struct page **pages, struct vm_area_struct **vmas) | ||
| 141 | { | 136 | { |
| 142 | struct vm_area_struct *vma; | 137 | struct vm_area_struct *vma; |
| 143 | unsigned long vm_flags; | 138 | unsigned long vm_flags; |
| 144 | int i; | 139 | int i; |
| 140 | int write = !!(flags & GUP_FLAGS_WRITE); | ||
| 141 | int force = !!(flags & GUP_FLAGS_FORCE); | ||
| 142 | int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); | ||
| 145 | 143 | ||
| 146 | /* calculate required read or write permissions. | 144 | /* calculate required read or write permissions. |
| 147 | * - if 'force' is set, we only require the "MAY" flags. | 145 | * - if 'force' is set, we only require the "MAY" flags. |
| @@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 156 | 154 | ||
| 157 | /* protect what we can, including chardevs */ | 155 | /* protect what we can, including chardevs */ |
| 158 | if (vma->vm_flags & (VM_IO | VM_PFNMAP) || | 156 | if (vma->vm_flags & (VM_IO | VM_PFNMAP) || |
| 159 | !(vm_flags & vma->vm_flags)) | 157 | (!ignore && !(vm_flags & vma->vm_flags))) |
| 160 | goto finish_or_fault; | 158 | goto finish_or_fault; |
| 161 | 159 | ||
| 162 | if (pages) { | 160 | if (pages) { |
| @@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 174 | finish_or_fault: | 172 | finish_or_fault: |
| 175 | return i ? : -EFAULT; | 173 | return i ? : -EFAULT; |
| 176 | } | 174 | } |
| 175 | |||
| 176 | |||
| 177 | /* | ||
| 178 | * get a list of pages in an address range belonging to the specified process | ||
| 179 | * and indicate the VMA that covers each page | ||
| 180 | * - this is potentially dodgy as we may end up incrementing the page count of a | ||
| 181 | * slab page or a secondary page from a compound page | ||
| 182 | * - don't permit access to VMAs that don't support it, such as I/O mappings | ||
| 183 | */ | ||
| 184 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
| 185 | unsigned long start, int len, int write, int force, | ||
| 186 | struct page **pages, struct vm_area_struct **vmas) | ||
| 187 | { | ||
| 188 | int flags = 0; | ||
| 189 | |||
| 190 | if (write) | ||
| 191 | flags |= GUP_FLAGS_WRITE; | ||
| 192 | if (force) | ||
| 193 | flags |= GUP_FLAGS_FORCE; | ||
| 194 | |||
| 195 | return __get_user_pages(tsk, mm, | ||
| 196 | start, len, flags, | ||
| 197 | pages, vmas); | ||
| 198 | } | ||
| 177 | EXPORT_SYMBOL(get_user_pages); | 199 | EXPORT_SYMBOL(get_user_pages); |
| 178 | 200 | ||
| 179 | DEFINE_RWLOCK(vmlist_lock); | 201 | DEFINE_RWLOCK(vmlist_lock); |
| @@ -1432,7 +1454,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
| 1432 | 1454 | ||
| 1433 | /* Don't let a single process grow too big: | 1455 | /* Don't let a single process grow too big: |
| 1434 | leave 3% of the size of this process for other processes */ | 1456 | leave 3% of the size of this process for other processes */ |
| 1435 | allowed -= current->mm->total_vm / 32; | 1457 | if (mm) |
| 1458 | allowed -= mm->total_vm / 32; | ||
| 1436 | 1459 | ||
| 1437 | /* | 1460 | /* |
| 1438 | * cast `allowed' as a signed long because vm_committed_space | 1461 | * cast `allowed' as a signed long because vm_committed_space |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 64e5b4bcd964..a0a01902f551 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -38,7 +38,6 @@ static DEFINE_SPINLOCK(zone_scan_mutex); | |||
| 38 | * badness - calculate a numeric value for how bad this task has been | 38 | * badness - calculate a numeric value for how bad this task has been |
| 39 | * @p: task struct of which task we should calculate | 39 | * @p: task struct of which task we should calculate |
| 40 | * @uptime: current uptime in seconds | 40 | * @uptime: current uptime in seconds |
| 41 | * @mem: target memory controller | ||
| 42 | * | 41 | * |
| 43 | * The formula used is relatively simple and documented inline in the | 42 | * The formula used is relatively simple and documented inline in the |
| 44 | * function. The main rationale is that we want to select a good task | 43 | * function. The main rationale is that we want to select a good task |
| @@ -295,6 +294,8 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
| 295 | continue; | 294 | continue; |
| 296 | if (mem && !task_in_mem_cgroup(p, mem)) | 295 | if (mem && !task_in_mem_cgroup(p, mem)) |
| 297 | continue; | 296 | continue; |
| 297 | if (!thread_group_leader(p)) | ||
| 298 | continue; | ||
| 298 | 299 | ||
| 299 | task_lock(p); | 300 | task_lock(p); |
| 300 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", | 301 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 24de8b65fdbd..2970e35fd03f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -7,7 +7,7 @@ | |||
| 7 | * Contains functions related to writing back dirty pages at the | 7 | * Contains functions related to writing back dirty pages at the |
| 8 | * address_space level. | 8 | * address_space level. |
| 9 | * | 9 | * |
| 10 | * 10Apr2002 akpm@zip.com.au | 10 | * 10Apr2002 Andrew Morton |
| 11 | * Initial version | 11 | * Initial version |
| 12 | */ | 12 | */ |
| 13 | 13 | ||
| @@ -329,9 +329,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
| 329 | struct zone *z = | 329 | struct zone *z = |
| 330 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; | 330 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; |
| 331 | 331 | ||
| 332 | x += zone_page_state(z, NR_FREE_PAGES) | 332 | x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); |
| 333 | + zone_page_state(z, NR_INACTIVE) | ||
| 334 | + zone_page_state(z, NR_ACTIVE); | ||
| 335 | } | 333 | } |
| 336 | /* | 334 | /* |
| 337 | * Make sure that the number of highmem pages is never larger | 335 | * Make sure that the number of highmem pages is never larger |
| @@ -355,9 +353,7 @@ unsigned long determine_dirtyable_memory(void) | |||
| 355 | { | 353 | { |
| 356 | unsigned long x; | 354 | unsigned long x; |
| 357 | 355 | ||
| 358 | x = global_page_state(NR_FREE_PAGES) | 356 | x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); |
| 359 | + global_page_state(NR_INACTIVE) | ||
| 360 | + global_page_state(NR_ACTIVE); | ||
| 361 | 357 | ||
| 362 | if (!vm_highmem_is_dirtyable) | 358 | if (!vm_highmem_is_dirtyable) |
| 363 | x -= highmem_dirtyable_memory(x); | 359 | x -= highmem_dirtyable_memory(x); |
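Both hunks above replace the explicit NR_ACTIVE + NR_INACTIVE sums with zone_lru_pages()/global_lru_pages(), which are defined elsewhere in this series (the vmscan side is not shown here). Presumably they simply total the per-LRU counters that replace the old pair; a sketch under that assumption:

    /* Assumed shape of the helper used above: sum the evictable LRU
     * counters introduced by the anon/file LRU split.  Unevictable pages
     * are deliberately not counted as dirtyable memory.
     */
    static unsigned long global_lru_pages_sketch(void)
    {
            return global_page_state(NR_ACTIVE_ANON)
                    + global_page_state(NR_INACTIVE_ANON)
                    + global_page_state(NR_ACTIVE_FILE)
                    + global_page_state(NR_INACTIVE_FILE);
    }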
| @@ -876,6 +872,7 @@ int write_cache_pages(struct address_space *mapping, | |||
| 876 | pgoff_t end; /* Inclusive */ | 872 | pgoff_t end; /* Inclusive */ |
| 877 | int scanned = 0; | 873 | int scanned = 0; |
| 878 | int range_whole = 0; | 874 | int range_whole = 0; |
| 875 | long nr_to_write = wbc->nr_to_write; | ||
| 879 | 876 | ||
| 880 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | 877 | if (wbc->nonblocking && bdi_write_congested(bdi)) { |
| 881 | wbc->encountered_congestion = 1; | 878 | wbc->encountered_congestion = 1; |
| @@ -939,7 +936,7 @@ retry: | |||
| 939 | unlock_page(page); | 936 | unlock_page(page); |
| 940 | ret = 0; | 937 | ret = 0; |
| 941 | } | 938 | } |
| 942 | if (ret || (--(wbc->nr_to_write) <= 0)) | 939 | if (ret || (--nr_to_write <= 0)) |
| 943 | done = 1; | 940 | done = 1; |
| 944 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | 941 | if (wbc->nonblocking && bdi_write_congested(bdi)) { |
| 945 | wbc->encountered_congestion = 1; | 942 | wbc->encountered_congestion = 1; |
| @@ -958,11 +955,12 @@ retry: | |||
| 958 | index = 0; | 955 | index = 0; |
| 959 | goto retry; | 956 | goto retry; |
| 960 | } | 957 | } |
| 961 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) | 958 | if (!wbc->no_nrwrite_index_update) { |
| 962 | mapping->writeback_index = index; | 959 | if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) |
| 960 | mapping->writeback_index = index; | ||
| 961 | wbc->nr_to_write = nr_to_write; | ||
| 962 | } | ||
| 963 | 963 | ||
| 964 | if (wbc->range_cont) | ||
| 965 | wbc->range_start = index << PAGE_CACHE_SHIFT; | ||
| 966 | return ret; | 964 | return ret; |
| 967 | } | 965 | } |
| 968 | EXPORT_SYMBOL(write_cache_pages); | 966 | EXPORT_SYMBOL(write_cache_pages); |
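With this change write_cache_pages() counts against a local copy of wbc->nr_to_write and only writes the result (and the cyclic writeback_index) back when the caller has not set the new no_nrwrite_index_update flag. A hedged usage sketch of the opt-out, for a caller that drives several passes itself and wants to manage both fields; the wrapper function is illustrative, not from this patch.

    /* Illustrative caller: keep ownership of nr_to_write and of
     * mapping->writeback_index across repeated write_cache_pages() calls
     * by opting out of the final update. */
    static int flush_pass(struct address_space *mapping,
                          struct writeback_control *wbc,
                          writepage_t writepage, void *data)
    {
            wbc->no_nrwrite_index_update = 1;
            return write_cache_pages(mapping, wbc, writepage, data);
    }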
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 27b8681139fd..d8ac01474563 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -44,7 +44,7 @@ | |||
| 44 | #include <linux/backing-dev.h> | 44 | #include <linux/backing-dev.h> |
| 45 | #include <linux/fault-inject.h> | 45 | #include <linux/fault-inject.h> |
| 46 | #include <linux/page-isolation.h> | 46 | #include <linux/page-isolation.h> |
| 47 | #include <linux/memcontrol.h> | 47 | #include <linux/page_cgroup.h> |
| 48 | #include <linux/debugobjects.h> | 48 | #include <linux/debugobjects.h> |
| 49 | 49 | ||
| 50 | #include <asm/tlbflush.h> | 50 | #include <asm/tlbflush.h> |
| @@ -223,17 +223,12 @@ static inline int bad_range(struct zone *zone, struct page *page) | |||
| 223 | 223 | ||
| 224 | static void bad_page(struct page *page) | 224 | static void bad_page(struct page *page) |
| 225 | { | 225 | { |
| 226 | void *pc = page_get_page_cgroup(page); | ||
| 227 | |||
| 228 | printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG | 226 | printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG |
| 229 | "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", | 227 | "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", |
| 230 | current->comm, page, (int)(2*sizeof(unsigned long)), | 228 | current->comm, page, (int)(2*sizeof(unsigned long)), |
| 231 | (unsigned long)page->flags, page->mapping, | 229 | (unsigned long)page->flags, page->mapping, |
| 232 | page_mapcount(page), page_count(page)); | 230 | page_mapcount(page), page_count(page)); |
| 233 | if (pc) { | 231 | |
| 234 | printk(KERN_EMERG "cgroup:%p\n", pc); | ||
| 235 | page_reset_bad_cgroup(page); | ||
| 236 | } | ||
| 237 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" | 232 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" |
| 238 | KERN_EMERG "Backtrace:\n"); | 233 | KERN_EMERG "Backtrace:\n"); |
| 239 | dump_stack(); | 234 | dump_stack(); |
| @@ -268,24 +263,39 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
| 268 | { | 263 | { |
| 269 | int i; | 264 | int i; |
| 270 | int nr_pages = 1 << order; | 265 | int nr_pages = 1 << order; |
| 266 | |||
| 267 | set_compound_page_dtor(page, free_compound_page); | ||
| 268 | set_compound_order(page, order); | ||
| 269 | __SetPageHead(page); | ||
| 270 | for (i = 1; i < nr_pages; i++) { | ||
| 271 | struct page *p = page + i; | ||
| 272 | |||
| 273 | __SetPageTail(p); | ||
| 274 | p->first_page = page; | ||
| 275 | } | ||
| 276 | } | ||
| 277 | |||
| 278 | #ifdef CONFIG_HUGETLBFS | ||
| 279 | void prep_compound_gigantic_page(struct page *page, unsigned long order) | ||
| 280 | { | ||
| 281 | int i; | ||
| 282 | int nr_pages = 1 << order; | ||
| 271 | struct page *p = page + 1; | 283 | struct page *p = page + 1; |
| 272 | 284 | ||
| 273 | set_compound_page_dtor(page, free_compound_page); | 285 | set_compound_page_dtor(page, free_compound_page); |
| 274 | set_compound_order(page, order); | 286 | set_compound_order(page, order); |
| 275 | __SetPageHead(page); | 287 | __SetPageHead(page); |
| 276 | for (i = 1; i < nr_pages; i++, p++) { | 288 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { |
| 277 | if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) | ||
| 278 | p = pfn_to_page(page_to_pfn(page) + i); | ||
| 279 | __SetPageTail(p); | 289 | __SetPageTail(p); |
| 280 | p->first_page = page; | 290 | p->first_page = page; |
| 281 | } | 291 | } |
| 282 | } | 292 | } |
| 293 | #endif | ||
| 283 | 294 | ||
| 284 | static void destroy_compound_page(struct page *page, unsigned long order) | 295 | static void destroy_compound_page(struct page *page, unsigned long order) |
| 285 | { | 296 | { |
| 286 | int i; | 297 | int i; |
| 287 | int nr_pages = 1 << order; | 298 | int nr_pages = 1 << order; |
| 288 | struct page *p = page + 1; | ||
| 289 | 299 | ||
| 290 | if (unlikely(compound_order(page) != order)) | 300 | if (unlikely(compound_order(page) != order)) |
| 291 | bad_page(page); | 301 | bad_page(page); |
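In the hunk above, only the new prep_compound_gigantic_page() keeps pfn-based stepping, because a gigantic page spans multiple MAX_ORDER blocks and mem_map is only guaranteed virtually contiguous within one such block; the plain prep_compound_page() and destroy_compound_page() can now index tail pages directly. mem_map_next() itself is added elsewhere in the series; a sketch of the idea, assuming it mirrors the open-coded boundary check the old loop used:

    /* Sketch only: step to the next tail page, re-deriving the pointer from
     * the pfn whenever the offset crosses a MAX_ORDER-sized block, since
     * mem_map is only guaranteed contiguous within such a block. */
    static inline struct page *mem_map_next_sketch(struct page *iter,
                                                   struct page *base, int offset)
    {
            if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0))
                    return pfn_to_page(page_to_pfn(base) + offset);
            return iter + 1;
    }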
| @@ -293,9 +303,8 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
| 293 | if (unlikely(!PageHead(page))) | 303 | if (unlikely(!PageHead(page))) |
| 294 | bad_page(page); | 304 | bad_page(page); |
| 295 | __ClearPageHead(page); | 305 | __ClearPageHead(page); |
| 296 | for (i = 1; i < nr_pages; i++, p++) { | 306 | for (i = 1; i < nr_pages; i++) { |
| 297 | if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) | 307 | struct page *p = page + i; |
| 298 | p = pfn_to_page(page_to_pfn(page) + i); | ||
| 299 | 308 | ||
| 300 | if (unlikely(!PageTail(p) | | 309 | if (unlikely(!PageTail(p) | |
| 301 | (p->first_page != page))) | 310 | (p->first_page != page))) |
| @@ -454,14 +463,16 @@ static inline void __free_one_page(struct page *page, | |||
| 454 | 463 | ||
| 455 | static inline int free_pages_check(struct page *page) | 464 | static inline int free_pages_check(struct page *page) |
| 456 | { | 465 | { |
| 466 | free_page_mlock(page); | ||
| 457 | if (unlikely(page_mapcount(page) | | 467 | if (unlikely(page_mapcount(page) | |
| 458 | (page->mapping != NULL) | | 468 | (page->mapping != NULL) | |
| 459 | (page_get_page_cgroup(page) != NULL) | | ||
| 460 | (page_count(page) != 0) | | 469 | (page_count(page) != 0) | |
| 461 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) | 470 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) |
| 462 | bad_page(page); | 471 | bad_page(page); |
| 463 | if (PageDirty(page)) | 472 | if (PageDirty(page)) |
| 464 | __ClearPageDirty(page); | 473 | __ClearPageDirty(page); |
| 474 | if (PageSwapBacked(page)) | ||
| 475 | __ClearPageSwapBacked(page); | ||
| 465 | /* | 476 | /* |
| 466 | * For now, we report if PG_reserved was found set, but do not | 477 | * For now, we report if PG_reserved was found set, but do not |
| 467 | * clear it, and do not free the page. But we shall soon need | 478 | * clear it, and do not free the page. But we shall soon need |
| @@ -600,7 +611,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
| 600 | { | 611 | { |
| 601 | if (unlikely(page_mapcount(page) | | 612 | if (unlikely(page_mapcount(page) | |
| 602 | (page->mapping != NULL) | | 613 | (page->mapping != NULL) | |
| 603 | (page_get_page_cgroup(page) != NULL) | | ||
| 604 | (page_count(page) != 0) | | 614 | (page_count(page) != 0) | |
| 605 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) | 615 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) |
| 606 | bad_page(page); | 616 | bad_page(page); |
| @@ -614,7 +624,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
| 614 | 624 | ||
| 615 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | | 625 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | |
| 616 | 1 << PG_referenced | 1 << PG_arch_1 | | 626 | 1 << PG_referenced | 1 << PG_arch_1 | |
| 617 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); | 627 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk |
| 628 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 629 | | 1 << PG_mlocked | ||
| 630 | #endif | ||
| 631 | ); | ||
| 618 | set_page_private(page, 0); | 632 | set_page_private(page, 0); |
| 619 | set_page_refcounted(page); | 633 | set_page_refcounted(page); |
| 620 | 634 | ||
| @@ -1547,6 +1561,10 @@ nofail_alloc: | |||
| 1547 | 1561 | ||
| 1548 | /* We now go into synchronous reclaim */ | 1562 | /* We now go into synchronous reclaim */ |
| 1549 | cpuset_memory_pressure_bump(); | 1563 | cpuset_memory_pressure_bump(); |
| 1564 | /* | ||
| 1565 | * The task's cpuset might have expanded its set of allowable nodes | ||
| 1566 | */ | ||
| 1567 | cpuset_update_task_memory_state(); | ||
| 1550 | p->flags |= PF_MEMALLOC; | 1568 | p->flags |= PF_MEMALLOC; |
| 1551 | reclaim_state.reclaimed_slab = 0; | 1569 | reclaim_state.reclaimed_slab = 0; |
| 1552 | p->reclaim_state = &reclaim_state; | 1570 | p->reclaim_state = &reclaim_state; |
| @@ -1862,10 +1880,21 @@ void show_free_areas(void) | |||
| 1862 | } | 1880 | } |
| 1863 | } | 1881 | } |
| 1864 | 1882 | ||
| 1865 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" | 1883 | printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" |
| 1884 | " inactive_file:%lu" | ||
| 1885 | //TODO: check/adjust line lengths | ||
| 1886 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1887 | " unevictable:%lu" | ||
| 1888 | #endif | ||
| 1889 | " dirty:%lu writeback:%lu unstable:%lu\n" | ||
| 1866 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", | 1890 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", |
| 1867 | global_page_state(NR_ACTIVE), | 1891 | global_page_state(NR_ACTIVE_ANON), |
| 1868 | global_page_state(NR_INACTIVE), | 1892 | global_page_state(NR_ACTIVE_FILE), |
| 1893 | global_page_state(NR_INACTIVE_ANON), | ||
| 1894 | global_page_state(NR_INACTIVE_FILE), | ||
| 1895 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1896 | global_page_state(NR_UNEVICTABLE), | ||
| 1897 | #endif | ||
| 1869 | global_page_state(NR_FILE_DIRTY), | 1898 | global_page_state(NR_FILE_DIRTY), |
| 1870 | global_page_state(NR_WRITEBACK), | 1899 | global_page_state(NR_WRITEBACK), |
| 1871 | global_page_state(NR_UNSTABLE_NFS), | 1900 | global_page_state(NR_UNSTABLE_NFS), |
| @@ -1888,8 +1917,13 @@ void show_free_areas(void) | |||
| 1888 | " min:%lukB" | 1917 | " min:%lukB" |
| 1889 | " low:%lukB" | 1918 | " low:%lukB" |
| 1890 | " high:%lukB" | 1919 | " high:%lukB" |
| 1891 | " active:%lukB" | 1920 | " active_anon:%lukB" |
| 1892 | " inactive:%lukB" | 1921 | " inactive_anon:%lukB" |
| 1922 | " active_file:%lukB" | ||
| 1923 | " inactive_file:%lukB" | ||
| 1924 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1925 | " unevictable:%lukB" | ||
| 1926 | #endif | ||
| 1893 | " present:%lukB" | 1927 | " present:%lukB" |
| 1894 | " pages_scanned:%lu" | 1928 | " pages_scanned:%lu" |
| 1895 | " all_unreclaimable? %s" | 1929 | " all_unreclaimable? %s" |
| @@ -1899,8 +1933,13 @@ void show_free_areas(void) | |||
| 1899 | K(zone->pages_min), | 1933 | K(zone->pages_min), |
| 1900 | K(zone->pages_low), | 1934 | K(zone->pages_low), |
| 1901 | K(zone->pages_high), | 1935 | K(zone->pages_high), |
| 1902 | K(zone_page_state(zone, NR_ACTIVE)), | 1936 | K(zone_page_state(zone, NR_ACTIVE_ANON)), |
| 1903 | K(zone_page_state(zone, NR_INACTIVE)), | 1937 | K(zone_page_state(zone, NR_INACTIVE_ANON)), |
| 1938 | K(zone_page_state(zone, NR_ACTIVE_FILE)), | ||
| 1939 | K(zone_page_state(zone, NR_INACTIVE_FILE)), | ||
| 1940 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1941 | K(zone_page_state(zone, NR_UNEVICTABLE)), | ||
| 1942 | #endif | ||
| 1904 | K(zone->present_pages), | 1943 | K(zone->present_pages), |
| 1905 | zone->pages_scanned, | 1944 | zone->pages_scanned, |
| 1906 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") | 1945 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") |
| @@ -3410,10 +3449,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 3410 | pgdat->nr_zones = 0; | 3449 | pgdat->nr_zones = 0; |
| 3411 | init_waitqueue_head(&pgdat->kswapd_wait); | 3450 | init_waitqueue_head(&pgdat->kswapd_wait); |
| 3412 | pgdat->kswapd_max_order = 0; | 3451 | pgdat->kswapd_max_order = 0; |
| 3452 | pgdat_page_cgroup_init(pgdat); | ||
| 3413 | 3453 | ||
| 3414 | for (j = 0; j < MAX_NR_ZONES; j++) { | 3454 | for (j = 0; j < MAX_NR_ZONES; j++) { |
| 3415 | struct zone *zone = pgdat->node_zones + j; | 3455 | struct zone *zone = pgdat->node_zones + j; |
| 3416 | unsigned long size, realsize, memmap_pages; | 3456 | unsigned long size, realsize, memmap_pages; |
| 3457 | enum lru_list l; | ||
| 3417 | 3458 | ||
| 3418 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 3459 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
| 3419 | realsize = size - zone_absent_pages_in_node(nid, j, | 3460 | realsize = size - zone_absent_pages_in_node(nid, j, |
| @@ -3428,8 +3469,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 3428 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; | 3469 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; |
| 3429 | if (realsize >= memmap_pages) { | 3470 | if (realsize >= memmap_pages) { |
| 3430 | realsize -= memmap_pages; | 3471 | realsize -= memmap_pages; |
| 3431 | mminit_dprintk(MMINIT_TRACE, "memmap_init", | 3472 | printk(KERN_DEBUG |
| 3432 | "%s zone: %lu pages used for memmap\n", | 3473 | " %s zone: %lu pages used for memmap\n", |
| 3433 | zone_names[j], memmap_pages); | 3474 | zone_names[j], memmap_pages); |
| 3434 | } else | 3475 | } else |
| 3435 | printk(KERN_WARNING | 3476 | printk(KERN_WARNING |
| @@ -3439,8 +3480,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 3439 | /* Account for reserved pages */ | 3480 | /* Account for reserved pages */ |
| 3440 | if (j == 0 && realsize > dma_reserve) { | 3481 | if (j == 0 && realsize > dma_reserve) { |
| 3441 | realsize -= dma_reserve; | 3482 | realsize -= dma_reserve; |
| 3442 | mminit_dprintk(MMINIT_TRACE, "memmap_init", | 3483 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", |
| 3443 | "%s zone: %lu pages reserved\n", | ||
| 3444 | zone_names[0], dma_reserve); | 3484 | zone_names[0], dma_reserve); |
| 3445 | } | 3485 | } |
| 3446 | 3486 | ||
| @@ -3465,10 +3505,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 3465 | zone->prev_priority = DEF_PRIORITY; | 3505 | zone->prev_priority = DEF_PRIORITY; |
| 3466 | 3506 | ||
| 3467 | zone_pcp_init(zone); | 3507 | zone_pcp_init(zone); |
| 3468 | INIT_LIST_HEAD(&zone->active_list); | 3508 | for_each_lru(l) { |
| 3469 | INIT_LIST_HEAD(&zone->inactive_list); | 3509 | INIT_LIST_HEAD(&zone->lru[l].list); |
| 3470 | zone->nr_scan_active = 0; | 3510 | zone->lru[l].nr_scan = 0; |
| 3471 | zone->nr_scan_inactive = 0; | 3511 | } |
| 3512 | zone->recent_rotated[0] = 0; | ||
| 3513 | zone->recent_rotated[1] = 0; | ||
| 3514 | zone->recent_scanned[0] = 0; | ||
| 3515 | zone->recent_scanned[1] = 0; | ||
| 3472 | zap_zone_vm_stats(zone); | 3516 | zap_zone_vm_stats(zone); |
| 3473 | zone->flags = 0; | 3517 | zone->flags = 0; |
| 3474 | if (!size) | 3518 | if (!size) |
| @@ -3952,7 +3996,7 @@ static void check_for_regular_memory(pg_data_t *pgdat) | |||
| 3952 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) | 3996 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) |
| 3953 | { | 3997 | { |
| 3954 | unsigned long nid; | 3998 | unsigned long nid; |
| 3955 | enum zone_type i; | 3999 | int i; |
| 3956 | 4000 | ||
| 3957 | /* Sort early_node_map as initialisation assumes it is sorted */ | 4001 | /* Sort early_node_map as initialisation assumes it is sorted */ |
| 3958 | sort_node_map(); | 4002 | sort_node_map(); |
| @@ -4210,7 +4254,7 @@ void setup_per_zone_pages_min(void) | |||
| 4210 | for_each_zone(zone) { | 4254 | for_each_zone(zone) { |
| 4211 | u64 tmp; | 4255 | u64 tmp; |
| 4212 | 4256 | ||
| 4213 | spin_lock_irqsave(&zone->lru_lock, flags); | 4257 | spin_lock_irqsave(&zone->lock, flags); |
| 4214 | tmp = (u64)pages_min * zone->present_pages; | 4258 | tmp = (u64)pages_min * zone->present_pages; |
| 4215 | do_div(tmp, lowmem_pages); | 4259 | do_div(tmp, lowmem_pages); |
| 4216 | if (is_highmem(zone)) { | 4260 | if (is_highmem(zone)) { |
| @@ -4242,13 +4286,53 @@ void setup_per_zone_pages_min(void) | |||
| 4242 | zone->pages_low = zone->pages_min + (tmp >> 2); | 4286 | zone->pages_low = zone->pages_min + (tmp >> 2); |
| 4243 | zone->pages_high = zone->pages_min + (tmp >> 1); | 4287 | zone->pages_high = zone->pages_min + (tmp >> 1); |
| 4244 | setup_zone_migrate_reserve(zone); | 4288 | setup_zone_migrate_reserve(zone); |
| 4245 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 4289 | spin_unlock_irqrestore(&zone->lock, flags); |
| 4246 | } | 4290 | } |
| 4247 | 4291 | ||
| 4248 | /* update totalreserve_pages */ | 4292 | /* update totalreserve_pages */ |
| 4249 | calculate_totalreserve_pages(); | 4293 | calculate_totalreserve_pages(); |
| 4250 | } | 4294 | } |
| 4251 | 4295 | ||
| 4296 | /** | ||
| 4297 | * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. | ||
| 4298 | * | ||
| 4299 | * The inactive anon list should be small enough that the VM never has to | ||
| 4300 | * do too much work, but large enough that each inactive page has a chance | ||
| 4301 | * to be referenced again before it is swapped out. | ||
| 4302 | * | ||
| 4303 | * The inactive_anon ratio is the target ratio of ACTIVE_ANON to | ||
| 4304 | * INACTIVE_ANON pages on this zone's LRU, maintained by the | ||
| 4305 | * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of | ||
| 4306 | * the anonymous pages are kept on the inactive list. | ||
| 4307 | * | ||
| 4308 | * total target max | ||
| 4309 | * memory ratio inactive anon | ||
| 4310 | * ------------------------------------- | ||
| 4311 | * 10MB 1 5MB | ||
| 4312 | * 100MB 1 50MB | ||
| 4313 | * 1GB 3 250MB | ||
| 4314 | * 10GB 10 0.9GB | ||
| 4315 | * 100GB 31 3GB | ||
| 4316 | * 1TB 101 10GB | ||
| 4317 | * 10TB 320 32GB | ||
| 4318 | */ | ||
| 4319 | void setup_per_zone_inactive_ratio(void) | ||
| 4320 | { | ||
| 4321 | struct zone *zone; | ||
| 4322 | |||
| 4323 | for_each_zone(zone) { | ||
| 4324 | unsigned int gb, ratio; | ||
| 4325 | |||
| 4326 | /* Zone size in gigabytes */ | ||
| 4327 | gb = zone->present_pages >> (30 - PAGE_SHIFT); | ||
| 4328 | ratio = int_sqrt(10 * gb); | ||
| 4329 | if (!ratio) | ||
| 4330 | ratio = 1; | ||
| 4331 | |||
| 4332 | zone->inactive_ratio = ratio; | ||
| 4333 | } | ||
| 4334 | } | ||
| 4335 | |||
| 4252 | /* | 4336 | /* |
| 4253 | * Initialise min_free_kbytes. | 4337 | * Initialise min_free_kbytes. |
| 4254 | * | 4338 | * |
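The ratio in setup_per_zone_inactive_ratio() above is just int_sqrt(10 * zone_size_in_GB), clamped to at least 1. A standalone check of the table in the comment; isqrt() here is a stand-in for the kernel's int_sqrt():

    #include <stdio.h>

    static unsigned long isqrt(unsigned long x)      /* stand-in for int_sqrt() */
    {
            unsigned long r = 0;

            while ((r + 1) * (r + 1) <= x)
                    r++;
            return r;
    }

    int main(void)
    {
            /* zone sizes in GB: 1GB, 10GB, 100GB, 1TB, 10TB */
            unsigned long gb[] = { 1, 10, 100, 1024, 10240 };
            int i;

            for (i = 0; i < 5; i++) {
                    unsigned long ratio = isqrt(10 * gb[i]);

                    if (!ratio)
                            ratio = 1;
                    /* prints 3, 10, 31, 101, 320 - matching the table above */
                    printf("%6lu GB -> inactive_ratio %lu\n", gb[i], ratio);
            }
            return 0;
    }

The 10MB and 100MB rows of the table correspond to gb == 0, where the clamp forces the ratio to 1.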
| @@ -4286,6 +4370,7 @@ static int __init init_per_zone_pages_min(void) | |||
| 4286 | min_free_kbytes = 65536; | 4370 | min_free_kbytes = 65536; |
| 4287 | setup_per_zone_pages_min(); | 4371 | setup_per_zone_pages_min(); |
| 4288 | setup_per_zone_lowmem_reserve(); | 4372 | setup_per_zone_lowmem_reserve(); |
| 4373 | setup_per_zone_inactive_ratio(); | ||
| 4289 | return 0; | 4374 | return 0; |
| 4290 | } | 4375 | } |
| 4291 | module_init(init_per_zone_pages_min) | 4376 | module_init(init_per_zone_pages_min) |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c new file mode 100644 index 000000000000..1223d927904d --- /dev/null +++ b/mm/page_cgroup.c | |||
| @@ -0,0 +1,256 @@ | |||
| 1 | #include <linux/mm.h> | ||
| 2 | #include <linux/mmzone.h> | ||
| 3 | #include <linux/bootmem.h> | ||
| 4 | #include <linux/bit_spinlock.h> | ||
| 5 | #include <linux/page_cgroup.h> | ||
| 6 | #include <linux/hash.h> | ||
| 7 | #include <linux/slab.h> | ||
| 8 | #include <linux/memory.h> | ||
| 9 | #include <linux/vmalloc.h> | ||
| 10 | #include <linux/cgroup.h> | ||
| 11 | |||
| 12 | static void __meminit | ||
| 13 | __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) | ||
| 14 | { | ||
| 15 | pc->flags = 0; | ||
| 16 | pc->mem_cgroup = NULL; | ||
| 17 | pc->page = pfn_to_page(pfn); | ||
| 18 | } | ||
| 19 | static unsigned long total_usage; | ||
| 20 | |||
| 21 | #if !defined(CONFIG_SPARSEMEM) | ||
| 22 | |||
| 23 | |||
| 24 | void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) | ||
| 25 | { | ||
| 26 | pgdat->node_page_cgroup = NULL; | ||
| 27 | } | ||
| 28 | |||
| 29 | struct page_cgroup *lookup_page_cgroup(struct page *page) | ||
| 30 | { | ||
| 31 | unsigned long pfn = page_to_pfn(page); | ||
| 32 | unsigned long offset; | ||
| 33 | struct page_cgroup *base; | ||
| 34 | |||
| 35 | base = NODE_DATA(page_to_nid(page))->node_page_cgroup; | ||
| 36 | if (unlikely(!base)) | ||
| 37 | return NULL; | ||
| 38 | |||
| 39 | offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; | ||
| 40 | return base + offset; | ||
| 41 | } | ||
| 42 | |||
| 43 | static int __init alloc_node_page_cgroup(int nid) | ||
| 44 | { | ||
| 45 | struct page_cgroup *base, *pc; | ||
| 46 | unsigned long table_size; | ||
| 47 | unsigned long start_pfn, nr_pages, index; | ||
| 48 | |||
| 49 | start_pfn = NODE_DATA(nid)->node_start_pfn; | ||
| 50 | nr_pages = NODE_DATA(nid)->node_spanned_pages; | ||
| 51 | |||
| 52 | table_size = sizeof(struct page_cgroup) * nr_pages; | ||
| 53 | |||
| 54 | base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), | ||
| 55 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | ||
| 56 | if (!base) | ||
| 57 | return -ENOMEM; | ||
| 58 | for (index = 0; index < nr_pages; index++) { | ||
| 59 | pc = base + index; | ||
| 60 | __init_page_cgroup(pc, start_pfn + index); | ||
| 61 | } | ||
| 62 | NODE_DATA(nid)->node_page_cgroup = base; | ||
| 63 | total_usage += table_size; | ||
| 64 | return 0; | ||
| 65 | } | ||
| 66 | |||
| 67 | void __init page_cgroup_init(void) | ||
| 68 | { | ||
| 69 | |||
| 70 | int nid, fail; | ||
| 71 | |||
| 72 | if (mem_cgroup_subsys.disabled) | ||
| 73 | return; | ||
| 74 | |||
| 75 | for_each_online_node(nid) { | ||
| 76 | fail = alloc_node_page_cgroup(nid); | ||
| 77 | if (fail) | ||
| 78 | goto fail; | ||
| 79 | } | ||
| 80 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | ||
| 81 | printk(KERN_INFO "please try cgroup_disable=memory option if you" | ||
| 82 | " don't want\n"); | ||
| 83 | return; | ||
| 84 | fail: | ||
| 85 | printk(KERN_CRIT "allocation of page_cgroup was failed.\n"); | ||
| 86 | printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); | ||
| 87 | panic("Out of memory"); | ||
| 88 | } | ||
| 89 | |||
| 90 | #else /* CONFIG_SPARSEMEM */ | ||
| 91 | |||
| 92 | struct page_cgroup *lookup_page_cgroup(struct page *page) | ||
| 93 | { | ||
| 94 | unsigned long pfn = page_to_pfn(page); | ||
| 95 | struct mem_section *section = __pfn_to_section(pfn); | ||
| 96 | |||
| 97 | return section->page_cgroup + pfn; | ||
| 98 | } | ||
| 99 | |||
| 100 | int __meminit init_section_page_cgroup(unsigned long pfn) | ||
| 101 | { | ||
| 102 | struct mem_section *section; | ||
| 103 | struct page_cgroup *base, *pc; | ||
| 104 | unsigned long table_size; | ||
| 105 | int nid, index; | ||
| 106 | |||
| 107 | section = __pfn_to_section(pfn); | ||
| 108 | |||
| 109 | if (section->page_cgroup) | ||
| 110 | return 0; | ||
| 111 | |||
| 112 | nid = page_to_nid(pfn_to_page(pfn)); | ||
| 113 | |||
| 114 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | ||
| 115 | if (slab_is_available()) { | ||
| 116 | base = kmalloc_node(table_size, GFP_KERNEL, nid); | ||
| 117 | if (!base) | ||
| 118 | base = vmalloc_node(table_size, nid); | ||
| 119 | } else { | ||
| 120 | base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size, | ||
| 121 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | ||
| 122 | } | ||
| 123 | |||
| 124 | if (!base) { | ||
| 125 | printk(KERN_ERR "page cgroup allocation failure\n"); | ||
| 126 | return -ENOMEM; | ||
| 127 | } | ||
| 128 | |||
| 129 | for (index = 0; index < PAGES_PER_SECTION; index++) { | ||
| 130 | pc = base + index; | ||
| 131 | __init_page_cgroup(pc, pfn + index); | ||
| 132 | } | ||
| 133 | |||
| 134 | section = __pfn_to_section(pfn); | ||
| 135 | section->page_cgroup = base - pfn; | ||
| 136 | total_usage += table_size; | ||
| 137 | return 0; | ||
| 138 | } | ||
| 139 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
| 140 | void __free_page_cgroup(unsigned long pfn) | ||
| 141 | { | ||
| 142 | struct mem_section *ms; | ||
| 143 | struct page_cgroup *base; | ||
| 144 | |||
| 145 | ms = __pfn_to_section(pfn); | ||
| 146 | if (!ms || !ms->page_cgroup) | ||
| 147 | return; | ||
| 148 | base = ms->page_cgroup + pfn; | ||
| 149 | if (is_vmalloc_addr(base)) { | ||
| 150 | vfree(base); | ||
| 151 | ms->page_cgroup = NULL; | ||
| 152 | } else { | ||
| 153 | struct page *page = virt_to_page(base); | ||
| 154 | if (!PageReserved(page)) { /* Is bootmem ? */ | ||
| 155 | kfree(base); | ||
| 156 | ms->page_cgroup = NULL; | ||
| 157 | } | ||
| 158 | } | ||
| 159 | } | ||
| 160 | |||
| 161 | int online_page_cgroup(unsigned long start_pfn, | ||
| 162 | unsigned long nr_pages, | ||
| 163 | int nid) | ||
| 164 | { | ||
| 165 | unsigned long start, end, pfn; | ||
| 166 | int fail = 0; | ||
| 167 | |||
| 168 | start = start_pfn & ~(PAGES_PER_SECTION - 1); | ||
| 169 | end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); | ||
| 170 | |||
| 171 | for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { | ||
| 172 | if (!pfn_present(pfn)) | ||
| 173 | continue; | ||
| 174 | fail = init_section_page_cgroup(pfn); | ||
| 175 | } | ||
| 176 | if (!fail) | ||
| 177 | return 0; | ||
| 178 | |||
| 179 | /* rollback */ | ||
| 180 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
| 181 | __free_page_cgroup(pfn); | ||
| 182 | |||
| 183 | return -ENOMEM; | ||
| 184 | } | ||
| 185 | |||
| 186 | int offline_page_cgroup(unsigned long start_pfn, | ||
| 187 | unsigned long nr_pages, int nid) | ||
| 188 | { | ||
| 189 | unsigned long start, end, pfn; | ||
| 190 | |||
| 191 | start = start_pfn & ~(PAGES_PER_SECTION - 1); | ||
| 192 | end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); | ||
| 193 | |||
| 194 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
| 195 | __free_page_cgroup(pfn); | ||
| 196 | return 0; | ||
| 197 | |||
| 198 | } | ||
| 199 | |||
| 200 | static int page_cgroup_callback(struct notifier_block *self, | ||
| 201 | unsigned long action, void *arg) | ||
| 202 | { | ||
| 203 | struct memory_notify *mn = arg; | ||
| 204 | int ret = 0; | ||
| 205 | switch (action) { | ||
| 206 | case MEM_GOING_ONLINE: | ||
| 207 | ret = online_page_cgroup(mn->start_pfn, | ||
| 208 | mn->nr_pages, mn->status_change_nid); | ||
| 209 | break; | ||
| 210 | case MEM_CANCEL_ONLINE: | ||
| 211 | case MEM_OFFLINE: | ||
| 212 | offline_page_cgroup(mn->start_pfn, | ||
| 213 | mn->nr_pages, mn->status_change_nid); | ||
| 214 | break; | ||
| 215 | case MEM_GOING_OFFLINE: | ||
| 216 | break; | ||
| 217 | case MEM_ONLINE: | ||
| 218 | case MEM_CANCEL_OFFLINE: | ||
| 219 | break; | ||
| 220 | } | ||
| 221 | ret = notifier_from_errno(ret); | ||
| 222 | return ret; | ||
| 223 | } | ||
| 224 | |||
| 225 | #endif | ||
| 226 | |||
| 227 | void __init page_cgroup_init(void) | ||
| 228 | { | ||
| 229 | unsigned long pfn; | ||
| 230 | int fail = 0; | ||
| 231 | |||
| 232 | if (mem_cgroup_subsys.disabled) | ||
| 233 | return; | ||
| 234 | |||
| 235 | for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { | ||
| 236 | if (!pfn_present(pfn)) | ||
| 237 | continue; | ||
| 238 | fail = init_section_page_cgroup(pfn); | ||
| 239 | } | ||
| 240 | if (fail) { | ||
| 241 | printk(KERN_CRIT "try cgroup_disable=memory boot option\n"); | ||
| 242 | panic("Out of memory"); | ||
| 243 | } else { | ||
| 244 | hotplug_memory_notifier(page_cgroup_callback, 0); | ||
| 245 | } | ||
| 246 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | ||
| 247 | printk(KERN_INFO "please try cgroup_disable=memory option if you don't" | ||
| 248 | " want\n"); | ||
| 249 | } | ||
| 250 | |||
| 251 | void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) | ||
| 252 | { | ||
| 253 | return; | ||
| 254 | } | ||
| 255 | |||
| 256 | #endif | ||
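In the sparsemem path above, init_section_page_cgroup() stores base - pfn (pfn being the section's first pfn) in section->page_cgroup, so lookup_page_cgroup() can index with the absolute pfn without first subtracting the section start. A standalone illustration of that offset trick (simplified; it relies on the same out-of-bounds pointer arithmetic the kernel code does):

    #include <stdio.h>
    #include <stdlib.h>

    struct page_cgroup { unsigned long flags; };

    int main(void)
    {
            unsigned long start_pfn = 0x10000;   /* first pfn of the "section" */
            unsigned long nr = 4096;             /* pages per section */
            unsigned long pfn = start_pfn + 1234;

            struct page_cgroup *base = calloc(nr, sizeof(*base));

            /* what init_section_page_cgroup() stores in section->page_cgroup */
            struct page_cgroup *section_pc = base - start_pfn;

            /* what lookup_page_cgroup() computes from an absolute pfn */
            struct page_cgroup *pc = section_pc + pfn;

            printf("matches base + offset: %d\n", pc == base + (pfn - start_pfn));
            free(base);
            return 0;
    }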
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index b70a7fec1ff6..5e0ffd967452 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
| @@ -130,10 +130,11 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
| 130 | if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | 130 | if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE) |
| 131 | break; | 131 | break; |
| 132 | } | 132 | } |
| 133 | if (pfn < end_pfn) | 133 | page = __first_valid_page(start_pfn, end_pfn - start_pfn); |
| 134 | if ((pfn < end_pfn) || !page) | ||
| 134 | return -EBUSY; | 135 | return -EBUSY; |
| 135 | /* Check all pages are free or Marked as ISOLATED */ | 136 | /* Check all pages are free or Marked as ISOLATED */ |
| 136 | zone = page_zone(pfn_to_page(pfn)); | 137 | zone = page_zone(page); |
| 137 | spin_lock_irqsave(&zone->lock, flags); | 138 | spin_lock_irqsave(&zone->lock, flags); |
| 138 | ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); | 139 | ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); |
| 139 | spin_unlock_irqrestore(&zone->lock, flags); | 140 | spin_unlock_irqrestore(&zone->lock, flags); |
diff --git a/mm/pdflush.c b/mm/pdflush.c index 0cbe0c60c6bf..a0a14c4d5072 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c | |||
| @@ -3,7 +3,7 @@ | |||
| 3 | * | 3 | * |
| 4 | * Copyright (C) 2002, Linus Torvalds. | 4 | * Copyright (C) 2002, Linus Torvalds. |
| 5 | * | 5 | * |
| 6 | * 09Apr2002 akpm@zip.com.au | 6 | * 09Apr2002 Andrew Morton |
| 7 | * Initial version | 7 | * Initial version |
| 8 | * 29Feb2004 kaos@sgi.com | 8 | * 29Feb2004 kaos@sgi.com |
| 9 | * Move worker thread creation to kthread to avoid chewing | 9 | * Move worker thread creation to kthread to avoid chewing |
diff --git a/mm/readahead.c b/mm/readahead.c index 77e8ddf945e9..bec83c15a78f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -3,7 +3,7 @@ | |||
| 3 | * | 3 | * |
| 4 | * Copyright (C) 2002, Linus Torvalds | 4 | * Copyright (C) 2002, Linus Torvalds |
| 5 | * | 5 | * |
| 6 | * 09Apr2002 akpm@zip.com.au | 6 | * 09Apr2002 Andrew Morton |
| 7 | * Initial version. | 7 | * Initial version. |
| 8 | */ | 8 | */ |
| 9 | 9 | ||
| @@ -229,7 +229,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
| 229 | */ | 229 | */ |
| 230 | unsigned long max_sane_readahead(unsigned long nr) | 230 | unsigned long max_sane_readahead(unsigned long nr) |
| 231 | { | 231 | { |
| 232 | return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) | 232 | return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) |
| 233 | + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); | 233 | + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); |
| 234 | } | 234 | } |
| 235 | 235 | ||
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -53,9 +53,47 @@ | |||
| 53 | 53 | ||
| 54 | #include <asm/tlbflush.h> | 54 | #include <asm/tlbflush.h> |
| 55 | 55 | ||
| 56 | struct kmem_cache *anon_vma_cachep; | 56 | #include "internal.h" |
| 57 | 57 | ||
| 58 | /* This must be called under the mmap_sem. */ | 58 | static struct kmem_cache *anon_vma_cachep; |
| 59 | |||
| 60 | static inline struct anon_vma *anon_vma_alloc(void) | ||
| 61 | { | ||
| 62 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); | ||
| 63 | } | ||
| 64 | |||
| 65 | static inline void anon_vma_free(struct anon_vma *anon_vma) | ||
| 66 | { | ||
| 67 | kmem_cache_free(anon_vma_cachep, anon_vma); | ||
| 68 | } | ||
| 69 | |||
| 70 | /** | ||
| 71 | * anon_vma_prepare - attach an anon_vma to a memory region | ||
| 72 | * @vma: the memory region in question | ||
| 73 | * | ||
| 74 | * This makes sure the memory mapping described by 'vma' has | ||
| 75 | * an 'anon_vma' attached to it, so that we can associate the | ||
| 76 | * anonymous pages mapped into it with that anon_vma. | ||
| 77 | * | ||
| 78 | * The common case will be that we already have one, but if | ||
| 79 | * not we either need to find an adjacent mapping that we | ||
| 80 | * can re-use the anon_vma from (very common when the only | ||
| 81 | * reason for splitting a vma has been mprotect()), or we | ||
| 82 | * allocate a new one. | ||
| 83 | * | ||
| 84 | * Anon-vma allocations are very subtle, because we may have | ||
| 85 | * optimistically looked up an anon_vma in page_lock_anon_vma() | ||
| 86 | * and that may actually touch the spinlock even in the newly | ||
| 87 | * allocated vma (it depends on RCU to make sure that the | ||
| 88 | * anon_vma isn't actually destroyed). | ||
| 89 | * | ||
| 90 | * As a result, we need to do proper anon_vma locking even | ||
| 91 | * for the new allocation. At the same time, we do not want | ||
| 92 | * to do any locking for the common case of already having | ||
| 93 | * an anon_vma. | ||
| 94 | * | ||
| 95 | * This must be called with the mmap_sem held for reading. | ||
| 96 | */ | ||
| 59 | int anon_vma_prepare(struct vm_area_struct *vma) | 97 | int anon_vma_prepare(struct vm_area_struct *vma) |
| 60 | { | 98 | { |
| 61 | struct anon_vma *anon_vma = vma->anon_vma; | 99 | struct anon_vma *anon_vma = vma->anon_vma; |
| @@ -63,20 +101,17 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
| 63 | might_sleep(); | 101 | might_sleep(); |
| 64 | if (unlikely(!anon_vma)) { | 102 | if (unlikely(!anon_vma)) { |
| 65 | struct mm_struct *mm = vma->vm_mm; | 103 | struct mm_struct *mm = vma->vm_mm; |
| 66 | struct anon_vma *allocated, *locked; | 104 | struct anon_vma *allocated; |
| 67 | 105 | ||
| 68 | anon_vma = find_mergeable_anon_vma(vma); | 106 | anon_vma = find_mergeable_anon_vma(vma); |
| 69 | if (anon_vma) { | 107 | allocated = NULL; |
| 70 | allocated = NULL; | 108 | if (!anon_vma) { |
| 71 | locked = anon_vma; | ||
| 72 | spin_lock(&locked->lock); | ||
| 73 | } else { | ||
| 74 | anon_vma = anon_vma_alloc(); | 109 | anon_vma = anon_vma_alloc(); |
| 75 | if (unlikely(!anon_vma)) | 110 | if (unlikely(!anon_vma)) |
| 76 | return -ENOMEM; | 111 | return -ENOMEM; |
| 77 | allocated = anon_vma; | 112 | allocated = anon_vma; |
| 78 | locked = NULL; | ||
| 79 | } | 113 | } |
| 114 | spin_lock(&anon_vma->lock); | ||
| 80 | 115 | ||
| 81 | /* page_table_lock to protect against threads */ | 116 | /* page_table_lock to protect against threads */ |
| 82 | spin_lock(&mm->page_table_lock); | 117 | spin_lock(&mm->page_table_lock); |
| @@ -87,8 +122,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
| 87 | } | 122 | } |
| 88 | spin_unlock(&mm->page_table_lock); | 123 | spin_unlock(&mm->page_table_lock); |
| 89 | 124 | ||
| 90 | if (locked) | 125 | spin_unlock(&anon_vma->lock); |
| 91 | spin_unlock(&locked->lock); | ||
| 92 | if (unlikely(allocated)) | 126 | if (unlikely(allocated)) |
| 93 | anon_vma_free(allocated); | 127 | anon_vma_free(allocated); |
| 94 | } | 128 | } |
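A hedged sketch of the caller pattern the comment above describes: an anonymous-fault path must call anon_vma_prepare() (with mmap_sem held for reading) before the first anonymous page is inserted into the vma. The function below is illustrative only; apart from anon_vma_prepare(), alloc_page_vma() and page_add_new_anon_rmap(), the names are made up and error handling is trimmed.

    /* Illustrative fault-path shape, not taken from this patch. */
    static int map_first_anon_page(struct vm_area_struct *vma,
                                   unsigned long address)
    {
            struct page *page;

            if (unlikely(anon_vma_prepare(vma)))    /* may allocate and lock */
                    return -ENOMEM;

            page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
            if (!page)
                    return -ENOMEM;

            /* ...install the pte under page_table_lock, then: */
            page_add_new_anon_rmap(page, vma, address);
            return 0;
    }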
| @@ -157,7 +191,7 @@ void __init anon_vma_init(void) | |||
| 157 | * Getting a lock on a stable anon_vma from a page off the LRU is | 191 | * Getting a lock on a stable anon_vma from a page off the LRU is |
| 158 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. | 192 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. |
| 159 | */ | 193 | */ |
| 160 | static struct anon_vma *page_lock_anon_vma(struct page *page) | 194 | struct anon_vma *page_lock_anon_vma(struct page *page) |
| 161 | { | 195 | { |
| 162 | struct anon_vma *anon_vma; | 196 | struct anon_vma *anon_vma; |
| 163 | unsigned long anon_mapping; | 197 | unsigned long anon_mapping; |
| @@ -177,7 +211,7 @@ out: | |||
| 177 | return NULL; | 211 | return NULL; |
| 178 | } | 212 | } |
| 179 | 213 | ||
| 180 | static void page_unlock_anon_vma(struct anon_vma *anon_vma) | 214 | void page_unlock_anon_vma(struct anon_vma *anon_vma) |
| 181 | { | 215 | { |
| 182 | spin_unlock(&anon_vma->lock); | 216 | spin_unlock(&anon_vma->lock); |
| 183 | rcu_read_unlock(); | 217 | rcu_read_unlock(); |
| @@ -268,6 +302,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, | |||
| 268 | return NULL; | 302 | return NULL; |
| 269 | } | 303 | } |
| 270 | 304 | ||
| 305 | /** | ||
| 306 | * page_mapped_in_vma - check whether a page is really mapped in a VMA | ||
| 307 | * @page: the page to test | ||
| 308 | * @vma: the VMA to test | ||
| 309 | * | ||
| 310 | * Returns 1 if the page is mapped into the page tables of the VMA, 0 | ||
| 311 | * if the page is not mapped into the page tables of this VMA. Only | ||
| 312 | * valid for normal file or anonymous VMAs. | ||
| 313 | */ | ||
| 314 | static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | ||
| 315 | { | ||
| 316 | unsigned long address; | ||
| 317 | pte_t *pte; | ||
| 318 | spinlock_t *ptl; | ||
| 319 | |||
| 320 | address = vma_address(page, vma); | ||
| 321 | if (address == -EFAULT) /* out of vma range */ | ||
| 322 | return 0; | ||
| 323 | pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); | ||
| 324 | if (!pte) /* the page is not in this mm */ | ||
| 325 | return 0; | ||
| 326 | pte_unmap_unlock(pte, ptl); | ||
| 327 | |||
| 328 | return 1; | ||
| 329 | } | ||
| 330 | |||
| 271 | /* | 331 | /* |
| 272 | * Subfunctions of page_referenced: page_referenced_one called | 332 | * Subfunctions of page_referenced: page_referenced_one called |
| 273 | * repeatedly from either page_referenced_anon or page_referenced_file. | 333 | * repeatedly from either page_referenced_anon or page_referenced_file. |
| @@ -289,10 +349,17 @@ static int page_referenced_one(struct page *page, | |||
| 289 | if (!pte) | 349 | if (!pte) |
| 290 | goto out; | 350 | goto out; |
| 291 | 351 | ||
| 352 | /* | ||
| 353 | * Don't want to elevate referenced for mlocked page that gets this far, | ||
| 354 | * in order that it progresses to try_to_unmap and is moved to the | ||
| 355 | * unevictable list. | ||
| 356 | */ | ||
| 292 | if (vma->vm_flags & VM_LOCKED) { | 357 | if (vma->vm_flags & VM_LOCKED) { |
| 293 | referenced++; | ||
| 294 | *mapcount = 1; /* break early from loop */ | 358 | *mapcount = 1; /* break early from loop */ |
| 295 | } else if (ptep_clear_flush_young_notify(vma, address, pte)) | 359 | goto out_unmap; |
| 360 | } | ||
| 361 | |||
| 362 | if (ptep_clear_flush_young_notify(vma, address, pte)) | ||
| 296 | referenced++; | 363 | referenced++; |
| 297 | 364 | ||
| 298 | /* Pretend the page is referenced if the task has the | 365 | /* Pretend the page is referenced if the task has the |
| @@ -301,6 +368,7 @@ static int page_referenced_one(struct page *page, | |||
| 301 | rwsem_is_locked(&mm->mmap_sem)) | 368 | rwsem_is_locked(&mm->mmap_sem)) |
| 302 | referenced++; | 369 | referenced++; |
| 303 | 370 | ||
| 371 | out_unmap: | ||
| 304 | (*mapcount)--; | 372 | (*mapcount)--; |
| 305 | pte_unmap_unlock(pte, ptl); | 373 | pte_unmap_unlock(pte, ptl); |
| 306 | out: | 374 | out: |
| @@ -390,11 +458,6 @@ static int page_referenced_file(struct page *page, | |||
| 390 | */ | 458 | */ |
| 391 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 459 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
| 392 | continue; | 460 | continue; |
| 393 | if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) | ||
| 394 | == (VM_LOCKED|VM_MAYSHARE)) { | ||
| 395 | referenced++; | ||
| 396 | break; | ||
| 397 | } | ||
| 398 | referenced += page_referenced_one(page, vma, &mapcount); | 461 | referenced += page_referenced_one(page, vma, &mapcount); |
| 399 | if (!mapcount) | 462 | if (!mapcount) |
| 400 | break; | 463 | break; |
| @@ -674,8 +737,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) | |||
| 674 | page_clear_dirty(page); | 737 | page_clear_dirty(page); |
| 675 | set_page_dirty(page); | 738 | set_page_dirty(page); |
| 676 | } | 739 | } |
| 677 | 740 | if (PageAnon(page)) | |
| 678 | mem_cgroup_uncharge_page(page); | 741 | mem_cgroup_uncharge_page(page); |
| 679 | __dec_zone_page_state(page, | 742 | __dec_zone_page_state(page, |
| 680 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); | 743 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); |
| 681 | /* | 744 | /* |
| @@ -717,11 +780,16 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 717 | * If it's recently referenced (perhaps page_referenced | 780 | * If it's recently referenced (perhaps page_referenced |
| 718 | * skipped over this mm) then we should reactivate it. | 781 | * skipped over this mm) then we should reactivate it. |
| 719 | */ | 782 | */ |
| 720 | if (!migration && ((vma->vm_flags & VM_LOCKED) || | 783 | if (!migration) { |
| 721 | (ptep_clear_flush_young_notify(vma, address, pte)))) { | 784 | if (vma->vm_flags & VM_LOCKED) { |
| 722 | ret = SWAP_FAIL; | 785 | ret = SWAP_MLOCK; |
| 723 | goto out_unmap; | 786 | goto out_unmap; |
| 724 | } | 787 | } |
| 788 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | ||
| 789 | ret = SWAP_FAIL; | ||
| 790 | goto out_unmap; | ||
| 791 | } | ||
| 792 | } | ||
| 725 | 793 | ||
| 726 | /* Nuke the page table entry. */ | 794 | /* Nuke the page table entry. */ |
| 727 | flush_cache_page(vma, address, page_to_pfn(page)); | 795 | flush_cache_page(vma, address, page_to_pfn(page)); |
| @@ -802,12 +870,17 @@ out: | |||
| 802 | * For very sparsely populated VMAs this is a little inefficient - chances are | 870 | * For very sparsely populated VMAs this is a little inefficient - chances are |
| 803 | * there there won't be many ptes located within the scan cluster. In this case | 871 | * there there won't be many ptes located within the scan cluster. In this case |
| 804 | * maybe we could scan further - to the end of the pte page, perhaps. | 872 | * maybe we could scan further - to the end of the pte page, perhaps. |
| 873 | * | ||
| 874 | * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can | ||
| 875 | * acquire it without blocking. If vma locked, mlock the pages in the cluster, | ||
| 876 | * rather than unmapping them. If we encounter the "check_page" that vmscan is | ||
| 877 | * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. | ||
| 805 | */ | 878 | */ |
| 806 | #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) | 879 | #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) |
| 807 | #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) | 880 | #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) |
| 808 | 881 | ||
| 809 | static void try_to_unmap_cluster(unsigned long cursor, | 882 | static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, |
| 810 | unsigned int *mapcount, struct vm_area_struct *vma) | 883 | struct vm_area_struct *vma, struct page *check_page) |
| 811 | { | 884 | { |
| 812 | struct mm_struct *mm = vma->vm_mm; | 885 | struct mm_struct *mm = vma->vm_mm; |
| 813 | pgd_t *pgd; | 886 | pgd_t *pgd; |
| @@ -819,6 +892,8 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
| 819 | struct page *page; | 892 | struct page *page; |
| 820 | unsigned long address; | 893 | unsigned long address; |
| 821 | unsigned long end; | 894 | unsigned long end; |
| 895 | int ret = SWAP_AGAIN; | ||
| 896 | int locked_vma = 0; | ||
| 822 | 897 | ||
| 823 | address = (vma->vm_start + cursor) & CLUSTER_MASK; | 898 | address = (vma->vm_start + cursor) & CLUSTER_MASK; |
| 824 | end = address + CLUSTER_SIZE; | 899 | end = address + CLUSTER_SIZE; |
| @@ -829,15 +904,26 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
| 829 | 904 | ||
| 830 | pgd = pgd_offset(mm, address); | 905 | pgd = pgd_offset(mm, address); |
| 831 | if (!pgd_present(*pgd)) | 906 | if (!pgd_present(*pgd)) |
| 832 | return; | 907 | return ret; |
| 833 | 908 | ||
| 834 | pud = pud_offset(pgd, address); | 909 | pud = pud_offset(pgd, address); |
| 835 | if (!pud_present(*pud)) | 910 | if (!pud_present(*pud)) |
| 836 | return; | 911 | return ret; |
| 837 | 912 | ||
| 838 | pmd = pmd_offset(pud, address); | 913 | pmd = pmd_offset(pud, address); |
| 839 | if (!pmd_present(*pmd)) | 914 | if (!pmd_present(*pmd)) |
| 840 | return; | 915 | return ret; |
| 916 | |||
| 917 | /* | ||
| 918 | * MLOCK_PAGES => feature is configured. | ||
| 919 | * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, | ||
| 920 | * keep the sem while scanning the cluster for mlocking pages. | ||
| 921 | */ | ||
| 922 | if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
| 923 | locked_vma = (vma->vm_flags & VM_LOCKED); | ||
| 924 | if (!locked_vma) | ||
| 925 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ | ||
| 926 | } | ||
| 841 | 927 | ||
| 842 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 928 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
| 843 | 929 | ||
| @@ -850,6 +936,13 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
| 850 | page = vm_normal_page(vma, address, *pte); | 936 | page = vm_normal_page(vma, address, *pte); |
| 851 | BUG_ON(!page || PageAnon(page)); | 937 | BUG_ON(!page || PageAnon(page)); |
| 852 | 938 | ||
| 939 | if (locked_vma) { | ||
| 940 | mlock_vma_page(page); /* no-op if already mlocked */ | ||
| 941 | if (page == check_page) | ||
| 942 | ret = SWAP_MLOCK; | ||
| 943 | continue; /* don't unmap */ | ||
| 944 | } | ||
| 945 | |||
| 853 | if (ptep_clear_flush_young_notify(vma, address, pte)) | 946 | if (ptep_clear_flush_young_notify(vma, address, pte)) |
| 854 | continue; | 947 | continue; |
| 855 | 948 | ||
| @@ -871,39 +964,104 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
| 871 | (*mapcount)--; | 964 | (*mapcount)--; |
| 872 | } | 965 | } |
| 873 | pte_unmap_unlock(pte - 1, ptl); | 966 | pte_unmap_unlock(pte - 1, ptl); |
| 967 | if (locked_vma) | ||
| 968 | up_read(&vma->vm_mm->mmap_sem); | ||
| 969 | return ret; | ||
| 874 | } | 970 | } |
| 875 | 971 | ||
| 876 | static int try_to_unmap_anon(struct page *page, int migration) | 972 | /* |
| 973 | * common handling for pages mapped in VM_LOCKED vmas | ||
| 974 | */ | ||
| 975 | static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) | ||
| 976 | { | ||
| 977 | int mlocked = 0; | ||
| 978 | |||
| 979 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
| 980 | if (vma->vm_flags & VM_LOCKED) { | ||
| 981 | mlock_vma_page(page); | ||
| 982 | mlocked++; /* really mlocked the page */ | ||
| 983 | } | ||
| 984 | up_read(&vma->vm_mm->mmap_sem); | ||
| 985 | } | ||
| 986 | return mlocked; | ||
| 987 | } | ||
| 988 | |||
| 989 | /** | ||
| 990 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based | ||
| 991 | * rmap method | ||
| 992 | * @page: the page to unmap/unlock | ||
| 993 | * @unlock: request for unlock rather than unmap [unlikely] | ||
| 994 | * @migration: unmapping for migration - ignored if @unlock | ||
| 995 | * | ||
| 996 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
| 997 | * contained in the anon_vma struct it points to. | ||
| 998 | * | ||
| 999 | * This function is only called from try_to_unmap/try_to_munlock for | ||
| 1000 | * anonymous pages. | ||
| 1001 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
| 1002 | * where the page was found will be held for write. So, we won't recheck | ||
| 1003 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
| 1004 | * VM_LOCKED. | ||
| 1005 | */ | ||
| 1006 | static int try_to_unmap_anon(struct page *page, int unlock, int migration) | ||
| 877 | { | 1007 | { |
| 878 | struct anon_vma *anon_vma; | 1008 | struct anon_vma *anon_vma; |
| 879 | struct vm_area_struct *vma; | 1009 | struct vm_area_struct *vma; |
| 1010 | unsigned int mlocked = 0; | ||
| 880 | int ret = SWAP_AGAIN; | 1011 | int ret = SWAP_AGAIN; |
| 881 | 1012 | ||
| 1013 | if (MLOCK_PAGES && unlikely(unlock)) | ||
| 1014 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
| 1015 | |||
| 882 | anon_vma = page_lock_anon_vma(page); | 1016 | anon_vma = page_lock_anon_vma(page); |
| 883 | if (!anon_vma) | 1017 | if (!anon_vma) |
| 884 | return ret; | 1018 | return ret; |
| 885 | 1019 | ||
| 886 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1020 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
| 887 | ret = try_to_unmap_one(page, vma, migration); | 1021 | if (MLOCK_PAGES && unlikely(unlock)) { |
| 888 | if (ret == SWAP_FAIL || !page_mapped(page)) | 1022 | if (!((vma->vm_flags & VM_LOCKED) && |
| 889 | break; | 1023 | page_mapped_in_vma(page, vma))) |
| 1024 | continue; /* must visit all unlocked vmas */ | ||
| 1025 | ret = SWAP_MLOCK; /* saw at least one mlocked vma */ | ||
| 1026 | } else { | ||
| 1027 | ret = try_to_unmap_one(page, vma, migration); | ||
| 1028 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
| 1029 | break; | ||
| 1030 | } | ||
| 1031 | if (ret == SWAP_MLOCK) { | ||
| 1032 | mlocked = try_to_mlock_page(page, vma); | ||
| 1033 | if (mlocked) | ||
| 1034 | break; /* stop if actually mlocked page */ | ||
| 1035 | } | ||
| 890 | } | 1036 | } |
| 891 | 1037 | ||
| 892 | page_unlock_anon_vma(anon_vma); | 1038 | page_unlock_anon_vma(anon_vma); |
| 1039 | |||
| 1040 | if (mlocked) | ||
| 1041 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
| 1042 | else if (ret == SWAP_MLOCK) | ||
| 1043 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
| 1044 | |||
| 893 | return ret; | 1045 | return ret; |
| 894 | } | 1046 | } |
| 895 | 1047 | ||
| 896 | /** | 1048 | /** |
| 897 | * try_to_unmap_file - unmap file page using the object-based rmap method | 1049 | * try_to_unmap_file - unmap/unlock file page using the object-based rmap method |
| 898 | * @page: the page to unmap | 1050 | * @page: the page to unmap/unlock |
| 899 | * @migration: migration flag | 1051 | * @unlock: request for unlock rather than unmap [unlikely] |
| 1052 | * @migration: unmapping for migration - ignored if @unlock | ||
| 900 | * | 1053 | * |
| 901 | * Find all the mappings of a page using the mapping pointer and the vma chains | 1054 | * Find all the mappings of a page using the mapping pointer and the vma chains |
| 902 | * contained in the address_space struct it points to. | 1055 | * contained in the address_space struct it points to. |
| 903 | * | 1056 | * |
| 904 | * This function is only called from try_to_unmap for object-based pages. | 1057 | * This function is only called from try_to_unmap/try_to_munlock for |
| 1058 | * object-based pages. | ||
| 1059 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
| 1060 | * where the page was found will be held for write. So, we won't recheck | ||
| 1061 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
| 1062 | * VM_LOCKED. | ||
| 905 | */ | 1063 | */ |
| 906 | static int try_to_unmap_file(struct page *page, int migration) | 1064 | static int try_to_unmap_file(struct page *page, int unlock, int migration) |
| 907 | { | 1065 | { |
| 908 | struct address_space *mapping = page->mapping; | 1066 | struct address_space *mapping = page->mapping; |
| 909 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1067 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
| @@ -914,20 +1072,44 @@ static int try_to_unmap_file(struct page *page, int migration) | |||
| 914 | unsigned long max_nl_cursor = 0; | 1072 | unsigned long max_nl_cursor = 0; |
| 915 | unsigned long max_nl_size = 0; | 1073 | unsigned long max_nl_size = 0; |
| 916 | unsigned int mapcount; | 1074 | unsigned int mapcount; |
| 1075 | unsigned int mlocked = 0; | ||
| 1076 | |||
| 1077 | if (MLOCK_PAGES && unlikely(unlock)) | ||
| 1078 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
| 917 | 1079 | ||
| 918 | spin_lock(&mapping->i_mmap_lock); | 1080 | spin_lock(&mapping->i_mmap_lock); |
| 919 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1081 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
| 920 | ret = try_to_unmap_one(page, vma, migration); | 1082 | if (MLOCK_PAGES && unlikely(unlock)) { |
| 921 | if (ret == SWAP_FAIL || !page_mapped(page)) | 1083 | if (!(vma->vm_flags & VM_LOCKED)) |
| 922 | goto out; | 1084 | continue; /* must visit all vmas */ |
| 1085 | ret = SWAP_MLOCK; | ||
| 1086 | } else { | ||
| 1087 | ret = try_to_unmap_one(page, vma, migration); | ||
| 1088 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
| 1089 | goto out; | ||
| 1090 | } | ||
| 1091 | if (ret == SWAP_MLOCK) { | ||
| 1092 | mlocked = try_to_mlock_page(page, vma); | ||
| 1093 | if (mlocked) | ||
| 1094 | break; /* stop if actually mlocked page */ | ||
| 1095 | } | ||
| 923 | } | 1096 | } |
| 924 | 1097 | ||
| 1098 | if (mlocked) | ||
| 1099 | goto out; | ||
| 1100 | |||
| 925 | if (list_empty(&mapping->i_mmap_nonlinear)) | 1101 | if (list_empty(&mapping->i_mmap_nonlinear)) |
| 926 | goto out; | 1102 | goto out; |
| 927 | 1103 | ||
| 928 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1104 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
| 929 | shared.vm_set.list) { | 1105 | shared.vm_set.list) { |
| 930 | if ((vma->vm_flags & VM_LOCKED) && !migration) | 1106 | if (MLOCK_PAGES && unlikely(unlock)) { |
| 1107 | if (!(vma->vm_flags & VM_LOCKED)) | ||
| 1108 | continue; /* must visit all vmas */ | ||
| 1109 | ret = SWAP_MLOCK; /* leave mlocked == 0 */ | ||
| 1110 | goto out; /* no need to look further */ | ||
| 1111 | } | ||
| 1112 | if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) | ||
| 931 | continue; | 1113 | continue; |
| 932 | cursor = (unsigned long) vma->vm_private_data; | 1114 | cursor = (unsigned long) vma->vm_private_data; |
| 933 | if (cursor > max_nl_cursor) | 1115 | if (cursor > max_nl_cursor) |
| @@ -937,7 +1119,7 @@ static int try_to_unmap_file(struct page *page, int migration) | |||
| 937 | max_nl_size = cursor; | 1119 | max_nl_size = cursor; |
| 938 | } | 1120 | } |
| 939 | 1121 | ||
| 940 | if (max_nl_size == 0) { /* any nonlinears locked or reserved */ | 1122 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ |
| 941 | ret = SWAP_FAIL; | 1123 | ret = SWAP_FAIL; |
| 942 | goto out; | 1124 | goto out; |
| 943 | } | 1125 | } |
| @@ -961,12 +1143,16 @@ static int try_to_unmap_file(struct page *page, int migration) | |||
| 961 | do { | 1143 | do { |
| 962 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1144 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
| 963 | shared.vm_set.list) { | 1145 | shared.vm_set.list) { |
| 964 | if ((vma->vm_flags & VM_LOCKED) && !migration) | 1146 | if (!MLOCK_PAGES && !migration && |
| 1147 | (vma->vm_flags & VM_LOCKED)) | ||
| 965 | continue; | 1148 | continue; |
| 966 | cursor = (unsigned long) vma->vm_private_data; | 1149 | cursor = (unsigned long) vma->vm_private_data; |
| 967 | while ( cursor < max_nl_cursor && | 1150 | while ( cursor < max_nl_cursor && |
| 968 | cursor < vma->vm_end - vma->vm_start) { | 1151 | cursor < vma->vm_end - vma->vm_start) { |
| 969 | try_to_unmap_cluster(cursor, &mapcount, vma); | 1152 | ret = try_to_unmap_cluster(cursor, &mapcount, |
| 1153 | vma, page); | ||
| 1154 | if (ret == SWAP_MLOCK) | ||
| 1155 | mlocked = 2; /* to return below */ | ||
| 970 | cursor += CLUSTER_SIZE; | 1156 | cursor += CLUSTER_SIZE; |
| 971 | vma->vm_private_data = (void *) cursor; | 1157 | vma->vm_private_data = (void *) cursor; |
| 972 | if ((int)mapcount <= 0) | 1158 | if ((int)mapcount <= 0) |
| @@ -987,6 +1173,10 @@ static int try_to_unmap_file(struct page *page, int migration) | |||
| 987 | vma->vm_private_data = NULL; | 1173 | vma->vm_private_data = NULL; |
| 988 | out: | 1174 | out: |
| 989 | spin_unlock(&mapping->i_mmap_lock); | 1175 | spin_unlock(&mapping->i_mmap_lock); |
| 1176 | if (mlocked) | ||
| 1177 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
| 1178 | else if (ret == SWAP_MLOCK) | ||
| 1179 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
| 990 | return ret; | 1180 | return ret; |
| 991 | } | 1181 | } |
| 992 | 1182 | ||
| @@ -1002,6 +1192,7 @@ out: | |||
| 1002 | * SWAP_SUCCESS - we succeeded in removing all mappings | 1192 | * SWAP_SUCCESS - we succeeded in removing all mappings |
| 1003 | * SWAP_AGAIN - we missed a mapping, try again later | 1193 | * SWAP_AGAIN - we missed a mapping, try again later |
| 1004 | * SWAP_FAIL - the page is unswappable | 1194 | * SWAP_FAIL - the page is unswappable |
| 1195 | * SWAP_MLOCK - page is mlocked. | ||
| 1005 | */ | 1196 | */ |
| 1006 | int try_to_unmap(struct page *page, int migration) | 1197 | int try_to_unmap(struct page *page, int migration) |
| 1007 | { | 1198 | { |
| @@ -1010,12 +1201,36 @@ int try_to_unmap(struct page *page, int migration) | |||
| 1010 | BUG_ON(!PageLocked(page)); | 1201 | BUG_ON(!PageLocked(page)); |
| 1011 | 1202 | ||
| 1012 | if (PageAnon(page)) | 1203 | if (PageAnon(page)) |
| 1013 | ret = try_to_unmap_anon(page, migration); | 1204 | ret = try_to_unmap_anon(page, 0, migration); |
| 1014 | else | 1205 | else |
| 1015 | ret = try_to_unmap_file(page, migration); | 1206 | ret = try_to_unmap_file(page, 0, migration); |
| 1016 | 1207 | if (ret != SWAP_MLOCK && !page_mapped(page)) | |
| 1017 | if (!page_mapped(page)) | ||
| 1018 | ret = SWAP_SUCCESS; | 1208 | ret = SWAP_SUCCESS; |
| 1019 | return ret; | 1209 | return ret; |
| 1020 | } | 1210 | } |
| 1021 | 1211 | ||
| 1212 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1213 | /** | ||
| 1214 | * try_to_munlock - try to munlock a page | ||
| 1215 | * @page: the page to be munlocked | ||
| 1216 | * | ||
| 1217 | * Called from munlock code. Checks all of the VMAs mapping the page | ||
| 1218 | * to make sure nobody else has this page mlocked. The page will be | ||
| 1219 | * returned with PG_mlocked cleared if no other vmas have it mlocked. | ||
| 1220 | * | ||
| 1221 | * Return values are: | ||
| 1222 | * | ||
| 1223 | * SWAP_SUCCESS - no vma's holding page mlocked. | ||
| 1224 | * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem | ||
| 1225 | * SWAP_MLOCK - page is now mlocked. | ||
| 1226 | */ | ||
| 1227 | int try_to_munlock(struct page *page) | ||
| 1228 | { | ||
| 1229 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); | ||
| 1230 | |||
| 1231 | if (PageAnon(page)) | ||
| 1232 | return try_to_unmap_anon(page, 1, 0); | ||
| 1233 | else | ||
| 1234 | return try_to_unmap_file(page, 1, 0); | ||
| 1235 | } | ||
| 1236 | #endif | ||
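For context, the SWAP_* codes documented above are consumed by the reclaim path. The sketch below paraphrases how shrink_page_list() in mm/vmscan.c (reworked elsewhere in this series) is assumed to dispatch on them; the goto labels are illustrative names, not quoted from the patch:

    /* Hedged sketch: dispatching on the return codes of try_to_unmap(). */
    switch (try_to_unmap(page, 0)) {
    case SWAP_FAIL:
            goto activate_locked;   /* unswappable: keep the page active */
    case SWAP_AGAIN:
            goto keep_locked;       /* missed a mapping: retry on a later pass */
    case SWAP_MLOCK:
            goto cull_mlocked;      /* mlocked: divert to the unevictable LRU */
    case SWAP_SUCCESS:
            ;                       /* all mappings gone: proceed to pageout */
    }

try_to_munlock() is the inverse check: roughly, the munlock path calls it on an isolated page and keeps the page on the unevictable list only if some other VM_LOCKED vma still maps it (SWAP_MLOCK); otherwise the page is free to rejoin the normal LRUs.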
diff --git a/mm/shmem.c b/mm/shmem.c index 04fb4f1ab88e..0ed075215e5f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -50,14 +50,12 @@ | |||
| 50 | #include <linux/migrate.h> | 50 | #include <linux/migrate.h> |
| 51 | #include <linux/highmem.h> | 51 | #include <linux/highmem.h> |
| 52 | #include <linux/seq_file.h> | 52 | #include <linux/seq_file.h> |
| 53 | #include <linux/magic.h> | ||
| 53 | 54 | ||
| 54 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
| 55 | #include <asm/div64.h> | 56 | #include <asm/div64.h> |
| 56 | #include <asm/pgtable.h> | 57 | #include <asm/pgtable.h> |
| 57 | 58 | ||
| 58 | /* This magic number is used in glibc for posix shared memory */ | ||
| 59 | #define TMPFS_MAGIC 0x01021994 | ||
| 60 | |||
| 61 | #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) | 59 | #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) |
| 62 | #define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) | 60 | #define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) |
| 63 | #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) | 61 | #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) |
| @@ -163,8 +161,8 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) | |||
| 163 | */ | 161 | */ |
| 164 | static inline int shmem_acct_size(unsigned long flags, loff_t size) | 162 | static inline int shmem_acct_size(unsigned long flags, loff_t size) |
| 165 | { | 163 | { |
| 166 | return (flags & VM_ACCOUNT)? | 164 | return (flags & VM_ACCOUNT) ? |
| 167 | security_vm_enough_memory(VM_ACCT(size)): 0; | 165 | security_vm_enough_memory_kern(VM_ACCT(size)) : 0; |
| 168 | } | 166 | } |
| 169 | 167 | ||
| 170 | static inline void shmem_unacct_size(unsigned long flags, loff_t size) | 168 | static inline void shmem_unacct_size(unsigned long flags, loff_t size) |
| @@ -181,8 +179,8 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) | |||
| 181 | */ | 179 | */ |
| 182 | static inline int shmem_acct_block(unsigned long flags) | 180 | static inline int shmem_acct_block(unsigned long flags) |
| 183 | { | 181 | { |
| 184 | return (flags & VM_ACCOUNT)? | 182 | return (flags & VM_ACCOUNT) ? |
| 185 | 0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE)); | 183 | 0 : security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)); |
| 186 | } | 184 | } |
| 187 | 185 | ||
| 188 | static inline void shmem_unacct_blocks(unsigned long flags, long pages) | 186 | static inline void shmem_unacct_blocks(unsigned long flags, long pages) |
| @@ -201,7 +199,7 @@ static struct vm_operations_struct shmem_vm_ops; | |||
| 201 | 199 | ||
| 202 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | 200 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { |
| 203 | .ra_pages = 0, /* No readahead */ | 201 | .ra_pages = 0, /* No readahead */ |
| 204 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 202 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, |
| 205 | .unplug_io_fn = default_unplug_io_fn, | 203 | .unplug_io_fn = default_unplug_io_fn, |
| 206 | }; | 204 | }; |
| 207 | 205 | ||
| @@ -1369,6 +1367,7 @@ repeat: | |||
| 1369 | error = -ENOMEM; | 1367 | error = -ENOMEM; |
| 1370 | goto failed; | 1368 | goto failed; |
| 1371 | } | 1369 | } |
| 1370 | SetPageSwapBacked(filepage); | ||
| 1372 | 1371 | ||
| 1373 | /* Precharge page while we can wait, compensate after */ | 1372 | /* Precharge page while we can wait, compensate after */ |
| 1374 | error = mem_cgroup_cache_charge(filepage, current->mm, | 1373 | error = mem_cgroup_cache_charge(filepage, current->mm, |
| @@ -1478,12 +1477,16 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) | |||
| 1478 | if (!user_shm_lock(inode->i_size, user)) | 1477 | if (!user_shm_lock(inode->i_size, user)) |
| 1479 | goto out_nomem; | 1478 | goto out_nomem; |
| 1480 | info->flags |= VM_LOCKED; | 1479 | info->flags |= VM_LOCKED; |
| 1480 | mapping_set_unevictable(file->f_mapping); | ||
| 1481 | } | 1481 | } |
| 1482 | if (!lock && (info->flags & VM_LOCKED) && user) { | 1482 | if (!lock && (info->flags & VM_LOCKED) && user) { |
| 1483 | user_shm_unlock(inode->i_size, user); | 1483 | user_shm_unlock(inode->i_size, user); |
| 1484 | info->flags &= ~VM_LOCKED; | 1484 | info->flags &= ~VM_LOCKED; |
| 1485 | mapping_clear_unevictable(file->f_mapping); | ||
| 1486 | scan_mapping_unevictable_pages(file->f_mapping); | ||
| 1485 | } | 1487 | } |
| 1486 | retval = 0; | 1488 | retval = 0; |
| 1489 | |||
| 1487 | out_nomem: | 1490 | out_nomem: |
| 1488 | spin_unlock(&info->lock); | 1491 | spin_unlock(&info->lock); |
| 1489 | return retval; | 1492 | return retval; |
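The two mapping-level helpers used in this hunk come from the pagemap.h changes in the same series. A minimal sketch of what they are assumed to do (a flag on the address_space, later consulted by page_evictable()); simplified, not the exact definitions:

    static inline void mapping_set_unevictable(struct address_space *mapping)
    {
            set_bit(AS_UNEVICTABLE, &mapping->flags);
    }

    static inline int mapping_unevictable(struct address_space *mapping)
    {
            return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags);
    }

scan_mapping_unevictable_pages() then walks the mapping once SHM_UNLOCK clears the flag, rescuing pages that were stranded on the unevictable list while the segment was locked.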
| @@ -2582,6 +2585,7 @@ put_memory: | |||
| 2582 | shmem_unacct_size(flags, size); | 2585 | shmem_unacct_size(flags, size); |
| 2583 | return ERR_PTR(error); | 2586 | return ERR_PTR(error); |
| 2584 | } | 2587 | } |
| 2588 | EXPORT_SYMBOL_GPL(shmem_file_setup); | ||
| 2585 | 2589 | ||
| 2586 | /** | 2590 | /** |
| 2587 | * shmem_zero_setup - setup a shared anonymous mapping | 2591 | * shmem_zero_setup - setup a shared anonymous mapping |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
| @@ -95,6 +95,7 @@ | |||
| 95 | #include <linux/init.h> | 95 | #include <linux/init.h> |
| 96 | #include <linux/compiler.h> | 96 | #include <linux/compiler.h> |
| 97 | #include <linux/cpuset.h> | 97 | #include <linux/cpuset.h> |
| 98 | #include <linux/proc_fs.h> | ||
| 98 | #include <linux/seq_file.h> | 99 | #include <linux/seq_file.h> |
| 99 | #include <linux/notifier.h> | 100 | #include <linux/notifier.h> |
| 100 | #include <linux/kallsyms.h> | 101 | #include <linux/kallsyms.h> |
| @@ -4258,7 +4259,7 @@ static int s_show(struct seq_file *m, void *p) | |||
| 4258 | * + further values on SMP and with statistics enabled | 4259 | * + further values on SMP and with statistics enabled |
| 4259 | */ | 4260 | */ |
| 4260 | 4261 | ||
| 4261 | const struct seq_operations slabinfo_op = { | 4262 | static const struct seq_operations slabinfo_op = { |
| 4262 | .start = s_start, | 4263 | .start = s_start, |
| 4263 | .next = s_next, | 4264 | .next = s_next, |
| 4264 | .stop = s_stop, | 4265 | .stop = s_stop, |
| @@ -4315,6 +4316,19 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
| 4315 | return res; | 4316 | return res; |
| 4316 | } | 4317 | } |
| 4317 | 4318 | ||
| 4319 | static int slabinfo_open(struct inode *inode, struct file *file) | ||
| 4320 | { | ||
| 4321 | return seq_open(file, &slabinfo_op); | ||
| 4322 | } | ||
| 4323 | |||
| 4324 | static const struct file_operations proc_slabinfo_operations = { | ||
| 4325 | .open = slabinfo_open, | ||
| 4326 | .read = seq_read, | ||
| 4327 | .write = slabinfo_write, | ||
| 4328 | .llseek = seq_lseek, | ||
| 4329 | .release = seq_release, | ||
| 4330 | }; | ||
| 4331 | |||
| 4318 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 4332 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
| 4319 | 4333 | ||
| 4320 | static void *leaks_start(struct seq_file *m, loff_t *pos) | 4334 | static void *leaks_start(struct seq_file *m, loff_t *pos) |
| @@ -4443,13 +4457,47 @@ static int leaks_show(struct seq_file *m, void *p) | |||
| 4443 | return 0; | 4457 | return 0; |
| 4444 | } | 4458 | } |
| 4445 | 4459 | ||
| 4446 | const struct seq_operations slabstats_op = { | 4460 | static const struct seq_operations slabstats_op = { |
| 4447 | .start = leaks_start, | 4461 | .start = leaks_start, |
| 4448 | .next = s_next, | 4462 | .next = s_next, |
| 4449 | .stop = s_stop, | 4463 | .stop = s_stop, |
| 4450 | .show = leaks_show, | 4464 | .show = leaks_show, |
| 4451 | }; | 4465 | }; |
| 4466 | |||
| 4467 | static int slabstats_open(struct inode *inode, struct file *file) | ||
| 4468 | { | ||
| 4469 | unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); | ||
| 4470 | int ret = -ENOMEM; | ||
| 4471 | if (n) { | ||
| 4472 | ret = seq_open(file, &slabstats_op); | ||
| 4473 | if (!ret) { | ||
| 4474 | struct seq_file *m = file->private_data; | ||
| 4475 | *n = PAGE_SIZE / (2 * sizeof(unsigned long)); | ||
| 4476 | m->private = n; | ||
| 4477 | n = NULL; | ||
| 4478 | } | ||
| 4479 | kfree(n); | ||
| 4480 | } | ||
| 4481 | return ret; | ||
| 4482 | } | ||
| 4483 | |||
| 4484 | static const struct file_operations proc_slabstats_operations = { | ||
| 4485 | .open = slabstats_open, | ||
| 4486 | .read = seq_read, | ||
| 4487 | .llseek = seq_lseek, | ||
| 4488 | .release = seq_release_private, | ||
| 4489 | }; | ||
| 4490 | #endif | ||
| 4491 | |||
| 4492 | static int __init slab_proc_init(void) | ||
| 4493 | { | ||
| 4494 | proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); | ||
| 4495 | #ifdef CONFIG_DEBUG_SLAB_LEAK | ||
| 4496 | proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); | ||
| 4452 | #endif | 4497 | #endif |
| 4498 | return 0; | ||
| 4499 | } | ||
| 4500 | module_init(slab_proc_init); | ||
| 4453 | #endif | 4501 | #endif |
| 4454 | 4502 | ||
| 4455 | /** | 4503 | /** |
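The private buffer sized in slabstats_open() above stores (call-site, count) pairs, which is why the divisor is 2 * sizeof(unsigned long). A back-of-the-envelope check, assuming a 4 KiB page and 8-byte longs (both architecture-dependent):

    *n = PAGE_SIZE / (2 * sizeof(unsigned long));   /* 4096 / 16 == 256 pairs */

so one page of scratch space tracks up to 256 distinct allocation call sites per cache.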
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/interrupt.h> | 14 | #include <linux/interrupt.h> |
| 15 | #include <linux/bitops.h> | 15 | #include <linux/bitops.h> |
| 16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
| 17 | #include <linux/proc_fs.h> | ||
| 17 | #include <linux/seq_file.h> | 18 | #include <linux/seq_file.h> |
| 18 | #include <linux/cpu.h> | 19 | #include <linux/cpu.h> |
| 19 | #include <linux/cpuset.h> | 20 | #include <linux/cpuset.h> |
| @@ -4417,14 +4418,6 @@ __initcall(slab_sysfs_init); | |||
| 4417 | * The /proc/slabinfo ABI | 4418 | * The /proc/slabinfo ABI |
| 4418 | */ | 4419 | */ |
| 4419 | #ifdef CONFIG_SLABINFO | 4420 | #ifdef CONFIG_SLABINFO |
| 4420 | |||
| 4421 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, | ||
| 4422 | size_t count, loff_t *ppos) | ||
| 4423 | { | ||
| 4424 | return -EINVAL; | ||
| 4425 | } | ||
| 4426 | |||
| 4427 | |||
| 4428 | static void print_slabinfo_header(struct seq_file *m) | 4421 | static void print_slabinfo_header(struct seq_file *m) |
| 4429 | { | 4422 | { |
| 4430 | seq_puts(m, "slabinfo - version: 2.1\n"); | 4423 | seq_puts(m, "slabinfo - version: 2.1\n"); |
| @@ -4492,11 +4485,29 @@ static int s_show(struct seq_file *m, void *p) | |||
| 4492 | return 0; | 4485 | return 0; |
| 4493 | } | 4486 | } |
| 4494 | 4487 | ||
| 4495 | const struct seq_operations slabinfo_op = { | 4488 | static const struct seq_operations slabinfo_op = { |
| 4496 | .start = s_start, | 4489 | .start = s_start, |
| 4497 | .next = s_next, | 4490 | .next = s_next, |
| 4498 | .stop = s_stop, | 4491 | .stop = s_stop, |
| 4499 | .show = s_show, | 4492 | .show = s_show, |
| 4500 | }; | 4493 | }; |
| 4501 | 4494 | ||
| 4495 | static int slabinfo_open(struct inode *inode, struct file *file) | ||
| 4496 | { | ||
| 4497 | return seq_open(file, &slabinfo_op); | ||
| 4498 | } | ||
| 4499 | |||
| 4500 | static const struct file_operations proc_slabinfo_operations = { | ||
| 4501 | .open = slabinfo_open, | ||
| 4502 | .read = seq_read, | ||
| 4503 | .llseek = seq_lseek, | ||
| 4504 | .release = seq_release, | ||
| 4505 | }; | ||
| 4506 | |||
| 4507 | static int __init slab_proc_init(void) | ||
| 4508 | { | ||
| 4509 | proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); | ||
| 4510 | return 0; | ||
| 4511 | } | ||
| 4512 | module_init(slab_proc_init); | ||
| 4502 | #endif /* CONFIG_SLABINFO */ | 4513 | #endif /* CONFIG_SLABINFO */ |
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index a91b5f8fcaf6..a13ea6401ae7 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
| @@ -64,7 +64,7 @@ void __meminit vmemmap_verify(pte_t *pte, int node, | |||
| 64 | unsigned long pfn = pte_pfn(*pte); | 64 | unsigned long pfn = pte_pfn(*pte); |
| 65 | int actual_node = early_pfn_to_nid(pfn); | 65 | int actual_node = early_pfn_to_nid(pfn); |
| 66 | 66 | ||
| 67 | if (actual_node != node) | 67 | if (node_distance(actual_node, node) > LOCAL_DISTANCE) |
| 68 | printk(KERN_WARNING "[%lx-%lx] potential offnode " | 68 | printk(KERN_WARNING "[%lx-%lx] potential offnode " |
| 69 | "page_structs\n", start, end - 1); | 69 | "page_structs\n", start, end - 1); |
| 70 | } | 70 | } |
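The check now keys off NUMA distance rather than strict node equality, so page_structs that land on a different but equally close node no longer trigger the warning. For reference, the conventional distance values (from the ACPI SLIT convention as used by include/linux/topology.h; stated here as background, not part of this patch) are:

    #define LOCAL_DISTANCE  10      /* a node's distance to itself */
    #define REMOTE_DISTANCE 20      /* typical distance to another node */

With the new test, only distances above LOCAL_DISTANCE are reported as potential off-node page_structs.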
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
| @@ -31,11 +31,12 @@ | |||
| 31 | #include <linux/backing-dev.h> | 31 | #include <linux/backing-dev.h> |
| 32 | #include <linux/memcontrol.h> | 32 | #include <linux/memcontrol.h> |
| 33 | 33 | ||
| 34 | #include "internal.h" | ||
| 35 | |||
| 34 | /* How many pages do we try to swap or page in/out together? */ | 36 | /* How many pages do we try to swap or page in/out together? */ |
| 35 | int page_cluster; | 37 | int page_cluster; |
| 36 | 38 | ||
| 37 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs); | 39 | static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); |
| 38 | static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs); | ||
| 39 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); | 40 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); |
| 40 | 41 | ||
| 41 | /* | 42 | /* |
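The per-CPU lru_add_pvecs pagevec becomes an array with one slot per LRU list. A sketch of the index layout the rest of this file relies on, paraphrased from the mmzone.h changes in this series (simplified; assuming LRU_BASE == 0, LRU_ACTIVE == 1, LRU_FILE == 2):

    enum lru_list {
            LRU_INACTIVE_ANON = LRU_BASE,                           /* 0 */
            LRU_ACTIVE_ANON   = LRU_BASE + LRU_ACTIVE,              /* 1 */
            LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,                /* 2 */
            LRU_ACTIVE_FILE   = LRU_BASE + LRU_FILE + LRU_ACTIVE,   /* 3 */
            LRU_UNEVICTABLE,                                        /* 4 */
            NR_LRU_LISTS
    };

page_is_file_cache() is assumed to return LRU_FILE for pagecache pages and 0 for swap-backed ones, which is what makes expressions such as LRU_BASE + file, LRU_ACTIVE + page_is_file_cache(page), and the !!file used for the recent_rotated[] counters below work out.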
| @@ -116,8 +117,9 @@ static void pagevec_move_tail(struct pagevec *pvec) | |||
| 116 | zone = pagezone; | 117 | zone = pagezone; |
| 117 | spin_lock(&zone->lru_lock); | 118 | spin_lock(&zone->lru_lock); |
| 118 | } | 119 | } |
| 119 | if (PageLRU(page) && !PageActive(page)) { | 120 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
| 120 | list_move_tail(&page->lru, &zone->inactive_list); | 121 | int lru = page_is_file_cache(page); |
| 122 | list_move_tail(&page->lru, &zone->lru[lru].list); | ||
| 121 | pgmoved++; | 123 | pgmoved++; |
| 122 | } | 124 | } |
| 123 | } | 125 | } |
| @@ -136,7 +138,7 @@ static void pagevec_move_tail(struct pagevec *pvec) | |||
| 136 | void rotate_reclaimable_page(struct page *page) | 138 | void rotate_reclaimable_page(struct page *page) |
| 137 | { | 139 | { |
| 138 | if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && | 140 | if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && |
| 139 | PageLRU(page)) { | 141 | !PageUnevictable(page) && PageLRU(page)) { |
| 140 | struct pagevec *pvec; | 142 | struct pagevec *pvec; |
| 141 | unsigned long flags; | 143 | unsigned long flags; |
| 142 | 144 | ||
| @@ -157,12 +159,19 @@ void activate_page(struct page *page) | |||
| 157 | struct zone *zone = page_zone(page); | 159 | struct zone *zone = page_zone(page); |
| 158 | 160 | ||
| 159 | spin_lock_irq(&zone->lru_lock); | 161 | spin_lock_irq(&zone->lru_lock); |
| 160 | if (PageLRU(page) && !PageActive(page)) { | 162 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
| 161 | del_page_from_inactive_list(zone, page); | 163 | int file = page_is_file_cache(page); |
| 164 | int lru = LRU_BASE + file; | ||
| 165 | del_page_from_lru_list(zone, page, lru); | ||
| 166 | |||
| 162 | SetPageActive(page); | 167 | SetPageActive(page); |
| 163 | add_page_to_active_list(zone, page); | 168 | lru += LRU_ACTIVE; |
| 169 | add_page_to_lru_list(zone, page, lru); | ||
| 164 | __count_vm_event(PGACTIVATE); | 170 | __count_vm_event(PGACTIVATE); |
| 165 | mem_cgroup_move_lists(page, true); | 171 | mem_cgroup_move_lists(page, lru); |
| 172 | |||
| 173 | zone->recent_rotated[!!file]++; | ||
| 174 | zone->recent_scanned[!!file]++; | ||
| 166 | } | 175 | } |
| 167 | spin_unlock_irq(&zone->lru_lock); | 176 | spin_unlock_irq(&zone->lru_lock); |
| 168 | } | 177 | } |
| @@ -176,7 +185,8 @@ void activate_page(struct page *page) | |||
| 176 | */ | 185 | */ |
| 177 | void mark_page_accessed(struct page *page) | 186 | void mark_page_accessed(struct page *page) |
| 178 | { | 187 | { |
| 179 | if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { | 188 | if (!PageActive(page) && !PageUnevictable(page) && |
| 189 | PageReferenced(page) && PageLRU(page)) { | ||
| 180 | activate_page(page); | 190 | activate_page(page); |
| 181 | ClearPageReferenced(page); | 191 | ClearPageReferenced(page); |
| 182 | } else if (!PageReferenced(page)) { | 192 | } else if (!PageReferenced(page)) { |
| @@ -186,28 +196,73 @@ void mark_page_accessed(struct page *page) | |||
| 186 | 196 | ||
| 187 | EXPORT_SYMBOL(mark_page_accessed); | 197 | EXPORT_SYMBOL(mark_page_accessed); |
| 188 | 198 | ||
| 189 | /** | 199 | void __lru_cache_add(struct page *page, enum lru_list lru) |
| 190 | * lru_cache_add: add a page to the page lists | ||
| 191 | * @page: the page to add | ||
| 192 | */ | ||
| 193 | void lru_cache_add(struct page *page) | ||
| 194 | { | 200 | { |
| 195 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); | 201 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; |
| 196 | 202 | ||
| 197 | page_cache_get(page); | 203 | page_cache_get(page); |
| 198 | if (!pagevec_add(pvec, page)) | 204 | if (!pagevec_add(pvec, page)) |
| 199 | __pagevec_lru_add(pvec); | 205 | ____pagevec_lru_add(pvec, lru); |
| 200 | put_cpu_var(lru_add_pvecs); | 206 | put_cpu_var(lru_add_pvecs); |
| 201 | } | 207 | } |
| 202 | 208 | ||
| 203 | void lru_cache_add_active(struct page *page) | 209 | /** |
| 210 | * lru_cache_add_lru - add a page to a page list | ||
| 211 | * @page: the page to be added to the LRU. | ||
| 212 | * @lru: the LRU list to which the page is added. | ||
| 213 | */ | ||
| 214 | void lru_cache_add_lru(struct page *page, enum lru_list lru) | ||
| 204 | { | 215 | { |
| 205 | struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); | 216 | if (PageActive(page)) { |
| 217 | VM_BUG_ON(PageUnevictable(page)); | ||
| 218 | ClearPageActive(page); | ||
| 219 | } else if (PageUnevictable(page)) { | ||
| 220 | VM_BUG_ON(PageActive(page)); | ||
| 221 | ClearPageUnevictable(page); | ||
| 222 | } | ||
| 206 | 223 | ||
| 207 | page_cache_get(page); | 224 | VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); |
| 208 | if (!pagevec_add(pvec, page)) | 225 | __lru_cache_add(page, lru); |
| 209 | __pagevec_lru_add_active(pvec); | 226 | } |
| 210 | put_cpu_var(lru_add_active_pvecs); | 227 | |
| 228 | /** | ||
| 229 | * add_page_to_unevictable_list - add a page to the unevictable list | ||
| 230 | * @page: the page to be added to the unevictable list | ||
| 231 | * | ||
| 232 | * Add page directly to its zone's unevictable list. To avoid races with | ||
| 233 | * tasks that might be making the page evictable, through e.g. munlock, | ||
| 234 | * munmap or exit, while it's not on the lru, we want to add the page | ||
| 235 | * while it's locked or otherwise "invisible" to other tasks. This is | ||
| 236 | * difficult to do when using the pagevec cache, so bypass that. | ||
| 237 | */ | ||
| 238 | void add_page_to_unevictable_list(struct page *page) | ||
| 239 | { | ||
| 240 | struct zone *zone = page_zone(page); | ||
| 241 | |||
| 242 | spin_lock_irq(&zone->lru_lock); | ||
| 243 | SetPageUnevictable(page); | ||
| 244 | SetPageLRU(page); | ||
| 245 | add_page_to_lru_list(zone, page, LRU_UNEVICTABLE); | ||
| 246 | spin_unlock_irq(&zone->lru_lock); | ||
| 247 | } | ||
| 248 | |||
| 249 | /** | ||
| 250 | * lru_cache_add_active_or_unevictable | ||
| 251 | * @page: the page to be added to LRU | ||
| 252 | * @vma: vma in which page is mapped for determining reclaimability | ||
| 253 | * | ||
| 254 | * Place @page on the active or unevictable LRU list, depending on | ||
| 255 | * page_evictable(). Note that if the page is not evictable, | ||
| 256 | * it goes directly back onto its zone's unevictable list. It does | ||
| 257 | * NOT use a per cpu pagevec. | ||
| 258 | */ | ||
| 259 | void lru_cache_add_active_or_unevictable(struct page *page, | ||
| 260 | struct vm_area_struct *vma) | ||
| 261 | { | ||
| 262 | if (page_evictable(page, vma)) | ||
| 263 | lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page)); | ||
| 264 | else | ||
| 265 | add_page_to_unevictable_list(page); | ||
| 211 | } | 266 | } |
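The branch above hinges on page_evictable(), added to mm/vmscan.c by this series. A simplified sketch of the test it is assumed to perform (the real version also copes with races around PG_mlocked):

    static int page_evictable(struct page *page, struct vm_area_struct *vma)
    {
            if (mapping_unevictable(page_mapping(page)))
                    return 0;       /* e.g. an SHM_LOCK'd shmem mapping */
            if (PageMlocked(page) || (vma && (vma->vm_flags & VM_LOCKED)))
                    return 0;       /* mlocked page, or about to be */
            return 1;
    }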
| 212 | 267 | ||
| 213 | /* | 268 | /* |
| @@ -217,15 +272,15 @@ void lru_cache_add_active(struct page *page) | |||
| 217 | */ | 272 | */ |
| 218 | static void drain_cpu_pagevecs(int cpu) | 273 | static void drain_cpu_pagevecs(int cpu) |
| 219 | { | 274 | { |
| 275 | struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); | ||
| 220 | struct pagevec *pvec; | 276 | struct pagevec *pvec; |
| 277 | int lru; | ||
| 221 | 278 | ||
| 222 | pvec = &per_cpu(lru_add_pvecs, cpu); | 279 | for_each_lru(lru) { |
| 223 | if (pagevec_count(pvec)) | 280 | pvec = &pvecs[lru - LRU_BASE]; |
| 224 | __pagevec_lru_add(pvec); | 281 | if (pagevec_count(pvec)) |
| 225 | 282 | ____pagevec_lru_add(pvec, lru); | |
| 226 | pvec = &per_cpu(lru_add_active_pvecs, cpu); | 283 | } |
| 227 | if (pagevec_count(pvec)) | ||
| 228 | __pagevec_lru_add_active(pvec); | ||
| 229 | 284 | ||
| 230 | pvec = &per_cpu(lru_rotate_pvecs, cpu); | 285 | pvec = &per_cpu(lru_rotate_pvecs, cpu); |
| 231 | if (pagevec_count(pvec)) { | 286 | if (pagevec_count(pvec)) { |
| @@ -244,7 +299,7 @@ void lru_add_drain(void) | |||
| 244 | put_cpu(); | 299 | put_cpu(); |
| 245 | } | 300 | } |
| 246 | 301 | ||
| 247 | #ifdef CONFIG_NUMA | 302 | #if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU) |
| 248 | static void lru_add_drain_per_cpu(struct work_struct *dummy) | 303 | static void lru_add_drain_per_cpu(struct work_struct *dummy) |
| 249 | { | 304 | { |
| 250 | lru_add_drain(); | 305 | lru_add_drain(); |
| @@ -308,6 +363,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 308 | 363 | ||
| 309 | if (PageLRU(page)) { | 364 | if (PageLRU(page)) { |
| 310 | struct zone *pagezone = page_zone(page); | 365 | struct zone *pagezone = page_zone(page); |
| 366 | |||
| 311 | if (pagezone != zone) { | 367 | if (pagezone != zone) { |
| 312 | if (zone) | 368 | if (zone) |
| 313 | spin_unlock_irqrestore(&zone->lru_lock, | 369 | spin_unlock_irqrestore(&zone->lru_lock, |
| @@ -380,10 +436,11 @@ void __pagevec_release_nonlru(struct pagevec *pvec) | |||
| 380 | * Add the passed pages to the LRU, then drop the caller's refcount | 436 | * Add the passed pages to the LRU, then drop the caller's refcount |
| 381 | * on them. Reinitialises the caller's pagevec. | 437 | * on them. Reinitialises the caller's pagevec. |
| 382 | */ | 438 | */ |
| 383 | void __pagevec_lru_add(struct pagevec *pvec) | 439 | void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) |
| 384 | { | 440 | { |
| 385 | int i; | 441 | int i; |
| 386 | struct zone *zone = NULL; | 442 | struct zone *zone = NULL; |
| 443 | VM_BUG_ON(is_unevictable_lru(lru)); | ||
| 387 | 444 | ||
| 388 | for (i = 0; i < pagevec_count(pvec); i++) { | 445 | for (i = 0; i < pagevec_count(pvec); i++) { |
| 389 | struct page *page = pvec->pages[i]; | 446 | struct page *page = pvec->pages[i]; |
| @@ -395,9 +452,13 @@ void __pagevec_lru_add(struct pagevec *pvec) | |||
| 395 | zone = pagezone; | 452 | zone = pagezone; |
| 396 | spin_lock_irq(&zone->lru_lock); | 453 | spin_lock_irq(&zone->lru_lock); |
| 397 | } | 454 | } |
| 455 | VM_BUG_ON(PageActive(page)); | ||
| 456 | VM_BUG_ON(PageUnevictable(page)); | ||
| 398 | VM_BUG_ON(PageLRU(page)); | 457 | VM_BUG_ON(PageLRU(page)); |
| 399 | SetPageLRU(page); | 458 | SetPageLRU(page); |
| 400 | add_page_to_inactive_list(zone, page); | 459 | if (is_active_lru(lru)) |
| 460 | SetPageActive(page); | ||
| 461 | add_page_to_lru_list(zone, page, lru); | ||
| 401 | } | 462 | } |
| 402 | if (zone) | 463 | if (zone) |
| 403 | spin_unlock_irq(&zone->lru_lock); | 464 | spin_unlock_irq(&zone->lru_lock); |
| @@ -405,48 +466,45 @@ void __pagevec_lru_add(struct pagevec *pvec) | |||
| 405 | pagevec_reinit(pvec); | 466 | pagevec_reinit(pvec); |
| 406 | } | 467 | } |
| 407 | 468 | ||
| 408 | EXPORT_SYMBOL(__pagevec_lru_add); | 469 | EXPORT_SYMBOL(____pagevec_lru_add); |
| 409 | 470 | ||
| 410 | void __pagevec_lru_add_active(struct pagevec *pvec) | 471 | /* |
| 472 | * Try to drop buffers from the pages in a pagevec | ||
| 473 | */ | ||
| 474 | void pagevec_strip(struct pagevec *pvec) | ||
| 411 | { | 475 | { |
| 412 | int i; | 476 | int i; |
| 413 | struct zone *zone = NULL; | ||
| 414 | 477 | ||
| 415 | for (i = 0; i < pagevec_count(pvec); i++) { | 478 | for (i = 0; i < pagevec_count(pvec); i++) { |
| 416 | struct page *page = pvec->pages[i]; | 479 | struct page *page = pvec->pages[i]; |
| 417 | struct zone *pagezone = page_zone(page); | ||
| 418 | 480 | ||
| 419 | if (pagezone != zone) { | 481 | if (PagePrivate(page) && trylock_page(page)) { |
| 420 | if (zone) | 482 | if (PagePrivate(page)) |
| 421 | spin_unlock_irq(&zone->lru_lock); | 483 | try_to_release_page(page, 0); |
| 422 | zone = pagezone; | 484 | unlock_page(page); |
| 423 | spin_lock_irq(&zone->lru_lock); | ||
| 424 | } | 485 | } |
| 425 | VM_BUG_ON(PageLRU(page)); | ||
| 426 | SetPageLRU(page); | ||
| 427 | VM_BUG_ON(PageActive(page)); | ||
| 428 | SetPageActive(page); | ||
| 429 | add_page_to_active_list(zone, page); | ||
| 430 | } | 486 | } |
| 431 | if (zone) | ||
| 432 | spin_unlock_irq(&zone->lru_lock); | ||
| 433 | release_pages(pvec->pages, pvec->nr, pvec->cold); | ||
| 434 | pagevec_reinit(pvec); | ||
| 435 | } | 487 | } |
| 436 | 488 | ||
| 437 | /* | 489 | /** |
| 438 | * Try to drop buffers from the pages in a pagevec | 490 | * pagevec_swap_free - try to free swap space from the pages in a pagevec |
| 491 | * @pvec: pagevec with swapcache pages to free the swap space of | ||
| 492 | * | ||
| 493 | * The caller needs to hold an extra reference to each page and | ||
| 494 | * not hold the page lock on the pages. This function uses a | ||
| 495 | * trylock on the page lock so it may not always free the swap | ||
| 496 | * space associated with a page. | ||
| 439 | */ | 497 | */ |
| 440 | void pagevec_strip(struct pagevec *pvec) | 498 | void pagevec_swap_free(struct pagevec *pvec) |
| 441 | { | 499 | { |
| 442 | int i; | 500 | int i; |
| 443 | 501 | ||
| 444 | for (i = 0; i < pagevec_count(pvec); i++) { | 502 | for (i = 0; i < pagevec_count(pvec); i++) { |
| 445 | struct page *page = pvec->pages[i]; | 503 | struct page *page = pvec->pages[i]; |
| 446 | 504 | ||
| 447 | if (PagePrivate(page) && trylock_page(page)) { | 505 | if (PageSwapCache(page) && trylock_page(page)) { |
| 448 | if (PagePrivate(page)) | 506 | if (PageSwapCache(page)) |
| 449 | try_to_release_page(page, 0); | 507 | remove_exclusive_swap_page_ref(page); |
| 450 | unlock_page(page); | 508 | unlock_page(page); |
| 451 | } | 509 | } |
| 452 | } | 510 | } |
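A typical caller pattern for the new helper, paraphrased from the mm/vmscan.c changes in this series (not the exact hunk): while scanning the active list with its own reference held on each page, reclaim swap slots opportunistically once swap is nearly full:

    if (vm_swap_full())
            pagevec_swap_free(&pvec);       /* best effort: trylocks each page */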
diff --git a/mm/swap_state.c b/mm/swap_state.c index 797c3831cbec..3353c9029cef 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = { | |||
| 33 | }; | 33 | }; |
| 34 | 34 | ||
| 35 | static struct backing_dev_info swap_backing_dev_info = { | 35 | static struct backing_dev_info swap_backing_dev_info = { |
| 36 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 36 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, |
| 37 | .unplug_io_fn = swap_unplug_io_fn, | 37 | .unplug_io_fn = swap_unplug_io_fn, |
| 38 | }; | 38 | }; |
| 39 | 39 | ||
| @@ -75,6 +75,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
| 75 | BUG_ON(!PageLocked(page)); | 75 | BUG_ON(!PageLocked(page)); |
| 76 | BUG_ON(PageSwapCache(page)); | 76 | BUG_ON(PageSwapCache(page)); |
| 77 | BUG_ON(PagePrivate(page)); | 77 | BUG_ON(PagePrivate(page)); |
| 78 | BUG_ON(!PageSwapBacked(page)); | ||
| 78 | error = radix_tree_preload(gfp_mask); | 79 | error = radix_tree_preload(gfp_mask); |
| 79 | if (!error) { | 80 | if (!error) { |
| 80 | page_cache_get(page); | 81 | page_cache_get(page); |
| @@ -302,17 +303,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
| 302 | * re-using the just freed swap entry for an existing page. | 303 | * re-using the just freed swap entry for an existing page. |
| 303 | * May fail (-ENOMEM) if radix-tree node allocation failed. | 304 | * May fail (-ENOMEM) if radix-tree node allocation failed. |
| 304 | */ | 305 | */ |
| 305 | set_page_locked(new_page); | 306 | __set_page_locked(new_page); |
| 307 | SetPageSwapBacked(new_page); | ||
| 306 | err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); | 308 | err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); |
| 307 | if (likely(!err)) { | 309 | if (likely(!err)) { |
| 308 | /* | 310 | /* |
| 309 | * Initiate read into locked page and return. | 311 | * Initiate read into locked page and return. |
| 310 | */ | 312 | */ |
| 311 | lru_cache_add_active(new_page); | 313 | lru_cache_add_anon(new_page); |
| 312 | swap_readpage(NULL, new_page); | 314 | swap_readpage(NULL, new_page); |
| 313 | return new_page; | 315 | return new_page; |
| 314 | } | 316 | } |
| 315 | clear_page_locked(new_page); | 317 | ClearPageSwapBacked(new_page); |
| 318 | __clear_page_locked(new_page); | ||
| 316 | swap_free(entry); | 319 | swap_free(entry); |
| 317 | } while (err != -ENOMEM); | 320 | } while (err != -ENOMEM); |
| 318 | 321 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index 1e330f2998fa..90cb67a5417c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -344,7 +344,7 @@ int can_share_swap_page(struct page *page) | |||
| 344 | * Work out if there are any other processes sharing this | 344 | * Work out if there are any other processes sharing this |
| 345 | * swap cache page. Free it if you can. Return success. | 345 | * swap cache page. Free it if you can. Return success. |
| 346 | */ | 346 | */ |
| 347 | int remove_exclusive_swap_page(struct page *page) | 347 | static int remove_exclusive_swap_page_count(struct page *page, int count) |
| 348 | { | 348 | { |
| 349 | int retval; | 349 | int retval; |
| 350 | struct swap_info_struct * p; | 350 | struct swap_info_struct * p; |
| @@ -357,7 +357,7 @@ int remove_exclusive_swap_page(struct page *page) | |||
| 357 | return 0; | 357 | return 0; |
| 358 | if (PageWriteback(page)) | 358 | if (PageWriteback(page)) |
| 359 | return 0; | 359 | return 0; |
| 360 | if (page_count(page) != 2) /* 2: us + cache */ | 360 | if (page_count(page) != count) /* us + cache + ptes */ |
| 361 | return 0; | 361 | return 0; |
| 362 | 362 | ||
| 363 | entry.val = page_private(page); | 363 | entry.val = page_private(page); |
| @@ -370,7 +370,7 @@ int remove_exclusive_swap_page(struct page *page) | |||
| 370 | if (p->swap_map[swp_offset(entry)] == 1) { | 370 | if (p->swap_map[swp_offset(entry)] == 1) { |
| 371 | /* Recheck the page count with the swapcache lock held.. */ | 371 | /* Recheck the page count with the swapcache lock held.. */ |
| 372 | spin_lock_irq(&swapper_space.tree_lock); | 372 | spin_lock_irq(&swapper_space.tree_lock); |
| 373 | if ((page_count(page) == 2) && !PageWriteback(page)) { | 373 | if ((page_count(page) == count) && !PageWriteback(page)) { |
| 374 | __delete_from_swap_cache(page); | 374 | __delete_from_swap_cache(page); |
| 375 | SetPageDirty(page); | 375 | SetPageDirty(page); |
| 376 | retval = 1; | 376 | retval = 1; |
| @@ -388,6 +388,25 @@ int remove_exclusive_swap_page(struct page *page) | |||
| 388 | } | 388 | } |
| 389 | 389 | ||
| 390 | /* | 390 | /* |
| 391 | * Most of the time the page should have two references: one for the | ||
| 392 | * process and one for the swap cache. | ||
| 393 | */ | ||
| 394 | int remove_exclusive_swap_page(struct page *page) | ||
| 395 | { | ||
| 396 | return remove_exclusive_swap_page_count(page, 2); | ||
| 397 | } | ||
| 398 | |||
| 399 | /* | ||
| 400 | * The pageout code holds an extra reference to the page. That raises | ||
| 401 | * the reference count to test for to 2 for a page that is only in the | ||
| 402 | * swap cache plus 1 for each process that maps the page. | ||
| 403 | */ | ||
| 404 | int remove_exclusive_swap_page_ref(struct page *page) | ||
| 405 | { | ||
| 406 | return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page)); | ||
| 407 | } | ||
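The threshold passed by the two wrappers works out as follows. As an illustrative count (not taken from the patch), a swap-cache page mapped by three processes and held by the pageout code is pinned by:

    page_count(page) == 1 (swap cache) + 1 (pageout's extra reference) + page_mapcount(page)
                     == 2 + 3 == 5

so only when page_count() equals 2 + page_mapcount() can the page be treated as exclusively owned and pulled back out of the swap cache.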
| 408 | |||
| 409 | /* | ||
| 391 | * Free the swap entry like above, but also try to | 410 | * Free the swap entry like above, but also try to |
| 392 | * free the page cache entry if it is the last user. | 411 | * free the page cache entry if it is the last user. |
| 393 | */ | 412 | */ |
| @@ -403,7 +422,7 @@ void free_swap_and_cache(swp_entry_t entry) | |||
| 403 | if (p) { | 422 | if (p) { |
| 404 | if (swap_entry_free(p, swp_offset(entry)) == 1) { | 423 | if (swap_entry_free(p, swp_offset(entry)) == 1) { |
| 405 | page = find_get_page(&swapper_space, entry.val); | 424 | page = find_get_page(&swapper_space, entry.val); |
| 406 | if (page && unlikely(!trylock_page(page))) { | 425 | if (page && !trylock_page(page)) { |
| 407 | page_cache_release(page); | 426 | page_cache_release(page); |
| 408 | page = NULL; | 427 | page = NULL; |
| 409 | } | 428 | } |
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index 8d7a27a6335c..3e67d575ee6e 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c | |||
| @@ -95,6 +95,7 @@ put_dentry: | |||
| 95 | put_memory: | 95 | put_memory: |
| 96 | return ERR_PTR(error); | 96 | return ERR_PTR(error); |
| 97 | } | 97 | } |
| 98 | EXPORT_SYMBOL_GPL(shmem_file_setup); | ||
| 98 | 99 | ||
| 99 | /** | 100 | /** |
| 100 | * shmem_zero_setup - setup a shared anonymous mapping | 101 | * shmem_zero_setup - setup a shared anonymous mapping |
diff --git a/mm/truncate.c b/mm/truncate.c index 6650c1d878b4..1229211104f8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -3,7 +3,7 @@ | |||
| 3 | * | 3 | * |
| 4 | * Copyright (C) 2002, Linus Torvalds | 4 | * Copyright (C) 2002, Linus Torvalds |
| 5 | * | 5 | * |
| 6 | * 10Sep2002 akpm@zip.com.au | 6 | * 10Sep2002 Andrew Morton |
| 7 | * Initial version. | 7 | * Initial version. |
| 8 | */ | 8 | */ |
| 9 | 9 | ||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/task_io_accounting_ops.h> | 18 | #include <linux/task_io_accounting_ops.h> |
| 19 | #include <linux/buffer_head.h> /* grr. try_to_release_page, | 19 | #include <linux/buffer_head.h> /* grr. try_to_release_page, |
| 20 | do_invalidatepage */ | 20 | do_invalidatepage */ |
| 21 | #include "internal.h" | ||
| 21 | 22 | ||
| 22 | 23 | ||
| 23 | /** | 24 | /** |
| @@ -103,6 +104,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
| 103 | 104 | ||
| 104 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 105 | cancel_dirty_page(page, PAGE_CACHE_SIZE); |
| 105 | 106 | ||
| 107 | clear_page_mlock(page); | ||
| 106 | remove_from_page_cache(page); | 108 | remove_from_page_cache(page); |
| 107 | ClearPageMappedToDisk(page); | 109 | ClearPageMappedToDisk(page); |
| 108 | page_cache_release(page); /* pagecache ref */ | 110 | page_cache_release(page); /* pagecache ref */ |
| @@ -127,6 +129,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
| 127 | if (PagePrivate(page) && !try_to_release_page(page, 0)) | 129 | if (PagePrivate(page) && !try_to_release_page(page, 0)) |
| 128 | return 0; | 130 | return 0; |
| 129 | 131 | ||
| 132 | clear_page_mlock(page); | ||
| 130 | ret = remove_mapping(mapping, page); | 133 | ret = remove_mapping(mapping, page); |
| 131 | 134 | ||
| 132 | return ret; | 135 | return ret; |
| @@ -352,6 +355,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
| 352 | if (PageDirty(page)) | 355 | if (PageDirty(page)) |
| 353 | goto failed; | 356 | goto failed; |
| 354 | 357 | ||
| 358 | clear_page_mlock(page); | ||
| 355 | BUG_ON(PagePrivate(page)); | 359 | BUG_ON(PagePrivate(page)); |
| 356 | __remove_from_page_cache(page); | 360 | __remove_from_page_cache(page); |
| 357 | spin_unlock_irq(&mapping->tree_lock); | 361 | spin_unlock_irq(&mapping->tree_lock); |
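clear_page_mlock(), added to mm/internal.h by this series, strips the mlock state from a page that is about to leave the page cache so the zone's NR_MLOCK accounting stays balanced. A simplified sketch of its assumed shape (the real version also updates the unevictable event counters):

    static inline void clear_page_mlock(struct page *page)
    {
            VM_BUG_ON(!PageLocked(page));
            if (unlikely(TestClearPageMlocked(page)))
                    dec_zone_page_state(page, NR_MLOCK);
    }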
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 85b9a0d2c877..ba6b0f5f7fac 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -8,26 +8,28 @@ | |||
| 8 | * Numa awareness, Christoph Lameter, SGI, June 2005 | 8 | * Numa awareness, Christoph Lameter, SGI, June 2005 |
| 9 | */ | 9 | */ |
| 10 | 10 | ||
| 11 | #include <linux/vmalloc.h> | ||
| 11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
| 12 | #include <linux/module.h> | 13 | #include <linux/module.h> |
| 13 | #include <linux/highmem.h> | 14 | #include <linux/highmem.h> |
| 14 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
| 15 | #include <linux/spinlock.h> | 16 | #include <linux/spinlock.h> |
| 16 | #include <linux/interrupt.h> | 17 | #include <linux/interrupt.h> |
| 18 | #include <linux/proc_fs.h> | ||
| 17 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
| 18 | #include <linux/debugobjects.h> | 20 | #include <linux/debugobjects.h> |
| 19 | #include <linux/vmalloc.h> | ||
| 20 | #include <linux/kallsyms.h> | 21 | #include <linux/kallsyms.h> |
| 22 | #include <linux/list.h> | ||
| 23 | #include <linux/rbtree.h> | ||
| 24 | #include <linux/radix-tree.h> | ||
| 25 | #include <linux/rcupdate.h> | ||
| 21 | 26 | ||
| 27 | #include <asm/atomic.h> | ||
| 22 | #include <asm/uaccess.h> | 28 | #include <asm/uaccess.h> |
| 23 | #include <asm/tlbflush.h> | 29 | #include <asm/tlbflush.h> |
| 24 | 30 | ||
| 25 | 31 | ||
| 26 | DEFINE_RWLOCK(vmlist_lock); | 32 | /*** Page table manipulation functions ***/ |
| 27 | struct vm_struct *vmlist; | ||
| 28 | |||
| 29 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | ||
| 30 | int node, void *caller); | ||
| 31 | 33 | ||
| 32 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | 34 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) |
| 33 | { | 35 | { |
| @@ -40,8 +42,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | |||
| 40 | } while (pte++, addr += PAGE_SIZE, addr != end); | 42 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| 41 | } | 43 | } |
| 42 | 44 | ||
| 43 | static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, | 45 | static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) |
| 44 | unsigned long end) | ||
| 45 | { | 46 | { |
| 46 | pmd_t *pmd; | 47 | pmd_t *pmd; |
| 47 | unsigned long next; | 48 | unsigned long next; |
| @@ -55,8 +56,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, | |||
| 55 | } while (pmd++, addr = next, addr != end); | 56 | } while (pmd++, addr = next, addr != end); |
| 56 | } | 57 | } |
| 57 | 58 | ||
| 58 | static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, | 59 | static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) |
| 59 | unsigned long end) | ||
| 60 | { | 60 | { |
| 61 | pud_t *pud; | 61 | pud_t *pud; |
| 62 | unsigned long next; | 62 | unsigned long next; |
| @@ -70,12 +70,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, | |||
| 70 | } while (pud++, addr = next, addr != end); | 70 | } while (pud++, addr = next, addr != end); |
| 71 | } | 71 | } |
| 72 | 72 | ||
| 73 | void unmap_kernel_range(unsigned long addr, unsigned long size) | 73 | static void vunmap_page_range(unsigned long addr, unsigned long end) |
| 74 | { | 74 | { |
| 75 | pgd_t *pgd; | 75 | pgd_t *pgd; |
| 76 | unsigned long next; | 76 | unsigned long next; |
| 77 | unsigned long start = addr; | ||
| 78 | unsigned long end = addr + size; | ||
| 79 | 77 | ||
| 80 | BUG_ON(addr >= end); | 78 | BUG_ON(addr >= end); |
| 81 | pgd = pgd_offset_k(addr); | 79 | pgd = pgd_offset_k(addr); |
| @@ -86,35 +84,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) | |||
| 86 | continue; | 84 | continue; |
| 87 | vunmap_pud_range(pgd, addr, next); | 85 | vunmap_pud_range(pgd, addr, next); |
| 88 | } while (pgd++, addr = next, addr != end); | 86 | } while (pgd++, addr = next, addr != end); |
| 89 | flush_tlb_kernel_range(start, end); | ||
| 90 | } | ||
| 91 | |||
| 92 | static void unmap_vm_area(struct vm_struct *area) | ||
| 93 | { | ||
| 94 | unmap_kernel_range((unsigned long)area->addr, area->size); | ||
| 95 | } | 87 | } |
| 96 | 88 | ||
| 97 | static int vmap_pte_range(pmd_t *pmd, unsigned long addr, | 89 | static int vmap_pte_range(pmd_t *pmd, unsigned long addr, |
| 98 | unsigned long end, pgprot_t prot, struct page ***pages) | 90 | unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
| 99 | { | 91 | { |
| 100 | pte_t *pte; | 92 | pte_t *pte; |
| 101 | 93 | ||
| 94 | /* | ||
| 95 | * nr is a running index into the array which helps higher level | ||
| 96 | * callers keep track of where we're up to. | ||
| 97 | */ | ||
| 98 | |||
| 102 | pte = pte_alloc_kernel(pmd, addr); | 99 | pte = pte_alloc_kernel(pmd, addr); |
| 103 | if (!pte) | 100 | if (!pte) |
| 104 | return -ENOMEM; | 101 | return -ENOMEM; |
| 105 | do { | 102 | do { |
| 106 | struct page *page = **pages; | 103 | struct page *page = pages[*nr]; |
| 107 | WARN_ON(!pte_none(*pte)); | 104 | |
| 108 | if (!page) | 105 | if (WARN_ON(!pte_none(*pte))) |
| 106 | return -EBUSY; | ||
| 107 | if (WARN_ON(!page)) | ||
| 109 | return -ENOMEM; | 108 | return -ENOMEM; |
| 110 | set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); | 109 | set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); |
| 111 | (*pages)++; | 110 | (*nr)++; |
| 112 | } while (pte++, addr += PAGE_SIZE, addr != end); | 111 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| 113 | return 0; | 112 | return 0; |
| 114 | } | 113 | } |
| 115 | 114 | ||
| 116 | static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, | 115 | static int vmap_pmd_range(pud_t *pud, unsigned long addr, |
| 117 | unsigned long end, pgprot_t prot, struct page ***pages) | 116 | unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
| 118 | { | 117 | { |
| 119 | pmd_t *pmd; | 118 | pmd_t *pmd; |
| 120 | unsigned long next; | 119 | unsigned long next; |
| @@ -124,14 +123,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, | |||
| 124 | return -ENOMEM; | 123 | return -ENOMEM; |
| 125 | do { | 124 | do { |
| 126 | next = pmd_addr_end(addr, end); | 125 | next = pmd_addr_end(addr, end); |
| 127 | if (vmap_pte_range(pmd, addr, next, prot, pages)) | 126 | if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) |
| 128 | return -ENOMEM; | 127 | return -ENOMEM; |
| 129 | } while (pmd++, addr = next, addr != end); | 128 | } while (pmd++, addr = next, addr != end); |
| 130 | return 0; | 129 | return 0; |
| 131 | } | 130 | } |
| 132 | 131 | ||
| 133 | static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, | 132 | static int vmap_pud_range(pgd_t *pgd, unsigned long addr, |
| 134 | unsigned long end, pgprot_t prot, struct page ***pages) | 133 | unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
| 135 | { | 134 | { |
| 136 | pud_t *pud; | 135 | pud_t *pud; |
| 137 | unsigned long next; | 136 | unsigned long next; |
| @@ -141,50 +140,78 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, | |||
| 141 | return -ENOMEM; | 140 | return -ENOMEM; |
| 142 | do { | 141 | do { |
| 143 | next = pud_addr_end(addr, end); | 142 | next = pud_addr_end(addr, end); |
| 144 | if (vmap_pmd_range(pud, addr, next, prot, pages)) | 143 | if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) |
| 145 | return -ENOMEM; | 144 | return -ENOMEM; |
| 146 | } while (pud++, addr = next, addr != end); | 145 | } while (pud++, addr = next, addr != end); |
| 147 | return 0; | 146 | return 0; |
| 148 | } | 147 | } |
| 149 | 148 | ||
| 150 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | 149 | /* |
| 150 | * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and | ||
| 151 | * will have pfns corresponding to the "pages" array. | ||
| 152 | * | ||
| 153 | * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] | ||
| 154 | */ | ||
| 155 | static int vmap_page_range(unsigned long addr, unsigned long end, | ||
| 156 | pgprot_t prot, struct page **pages) | ||
| 151 | { | 157 | { |
| 152 | pgd_t *pgd; | 158 | pgd_t *pgd; |
| 153 | unsigned long next; | 159 | unsigned long next; |
| 154 | unsigned long addr = (unsigned long) area->addr; | 160 | int err = 0; |
| 155 | unsigned long end = addr + area->size - PAGE_SIZE; | 161 | int nr = 0; |
| 156 | int err; | ||
| 157 | 162 | ||
| 158 | BUG_ON(addr >= end); | 163 | BUG_ON(addr >= end); |
| 159 | pgd = pgd_offset_k(addr); | 164 | pgd = pgd_offset_k(addr); |
| 160 | do { | 165 | do { |
| 161 | next = pgd_addr_end(addr, end); | 166 | next = pgd_addr_end(addr, end); |
| 162 | err = vmap_pud_range(pgd, addr, next, prot, pages); | 167 | err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); |
| 163 | if (err) | 168 | if (err) |
| 164 | break; | 169 | break; |
| 165 | } while (pgd++, addr = next, addr != end); | 170 | } while (pgd++, addr = next, addr != end); |
| 166 | flush_cache_vmap((unsigned long) area->addr, end); | 171 | flush_cache_vmap(addr, end); |
| 167 | return err; | 172 | |
| 173 | if (unlikely(err)) | ||
| 174 | return err; | ||
| 175 | return nr; | ||
| 176 | } | ||
| 177 | |||
| 178 | static inline int is_vmalloc_or_module_addr(const void *x) | ||
| 179 | { | ||
| 180 | /* | ||
| 181 | * ARM, x86-64 and sparc64 put modules in a special place, | ||
| 182 | * and fall back on vmalloc() if that fails. Others | ||
| 183 | * just put it in the vmalloc space. | ||
| 184 | */ | ||
| 185 | #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) | ||
| 186 | unsigned long addr = (unsigned long)x; | ||
| 187 | if (addr >= MODULES_VADDR && addr < MODULES_END) | ||
| 188 | return 1; | ||
| 189 | #endif | ||
| 190 | return is_vmalloc_addr(x); | ||
| 168 | } | 191 | } |
| 169 | EXPORT_SYMBOL_GPL(map_vm_area); | ||
| 170 | 192 | ||
| 171 | /* | 193 | /* |
| 172 | * Map a vmalloc()-space virtual address to the physical page. | 194 | * Walk a vmap address to the struct page it maps. |
| 173 | */ | 195 | */ |
| 174 | struct page *vmalloc_to_page(const void *vmalloc_addr) | 196 | struct page *vmalloc_to_page(const void *vmalloc_addr) |
| 175 | { | 197 | { |
| 176 | unsigned long addr = (unsigned long) vmalloc_addr; | 198 | unsigned long addr = (unsigned long) vmalloc_addr; |
| 177 | struct page *page = NULL; | 199 | struct page *page = NULL; |
| 178 | pgd_t *pgd = pgd_offset_k(addr); | 200 | pgd_t *pgd = pgd_offset_k(addr); |
| 179 | pud_t *pud; | 201 | |
| 180 | pmd_t *pmd; | 202 | /* |
| 181 | pte_t *ptep, pte; | 203 | * XXX we might need to change this if we add VIRTUAL_BUG_ON for |
| 204 | * architectures that do not vmalloc module space | ||
| 205 | */ | ||
| 206 | VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); | ||
| 182 | 207 | ||
| 183 | if (!pgd_none(*pgd)) { | 208 | if (!pgd_none(*pgd)) { |
| 184 | pud = pud_offset(pgd, addr); | 209 | pud_t *pud = pud_offset(pgd, addr); |
| 185 | if (!pud_none(*pud)) { | 210 | if (!pud_none(*pud)) { |
| 186 | pmd = pmd_offset(pud, addr); | 211 | pmd_t *pmd = pmd_offset(pud, addr); |
| 187 | if (!pmd_none(*pmd)) { | 212 | if (!pmd_none(*pmd)) { |
| 213 | pte_t *ptep, pte; | ||
| 214 | |||
| 188 | ptep = pte_offset_map(pmd, addr); | 215 | ptep = pte_offset_map(pmd, addr); |
| 189 | pte = *ptep; | 216 | pte = *ptep; |
| 190 | if (pte_present(pte)) | 217 | if (pte_present(pte)) |
| @@ -206,13 +233,759 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) | |||
| 206 | } | 233 | } |
| 207 | EXPORT_SYMBOL(vmalloc_to_pfn); | 234 | EXPORT_SYMBOL(vmalloc_to_pfn); |
| 208 | 235 | ||
| 209 | static struct vm_struct * | 236 | |
| 210 | __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, | 237 | /*** Global kva allocator ***/ |
| 211 | unsigned long end, int node, gfp_t gfp_mask, void *caller) | 238 | |
| 239 | #define VM_LAZY_FREE 0x01 | ||
| 240 | #define VM_LAZY_FREEING 0x02 | ||
| 241 | #define VM_VM_AREA 0x04 | ||
| 242 | |||
| 243 | struct vmap_area { | ||
| 244 | unsigned long va_start; | ||
| 245 | unsigned long va_end; | ||
| 246 | unsigned long flags; | ||
| 247 | struct rb_node rb_node; /* address sorted rbtree */ | ||
| 248 | struct list_head list; /* address sorted list */ | ||
| 249 | struct list_head purge_list; /* "lazy purge" list */ | ||
| 250 | void *private; | ||
| 251 | struct rcu_head rcu_head; | ||
| 252 | }; | ||
| 253 | |||
| 254 | static DEFINE_SPINLOCK(vmap_area_lock); | ||
| 255 | static struct rb_root vmap_area_root = RB_ROOT; | ||
| 256 | static LIST_HEAD(vmap_area_list); | ||
| 257 | |||
| 258 | static struct vmap_area *__find_vmap_area(unsigned long addr) | ||
| 212 | { | 259 | { |
| 213 | struct vm_struct **p, *tmp, *area; | 260 | struct rb_node *n = vmap_area_root.rb_node; |
| 214 | unsigned long align = 1; | 261 | |
| 262 | while (n) { | ||
| 263 | struct vmap_area *va; | ||
| 264 | |||
| 265 | va = rb_entry(n, struct vmap_area, rb_node); | ||
| 266 | if (addr < va->va_start) | ||
| 267 | n = n->rb_left; | ||
| 268 | else if (addr > va->va_start) | ||
| 269 | n = n->rb_right; | ||
| 270 | else | ||
| 271 | return va; | ||
| 272 | } | ||
| 273 | |||
| 274 | return NULL; | ||
| 275 | } | ||
| 276 | |||
| 277 | static void __insert_vmap_area(struct vmap_area *va) | ||
| 278 | { | ||
| 279 | struct rb_node **p = &vmap_area_root.rb_node; | ||
| 280 | struct rb_node *parent = NULL; | ||
| 281 | struct rb_node *tmp; | ||
| 282 | |||
| 283 | while (*p) { | ||
| 284 | struct vmap_area *tmp; | ||
| 285 | |||
| 286 | parent = *p; | ||
| 287 | tmp = rb_entry(parent, struct vmap_area, rb_node); | ||
| 288 | if (va->va_start < tmp->va_end) | ||
| 289 | p = &(*p)->rb_left; | ||
| 290 | else if (va->va_end > tmp->va_start) | ||
| 291 | p = &(*p)->rb_right; | ||
| 292 | else | ||
| 293 | BUG(); | ||
| 294 | } | ||
| 295 | |||
| 296 | rb_link_node(&va->rb_node, parent, p); | ||
| 297 | rb_insert_color(&va->rb_node, &vmap_area_root); | ||
| 298 | |||
| 299 | /* address-sort this list so it is usable like the vmlist */ | ||
| 300 | tmp = rb_prev(&va->rb_node); | ||
| 301 | if (tmp) { | ||
| 302 | struct vmap_area *prev; | ||
| 303 | prev = rb_entry(tmp, struct vmap_area, rb_node); | ||
| 304 | list_add_rcu(&va->list, &prev->list); | ||
| 305 | } else | ||
| 306 | list_add_rcu(&va->list, &vmap_area_list); | ||
| 307 | } | ||
| 308 | |||
| 309 | static void purge_vmap_area_lazy(void); | ||
| 310 | |||
| 311 | /* | ||
| 312 | * Allocate a region of KVA of the specified size and alignment, within the | ||
| 313 | * vstart and vend. | ||
| 314 | */ | ||
| 315 | static struct vmap_area *alloc_vmap_area(unsigned long size, | ||
| 316 | unsigned long align, | ||
| 317 | unsigned long vstart, unsigned long vend, | ||
| 318 | int node, gfp_t gfp_mask) | ||
| 319 | { | ||
| 320 | struct vmap_area *va; | ||
| 321 | struct rb_node *n; | ||
| 322 | unsigned long addr; | ||
| 323 | int purged = 0; | ||
| 324 | |||
| 325 | BUG_ON(size & ~PAGE_MASK); | ||
| 326 | |||
| 327 | addr = ALIGN(vstart, align); | ||
| 328 | |||
| 329 | va = kmalloc_node(sizeof(struct vmap_area), | ||
| 330 | gfp_mask & GFP_RECLAIM_MASK, node); | ||
| 331 | if (unlikely(!va)) | ||
| 332 | return ERR_PTR(-ENOMEM); | ||
| 333 | |||
| 334 | retry: | ||
| 335 | spin_lock(&vmap_area_lock); | ||
| 336 | /* XXX: could have a last_hole cache */ | ||
| 337 | n = vmap_area_root.rb_node; | ||
| 338 | if (n) { | ||
| 339 | struct vmap_area *first = NULL; | ||
| 340 | |||
| 341 | do { | ||
| 342 | struct vmap_area *tmp; | ||
| 343 | tmp = rb_entry(n, struct vmap_area, rb_node); | ||
| 344 | if (tmp->va_end >= addr) { | ||
| 345 | if (!first && tmp->va_start < addr + size) | ||
| 346 | first = tmp; | ||
| 347 | n = n->rb_left; | ||
| 348 | } else { | ||
| 349 | first = tmp; | ||
| 350 | n = n->rb_right; | ||
| 351 | } | ||
| 352 | } while (n); | ||
| 353 | |||
| 354 | if (!first) | ||
| 355 | goto found; | ||
| 356 | |||
| 357 | if (first->va_end < addr) { | ||
| 358 | n = rb_next(&first->rb_node); | ||
| 359 | if (n) | ||
| 360 | first = rb_entry(n, struct vmap_area, rb_node); | ||
| 361 | else | ||
| 362 | goto found; | ||
| 363 | } | ||
| 364 | |||
| 365 | while (addr + size >= first->va_start && addr + size <= vend) { | ||
| 366 | addr = ALIGN(first->va_end + PAGE_SIZE, align); | ||
| 367 | |||
| 368 | n = rb_next(&first->rb_node); | ||
| 369 | if (n) | ||
| 370 | first = rb_entry(n, struct vmap_area, rb_node); | ||
| 371 | else | ||
| 372 | goto found; | ||
| 373 | } | ||
| 374 | } | ||
| 375 | found: | ||
| 376 | if (addr + size > vend) { | ||
| 377 | spin_unlock(&vmap_area_lock); | ||
| 378 | if (!purged) { | ||
| 379 | purge_vmap_area_lazy(); | ||
| 380 | purged = 1; | ||
| 381 | goto retry; | ||
| 382 | } | ||
| 383 | if (printk_ratelimit()) | ||
| 384 | printk(KERN_WARNING "vmap allocation failed: " | ||
| 385 | "use vmalloc=<size> to increase size.\n"); | ||
| 386 | return ERR_PTR(-EBUSY); | ||
| 387 | } | ||
| 388 | |||
| 389 | BUG_ON(addr & (align-1)); | ||
| 390 | |||
| 391 | va->va_start = addr; | ||
| 392 | va->va_end = addr + size; | ||
| 393 | va->flags = 0; | ||
| 394 | __insert_vmap_area(va); | ||
| 395 | spin_unlock(&vmap_area_lock); | ||
| 396 | |||
| 397 | return va; | ||
| 398 | } | ||
| 399 | |||
| 400 | static void rcu_free_va(struct rcu_head *head) | ||
| 401 | { | ||
| 402 | struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); | ||
| 403 | |||
| 404 | kfree(va); | ||
| 405 | } | ||
| 406 | |||
| 407 | static void __free_vmap_area(struct vmap_area *va) | ||
| 408 | { | ||
| 409 | BUG_ON(RB_EMPTY_NODE(&va->rb_node)); | ||
| 410 | rb_erase(&va->rb_node, &vmap_area_root); | ||
| 411 | RB_CLEAR_NODE(&va->rb_node); | ||
| 412 | list_del_rcu(&va->list); | ||
| 413 | |||
| 414 | call_rcu(&va->rcu_head, rcu_free_va); | ||
| 415 | } | ||
| 416 | |||
| 417 | /* | ||
| 418 | * Free a region of KVA allocated by alloc_vmap_area | ||
| 419 | */ | ||
| 420 | static void free_vmap_area(struct vmap_area *va) | ||
| 421 | { | ||
| 422 | spin_lock(&vmap_area_lock); | ||
| 423 | __free_vmap_area(va); | ||
| 424 | spin_unlock(&vmap_area_lock); | ||
| 425 | } | ||
| 426 | |||
| 427 | /* | ||
| 428 | * Clear the pagetable entries of a given vmap_area | ||
| 429 | */ | ||
| 430 | static void unmap_vmap_area(struct vmap_area *va) | ||
| 431 | { | ||
| 432 | vunmap_page_range(va->va_start, va->va_end); | ||
| 433 | } | ||
| 434 | |||
| 435 | /* | ||
| 436 | * lazy_max_pages is the maximum amount of virtual address space we gather up | ||
| 437 | * before attempting to purge with a TLB flush. | ||
| 438 | * | ||
| 439 | * There is a tradeoff here: a larger number will cover more kernel page tables | ||
| 440 | * and take slightly longer to purge, but it will linearly reduce the number of | ||
| 441 | * global TLB flushes that must be performed. It would seem natural to scale | ||
| 442 | * this number up linearly with the number of CPUs (because vmapping activity | ||
| 443 | * could also scale linearly with the number of CPUs), however it is likely | ||
| 444 | * that in practice, workloads might be constrained in other ways that mean | ||
| 445 | * vmap activity will not scale linearly with CPUs. Also, I want to be | ||
| 446 | * conservative and not introduce a big latency on huge systems, so go with | ||
| 447 | * a less aggressive log scale. It will still be an improvement over the old | ||
| 448 | * code, and it will be simple to change the scale factor if we find that it | ||
| 449 | * becomes a problem on bigger systems. | ||
| 450 | */ | ||
| 451 | static unsigned long lazy_max_pages(void) | ||
| 452 | { | ||
| 453 | unsigned int log; | ||
| 454 | |||
| 455 | log = fls(num_online_cpus()); | ||
| 456 | |||
| 457 | return log * (32UL * 1024 * 1024 / PAGE_SIZE); | ||
| 458 | } | ||
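A quick worked example of the log scale described in the comment above (illustrative only, assuming 4 KiB pages): with one online CPU, fls(1) == 1, so the purge threshold is 32 MiB of lazily freed space (8192 pages); with eight CPUs, fls(8) == 4, giving 128 MiB (32768 pages). The threshold therefore grows with log2 of the CPU count rather than linearly with it.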
| 459 | |||
| 460 | static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); | ||
| 461 | |||
| 462 | /* | ||
| 463 | * Purges all lazily-freed vmap areas. | ||
| 464 | * | ||
| 465 | * If sync is 0 then don't purge if there is already a purge in progress. | ||
| 466 | * If force_flush is 1, then flush kernel TLBs between *start and *end even | ||
| 467 | * if we found no lazy vmap areas to unmap (callers can use this to optimise | ||
| 468 | * their own TLB flushing). | ||
| 469 | * Returns with *start = min(*start, lowest purged address) | ||
| 470 | * *end = max(*end, highest purged address) | ||
| 471 | */ | ||
| 472 | static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | ||
| 473 | int sync, int force_flush) | ||
| 474 | { | ||
| 475 | static DEFINE_SPINLOCK(purge_lock); | ||
| 476 | LIST_HEAD(valist); | ||
| 477 | struct vmap_area *va; | ||
| 478 | int nr = 0; | ||
| 479 | |||
| 480 | /* | ||
| 481 | * If sync is 0 but force_flush is 1, we'll go sync anyway but callers | ||
| 482 | * should not expect such behaviour. This just simplifies locking for | ||
| 483 | * the case that isn't actually used at the moment anyway. | ||
| 484 | */ | ||
| 485 | if (!sync && !force_flush) { | ||
| 486 | if (!spin_trylock(&purge_lock)) | ||
| 487 | return; | ||
| 488 | } else | ||
| 489 | spin_lock(&purge_lock); | ||
| 490 | |||
| 491 | rcu_read_lock(); | ||
| 492 | list_for_each_entry_rcu(va, &vmap_area_list, list) { | ||
| 493 | if (va->flags & VM_LAZY_FREE) { | ||
| 494 | if (va->va_start < *start) | ||
| 495 | *start = va->va_start; | ||
| 496 | if (va->va_end > *end) | ||
| 497 | *end = va->va_end; | ||
| 498 | nr += (va->va_end - va->va_start) >> PAGE_SHIFT; | ||
| 499 | unmap_vmap_area(va); | ||
| 500 | list_add_tail(&va->purge_list, &valist); | ||
| 501 | va->flags |= VM_LAZY_FREEING; | ||
| 502 | va->flags &= ~VM_LAZY_FREE; | ||
| 503 | } | ||
| 504 | } | ||
| 505 | rcu_read_unlock(); | ||
| 506 | |||
| 507 | if (nr) { | ||
| 508 | BUG_ON(nr > atomic_read(&vmap_lazy_nr)); | ||
| 509 | atomic_sub(nr, &vmap_lazy_nr); | ||
| 510 | } | ||
| 511 | |||
| 512 | if (nr || force_flush) | ||
| 513 | flush_tlb_kernel_range(*start, *end); | ||
| 514 | |||
| 515 | if (nr) { | ||
| 516 | spin_lock(&vmap_area_lock); | ||
| 517 | list_for_each_entry(va, &valist, purge_list) | ||
| 518 | __free_vmap_area(va); | ||
| 519 | spin_unlock(&vmap_area_lock); | ||
| 520 | } | ||
| 521 | spin_unlock(&purge_lock); | ||
| 522 | } | ||
| 523 | |||
| 524 | /* | ||
| 525 | * Kick off a purge of the outstanding lazy areas. | ||
| 526 | */ | ||
| 527 | static void purge_vmap_area_lazy(void) | ||
| 528 | { | ||
| 529 | unsigned long start = ULONG_MAX, end = 0; | ||
| 530 | |||
| 531 | __purge_vmap_area_lazy(&start, &end, 0, 0); | ||
| 532 | } | ||
| 533 | |||
| 534 | /* | ||
| 535 | * Free and unmap a vmap area | ||
| 536 | */ | ||
| 537 | static void free_unmap_vmap_area(struct vmap_area *va) | ||
| 538 | { | ||
| 539 | va->flags |= VM_LAZY_FREE; | ||
| 540 | atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); | ||
| 541 | if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) | ||
| 542 | purge_vmap_area_lazy(); | ||
| 543 | } | ||
| 544 | |||
| 545 | static struct vmap_area *find_vmap_area(unsigned long addr) | ||
| 546 | { | ||
| 547 | struct vmap_area *va; | ||
| 548 | |||
| 549 | spin_lock(&vmap_area_lock); | ||
| 550 | va = __find_vmap_area(addr); | ||
| 551 | spin_unlock(&vmap_area_lock); | ||
| 552 | |||
| 553 | return va; | ||
| 554 | } | ||
| 555 | |||
| 556 | static void free_unmap_vmap_area_addr(unsigned long addr) | ||
| 557 | { | ||
| 558 | struct vmap_area *va; | ||
| 559 | |||
| 560 | va = find_vmap_area(addr); | ||
| 561 | BUG_ON(!va); | ||
| 562 | free_unmap_vmap_area(va); | ||
| 563 | } | ||
| 564 | |||
| 565 | |||
| 566 | /*** Per cpu kva allocator ***/ | ||
| 567 | |||
| 568 | /* | ||
| 569 | * vmap space is limited especially on 32 bit architectures. Ensure there is | ||
| 570 | * room for at least 16 percpu vmap blocks per CPU. | ||
| 571 | */ | ||
| 572 | /* | ||
| 573 | * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able | ||
| 574 | * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess | ||
| 575 | * instead (we just need a rough idea) | ||
| 576 | */ | ||
| 577 | #if BITS_PER_LONG == 32 | ||
| 578 | #define VMALLOC_SPACE (128UL*1024*1024) | ||
| 579 | #else | ||
| 580 | #define VMALLOC_SPACE (128UL*1024*1024*1024) | ||
| 581 | #endif | ||
| 582 | |||
| 583 | #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) | ||
| 584 | #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ | ||
| 585 | #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ | ||
| 586 | #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) | ||
| 587 | #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ | ||
| 588 | #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ | ||
| 589 | #define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ | ||
| 590 | VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ | ||
| 591 | VMALLOC_PAGES / NR_CPUS / 16)) | ||
| 592 | |||
| 593 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) | ||
| 594 | |||
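Worked sizing example for the macros above (illustrative only, assuming 4 KiB pages, BITS_PER_LONG == 32 and NR_CPUS == 4): VMALLOC_PAGES = 128 MiB / 4 KiB = 32768, so VMALLOC_PAGES / NR_CPUS / 16 = 512. Clamping that between VMAP_BBMAP_BITS_MIN (64) and VMAP_BBMAP_BITS_MAX (1024) yields VMAP_BBMAP_BITS = 512 and VMAP_BLOCK_SIZE = 2 MiB, i.e. 64 blocks across the guessed 128 MiB of vmap space, or the intended 16 per-CPU blocks.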
| 595 | static bool vmap_initialized __read_mostly = false; | ||
| 596 | |||
| 597 | struct vmap_block_queue { | ||
| 598 | spinlock_t lock; | ||
| 599 | struct list_head free; | ||
| 600 | struct list_head dirty; | ||
| 601 | unsigned int nr_dirty; | ||
| 602 | }; | ||
| 603 | |||
| 604 | struct vmap_block { | ||
| 605 | spinlock_t lock; | ||
| 606 | struct vmap_area *va; | ||
| 607 | struct vmap_block_queue *vbq; | ||
| 608 | unsigned long free, dirty; | ||
| 609 | DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); | ||
| 610 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); | ||
| 611 | union { | ||
| 612 | struct { | ||
| 613 | struct list_head free_list; | ||
| 614 | struct list_head dirty_list; | ||
| 615 | }; | ||
| 616 | struct rcu_head rcu_head; | ||
| 617 | }; | ||
| 618 | }; | ||
| 619 | |||
| 620 | /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ | ||
| 621 | static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); | ||
| 622 | |||
| 623 | /* | ||
| 624 | * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block | ||
| 625 | * in the free path. Could get rid of this if we change the API to return a | ||
| 626 | * "cookie" from alloc, to be passed to free. But no big deal yet. | ||
| 627 | */ | ||
| 628 | static DEFINE_SPINLOCK(vmap_block_tree_lock); | ||
| 629 | static RADIX_TREE(vmap_block_tree, GFP_ATOMIC); | ||
| 630 | |||
| 631 | /* | ||
| 632 | * We should probably have a fallback mechanism to allocate virtual memory | ||
| 633 | * out of partially filled vmap blocks. However vmap block sizing should be | ||
| 634 | * fairly reasonable according to the vmalloc size, so it shouldn't be a | ||
| 635 | * big problem. | ||
| 636 | */ | ||
| 637 | |||
| 638 | static unsigned long addr_to_vb_idx(unsigned long addr) | ||
| 639 | { | ||
| 640 | addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); | ||
| 641 | addr /= VMAP_BLOCK_SIZE; | ||
| 642 | return addr; | ||
| 643 | } | ||
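Continuing the 2 MiB VMAP_BLOCK_SIZE example above (illustrative only): an address 5 MiB past the block-aligned VMALLOC_START maps to block index 2, because the integer division 5 MiB / 2 MiB truncates to 2.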
| 644 | |||
| 645 | static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | ||
| 646 | { | ||
| 647 | struct vmap_block_queue *vbq; | ||
| 648 | struct vmap_block *vb; | ||
| 649 | struct vmap_area *va; | ||
| 650 | unsigned long vb_idx; | ||
| 651 | int node, err; | ||
| 652 | |||
| 653 | node = numa_node_id(); | ||
| 654 | |||
| 655 | vb = kmalloc_node(sizeof(struct vmap_block), | ||
| 656 | gfp_mask & GFP_RECLAIM_MASK, node); | ||
| 657 | if (unlikely(!vb)) | ||
| 658 | return ERR_PTR(-ENOMEM); | ||
| 659 | |||
| 660 | va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, | ||
| 661 | VMALLOC_START, VMALLOC_END, | ||
| 662 | node, gfp_mask); | ||
| 663 | if (unlikely(IS_ERR(va))) { | ||
| 664 | kfree(vb); | ||
| 665 | return ERR_PTR(PTR_ERR(va)); | ||
| 666 | } | ||
| 667 | |||
| 668 | err = radix_tree_preload(gfp_mask); | ||
| 669 | if (unlikely(err)) { | ||
| 670 | kfree(vb); | ||
| 671 | free_vmap_area(va); | ||
| 672 | return ERR_PTR(err); | ||
| 673 | } | ||
| 674 | |||
| 675 | spin_lock_init(&vb->lock); | ||
| 676 | vb->va = va; | ||
| 677 | vb->free = VMAP_BBMAP_BITS; | ||
| 678 | vb->dirty = 0; | ||
| 679 | bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); | ||
| 680 | bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); | ||
| 681 | INIT_LIST_HEAD(&vb->free_list); | ||
| 682 | INIT_LIST_HEAD(&vb->dirty_list); | ||
| 683 | |||
| 684 | vb_idx = addr_to_vb_idx(va->va_start); | ||
| 685 | spin_lock(&vmap_block_tree_lock); | ||
| 686 | err = radix_tree_insert(&vmap_block_tree, vb_idx, vb); | ||
| 687 | spin_unlock(&vmap_block_tree_lock); | ||
| 688 | BUG_ON(err); | ||
| 689 | radix_tree_preload_end(); | ||
| 690 | |||
| 691 | vbq = &get_cpu_var(vmap_block_queue); | ||
| 692 | vb->vbq = vbq; | ||
| 693 | spin_lock(&vbq->lock); | ||
| 694 | list_add(&vb->free_list, &vbq->free); | ||
| 695 | spin_unlock(&vbq->lock); | ||
| 696 | put_cpu_var(vmap_block_queue); | ||
| 697 | |||
| 698 | return vb; | ||
| 699 | } | ||
| 700 | |||
| 701 | static void rcu_free_vb(struct rcu_head *head) | ||
| 702 | { | ||
| 703 | struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); | ||
| 704 | |||
| 705 | kfree(vb); | ||
| 706 | } | ||
| 707 | |||
| 708 | static void free_vmap_block(struct vmap_block *vb) | ||
| 709 | { | ||
| 710 | struct vmap_block *tmp; | ||
| 711 | unsigned long vb_idx; | ||
| 712 | |||
| 713 | spin_lock(&vb->vbq->lock); | ||
| 714 | if (!list_empty(&vb->free_list)) | ||
| 715 | list_del(&vb->free_list); | ||
| 716 | if (!list_empty(&vb->dirty_list)) | ||
| 717 | list_del(&vb->dirty_list); | ||
| 718 | spin_unlock(&vb->vbq->lock); | ||
| 719 | |||
| 720 | vb_idx = addr_to_vb_idx(vb->va->va_start); | ||
| 721 | spin_lock(&vmap_block_tree_lock); | ||
| 722 | tmp = radix_tree_delete(&vmap_block_tree, vb_idx); | ||
| 723 | spin_unlock(&vmap_block_tree_lock); | ||
| 724 | BUG_ON(tmp != vb); | ||
| 725 | |||
| 726 | free_unmap_vmap_area(vb->va); | ||
| 727 | call_rcu(&vb->rcu_head, rcu_free_vb); | ||
| 728 | } | ||
| 729 | |||
| 730 | static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | ||
| 731 | { | ||
| 732 | struct vmap_block_queue *vbq; | ||
| 733 | struct vmap_block *vb; | ||
| 734 | unsigned long addr = 0; | ||
| 735 | unsigned int order; | ||
| 736 | |||
| 737 | BUG_ON(size & ~PAGE_MASK); | ||
| 738 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | ||
| 739 | order = get_order(size); | ||
| 740 | |||
| 741 | again: | ||
| 742 | rcu_read_lock(); | ||
| 743 | vbq = &get_cpu_var(vmap_block_queue); | ||
| 744 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | ||
| 745 | int i; | ||
| 746 | |||
| 747 | spin_lock(&vb->lock); | ||
| 748 | i = bitmap_find_free_region(vb->alloc_map, | ||
| 749 | VMAP_BBMAP_BITS, order); | ||
| 750 | |||
| 751 | if (i >= 0) { | ||
| 752 | addr = vb->va->va_start + (i << PAGE_SHIFT); | ||
| 753 | BUG_ON(addr_to_vb_idx(addr) != | ||
| 754 | addr_to_vb_idx(vb->va->va_start)); | ||
| 755 | vb->free -= 1UL << order; | ||
| 756 | if (vb->free == 0) { | ||
| 757 | spin_lock(&vbq->lock); | ||
| 758 | list_del_init(&vb->free_list); | ||
| 759 | spin_unlock(&vbq->lock); | ||
| 760 | } | ||
| 761 | spin_unlock(&vb->lock); | ||
| 762 | break; | ||
| 763 | } | ||
| 764 | spin_unlock(&vb->lock); | ||
| 765 | } | ||
| 766 | put_cpu_var(vmap_block_queue); | ||
| 767 | rcu_read_unlock(); | ||
| 768 | |||
| 769 | if (!addr) { | ||
| 770 | vb = new_vmap_block(gfp_mask); | ||
| 771 | if (IS_ERR(vb)) | ||
| 772 | return vb; | ||
| 773 | goto again; | ||
| 774 | } | ||
| 775 | |||
| 776 | return (void *)addr; | ||
| 777 | } | ||
| 778 | |||
| 779 | static void vb_free(const void *addr, unsigned long size) | ||
| 780 | { | ||
| 781 | unsigned long offset; | ||
| 782 | unsigned long vb_idx; | ||
| 783 | unsigned int order; | ||
| 784 | struct vmap_block *vb; | ||
| 785 | |||
| 786 | BUG_ON(size & ~PAGE_MASK); | ||
| 787 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | ||
| 788 | order = get_order(size); | ||
| 789 | |||
| 790 | offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); | ||
| 791 | |||
| 792 | vb_idx = addr_to_vb_idx((unsigned long)addr); | ||
| 793 | rcu_read_lock(); | ||
| 794 | vb = radix_tree_lookup(&vmap_block_tree, vb_idx); | ||
| 795 | rcu_read_unlock(); | ||
| 796 | BUG_ON(!vb); | ||
| 797 | |||
| 798 | spin_lock(&vb->lock); | ||
| 799 | bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); | ||
| 800 | if (!vb->dirty) { | ||
| 801 | spin_lock(&vb->vbq->lock); | ||
| 802 | list_add(&vb->dirty_list, &vb->vbq->dirty); | ||
| 803 | spin_unlock(&vb->vbq->lock); | ||
| 804 | } | ||
| 805 | vb->dirty += 1UL << order; | ||
| 806 | if (vb->dirty == VMAP_BBMAP_BITS) { | ||
| 807 | BUG_ON(vb->free || !list_empty(&vb->free_list)); | ||
| 808 | spin_unlock(&vb->lock); | ||
| 809 | free_vmap_block(vb); | ||
| 810 | } else | ||
| 811 | spin_unlock(&vb->lock); | ||
| 812 | } | ||
| 813 | |||
| 814 | /** | ||
| 815 | * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer | ||
| 816 | * | ||
| 817 | * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily | ||
| 818 | * to amortize TLB flushing overheads. What this means is that any page you | ||
| 819 | * have now may, in a former life, have been mapped into a kernel virtual | ||
| 820 | * address by the vmap layer, and so there might be some CPUs with TLB entries | ||
| 821 | * still referencing that page (additional to the regular 1:1 kernel mapping). | ||
| 822 | * | ||
| 823 | * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can | ||
| 824 | * be sure that none of the pages we have control over will have any aliases | ||
| 825 | * from the vmap layer. | ||
| 826 | */ | ||
| 827 | void vm_unmap_aliases(void) | ||
| 828 | { | ||
| 829 | unsigned long start = ULONG_MAX, end = 0; | ||
| 830 | int cpu; | ||
| 831 | int flush = 0; | ||
| 832 | |||
| 833 | if (unlikely(!vmap_initialized)) | ||
| 834 | return; | ||
| 835 | |||
| 836 | for_each_possible_cpu(cpu) { | ||
| 837 | struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); | ||
| 838 | struct vmap_block *vb; | ||
| 839 | |||
| 840 | rcu_read_lock(); | ||
| 841 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | ||
| 842 | int i; | ||
| 843 | |||
| 844 | spin_lock(&vb->lock); | ||
| 845 | i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); | ||
| 846 | while (i < VMAP_BBMAP_BITS) { | ||
| 847 | unsigned long s, e; | ||
| 848 | int j; | ||
| 849 | j = find_next_zero_bit(vb->dirty_map, | ||
| 850 | VMAP_BBMAP_BITS, i); | ||
| 851 | |||
| 852 | s = vb->va->va_start + (i << PAGE_SHIFT); | ||
| 853 | e = vb->va->va_start + (j << PAGE_SHIFT); | ||
| 854 | vunmap_page_range(s, e); | ||
| 855 | flush = 1; | ||
| 856 | |||
| 857 | if (s < start) | ||
| 858 | start = s; | ||
| 859 | if (e > end) | ||
| 860 | end = e; | ||
| 861 | |||
| 862 | i = j; | ||
| 863 | i = find_next_bit(vb->dirty_map, | ||
| 864 | VMAP_BBMAP_BITS, i); | ||
| 865 | } | ||
| 866 | spin_unlock(&vb->lock); | ||
| 867 | } | ||
| 868 | rcu_read_unlock(); | ||
| 869 | } | ||
| 870 | |||
| 871 | __purge_vmap_area_lazy(&start, &end, 1, flush); | ||
| 872 | } | ||
| 873 | EXPORT_SYMBOL_GPL(vm_unmap_aliases); | ||
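A minimal usage sketch (hypothetical caller, not taken from this patch): code that is about to change page attributes on, or hand away, pages it owns can call vm_unmap_aliases() first so that no lazily retained kernel-virtual TLB entries still reference them.

	/*
	 * Hypothetical example, illustrative only: "pages" were once visible
	 * through vmap()/vm_map_ram() mappings that have since been freed
	 * lazily.  Flush those aliases before giving the pages away or
	 * changing their cache attributes.
	 */
	vm_unmap_aliases();
	/* From here on, no stale vmap TLB aliases reference our pages. */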
| 874 | |||
| 875 | /** | ||
| 876 | * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram | ||
| 877 | * @mem: the pointer returned by vm_map_ram | ||
| 878 | * @count: the count passed to that vm_map_ram call (cannot unmap partial) | ||
| 879 | */ | ||
| 880 | void vm_unmap_ram(const void *mem, unsigned int count) | ||
| 881 | { | ||
| 882 | unsigned long size = count << PAGE_SHIFT; | ||
| 883 | unsigned long addr = (unsigned long)mem; | ||
| 884 | |||
| 885 | BUG_ON(!addr); | ||
| 886 | BUG_ON(addr < VMALLOC_START); | ||
| 887 | BUG_ON(addr > VMALLOC_END); | ||
| 888 | BUG_ON(addr & (PAGE_SIZE-1)); | ||
| 889 | |||
| 890 | debug_check_no_locks_freed(mem, size); | ||
| 891 | |||
| 892 | if (likely(count <= VMAP_MAX_ALLOC)) | ||
| 893 | vb_free(mem, size); | ||
| 894 | else | ||
| 895 | free_unmap_vmap_area_addr(addr); | ||
| 896 | } | ||
| 897 | EXPORT_SYMBOL(vm_unmap_ram); | ||
| 898 | |||
| 899 | /** | ||
| 900 | * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) | ||
| 901 | * @pages: an array of pointers to the pages to be mapped | ||
| 902 | * @count: number of pages | ||
| 903 | * @node: prefer to allocate data structures on this node | ||
| 904 | * @prot: memory protection to use. PAGE_KERNEL for regular RAM | ||
| 905 | * | ||
| 906 | * Returns: a pointer to the address that has been mapped, or %NULL on failure | ||
| 907 | */ | ||
| 908 | void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) | ||
| 909 | { | ||
| 910 | unsigned long size = count << PAGE_SHIFT; | ||
| 215 | unsigned long addr; | 911 | unsigned long addr; |
| 912 | void *mem; | ||
| 913 | |||
| 914 | if (likely(count <= VMAP_MAX_ALLOC)) { | ||
| 915 | mem = vb_alloc(size, GFP_KERNEL); | ||
| 916 | if (IS_ERR(mem)) | ||
| 917 | return NULL; | ||
| 918 | addr = (unsigned long)mem; | ||
| 919 | } else { | ||
| 920 | struct vmap_area *va; | ||
| 921 | va = alloc_vmap_area(size, PAGE_SIZE, | ||
| 922 | VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); | ||
| 923 | if (IS_ERR(va)) | ||
| 924 | return NULL; | ||
| 925 | |||
| 926 | addr = va->va_start; | ||
| 927 | mem = (void *)addr; | ||
| 928 | } | ||
| 929 | if (vmap_page_range(addr, addr + size, prot, pages) < 0) { | ||
| 930 | vm_unmap_ram(mem, count); | ||
| 931 | return NULL; | ||
| 932 | } | ||
| 933 | return mem; | ||
| 934 | } | ||
| 935 | EXPORT_SYMBOL(vm_map_ram); | ||
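A minimal sketch of how the two new entry points pair up (hypothetical helper, illustrative only; the pages array and count are assumed to come from the caller, -1 means no node preference, and error handling is trimmed):

	#include <linux/mm.h>
	#include <linux/string.h>
	#include <linux/vmalloc.h>

	/*
	 * Hypothetical helper, illustrative only: map caller-owned pages,
	 * zero them through the transient linear mapping, then unmap with
	 * the same count that was passed to vm_map_ram().
	 */
	static int zero_pages_mapped(struct page **pages, unsigned int count)
	{
		void *buf = vm_map_ram(pages, count, -1, PAGE_KERNEL);

		if (!buf)
			return -ENOMEM;	/* vm_map_ram() returns NULL on failure */

		memset(buf, 0, (size_t)count << PAGE_SHIFT);

		vm_unmap_ram(buf, count);	/* partial unmaps are not allowed */
		return 0;
	}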
| 936 | |||
| 937 | void __init vmalloc_init(void) | ||
| 938 | { | ||
| 939 | int i; | ||
| 940 | |||
| 941 | for_each_possible_cpu(i) { | ||
| 942 | struct vmap_block_queue *vbq; | ||
| 943 | |||
| 944 | vbq = &per_cpu(vmap_block_queue, i); | ||
| 945 | spin_lock_init(&vbq->lock); | ||
| 946 | INIT_LIST_HEAD(&vbq->free); | ||
| 947 | INIT_LIST_HEAD(&vbq->dirty); | ||
| 948 | vbq->nr_dirty = 0; | ||
| 949 | } | ||
| 950 | |||
| 951 | vmap_initialized = true; | ||
| 952 | } | ||
| 953 | |||
| 954 | void unmap_kernel_range(unsigned long addr, unsigned long size) | ||
| 955 | { | ||
| 956 | unsigned long end = addr + size; | ||
| 957 | vunmap_page_range(addr, end); | ||
| 958 | flush_tlb_kernel_range(addr, end); | ||
| 959 | } | ||
| 960 | |||
| 961 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | ||
| 962 | { | ||
| 963 | unsigned long addr = (unsigned long)area->addr; | ||
| 964 | unsigned long end = addr + area->size - PAGE_SIZE; | ||
| 965 | int err; | ||
| 966 | |||
| 967 | err = vmap_page_range(addr, end, prot, *pages); | ||
| 968 | if (err > 0) { | ||
| 969 | *pages += err; | ||
| 970 | err = 0; | ||
| 971 | } | ||
| 972 | |||
| 973 | return err; | ||
| 974 | } | ||
| 975 | EXPORT_SYMBOL_GPL(map_vm_area); | ||
| 976 | |||
| 977 | /*** Old vmalloc interfaces ***/ | ||
| 978 | DEFINE_RWLOCK(vmlist_lock); | ||
| 979 | struct vm_struct *vmlist; | ||
| 980 | |||
| 981 | static struct vm_struct *__get_vm_area_node(unsigned long size, | ||
| 982 | unsigned long flags, unsigned long start, unsigned long end, | ||
| 983 | int node, gfp_t gfp_mask, void *caller) | ||
| 984 | { | ||
| 985 | struct vmap_area *va; | ||
| 986 | struct vm_struct *area; | ||
| 987 | struct vm_struct *tmp, **p; | ||
| 988 | unsigned long align = 1; | ||
| 216 | 989 | ||
| 217 | BUG_ON(in_interrupt()); | 990 | BUG_ON(in_interrupt()); |
| 218 | if (flags & VM_IOREMAP) { | 991 | if (flags & VM_IOREMAP) { |
| @@ -225,13 +998,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, | |||
| 225 | 998 | ||
| 226 | align = 1ul << bit; | 999 | align = 1ul << bit; |
| 227 | } | 1000 | } |
| 228 | addr = ALIGN(start, align); | 1001 | |
| 229 | size = PAGE_ALIGN(size); | 1002 | size = PAGE_ALIGN(size); |
| 230 | if (unlikely(!size)) | 1003 | if (unlikely(!size)) |
| 231 | return NULL; | 1004 | return NULL; |
| 232 | 1005 | ||
| 233 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); | 1006 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); |
| 234 | |||
| 235 | if (unlikely(!area)) | 1007 | if (unlikely(!area)) |
| 236 | return NULL; | 1008 | return NULL; |
| 237 | 1009 | ||
| @@ -240,48 +1012,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, | |||
| 240 | */ | 1012 | */ |
| 241 | size += PAGE_SIZE; | 1013 | size += PAGE_SIZE; |
| 242 | 1014 | ||
| 243 | write_lock(&vmlist_lock); | 1015 | va = alloc_vmap_area(size, align, start, end, node, gfp_mask); |
| 244 | for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { | 1016 | if (IS_ERR(va)) { |
| 245 | if ((unsigned long)tmp->addr < addr) { | 1017 | kfree(area); |
| 246 | if((unsigned long)tmp->addr + tmp->size >= addr) | 1018 | return NULL; |
| 247 | addr = ALIGN(tmp->size + | ||
| 248 | (unsigned long)tmp->addr, align); | ||
| 249 | continue; | ||
| 250 | } | ||
| 251 | if ((size + addr) < addr) | ||
| 252 | goto out; | ||
| 253 | if (size + addr <= (unsigned long)tmp->addr) | ||
| 254 | goto found; | ||
| 255 | addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align); | ||
| 256 | if (addr > end - size) | ||
| 257 | goto out; | ||
| 258 | } | 1019 | } |
| 259 | if ((size + addr) < addr) | ||
| 260 | goto out; | ||
| 261 | if (addr > end - size) | ||
| 262 | goto out; | ||
| 263 | |||
| 264 | found: | ||
| 265 | area->next = *p; | ||
| 266 | *p = area; | ||
| 267 | 1020 | ||
| 268 | area->flags = flags; | 1021 | area->flags = flags; |
| 269 | area->addr = (void *)addr; | 1022 | area->addr = (void *)va->va_start; |
| 270 | area->size = size; | 1023 | area->size = size; |
| 271 | area->pages = NULL; | 1024 | area->pages = NULL; |
| 272 | area->nr_pages = 0; | 1025 | area->nr_pages = 0; |
| 273 | area->phys_addr = 0; | 1026 | area->phys_addr = 0; |
| 274 | area->caller = caller; | 1027 | area->caller = caller; |
| 1028 | va->private = area; | ||
| 1029 | va->flags |= VM_VM_AREA; | ||
| 1030 | |||
| 1031 | write_lock(&vmlist_lock); | ||
| 1032 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { | ||
| 1033 | if (tmp->addr >= area->addr) | ||
| 1034 | break; | ||
| 1035 | } | ||
| 1036 | area->next = *p; | ||
| 1037 | *p = area; | ||
| 275 | write_unlock(&vmlist_lock); | 1038 | write_unlock(&vmlist_lock); |
| 276 | 1039 | ||
| 277 | return area; | 1040 | return area; |
| 278 | |||
| 279 | out: | ||
| 280 | write_unlock(&vmlist_lock); | ||
| 281 | kfree(area); | ||
| 282 | if (printk_ratelimit()) | ||
| 283 | printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n"); | ||
| 284 | return NULL; | ||
| 285 | } | 1041 | } |
| 286 | 1042 | ||
| 287 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 1043 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, |
| @@ -321,39 +1077,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, | |||
| 321 | gfp_mask, __builtin_return_address(0)); | 1077 | gfp_mask, __builtin_return_address(0)); |
| 322 | } | 1078 | } |
| 323 | 1079 | ||
| 324 | /* Caller must hold vmlist_lock */ | 1080 | static struct vm_struct *find_vm_area(const void *addr) |
| 325 | static struct vm_struct *__find_vm_area(const void *addr) | ||
| 326 | { | 1081 | { |
| 327 | struct vm_struct *tmp; | 1082 | struct vmap_area *va; |
| 328 | |||
| 329 | for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { | ||
| 330 | if (tmp->addr == addr) | ||
| 331 | break; | ||
| 332 | } | ||
| 333 | |||
| 334 | return tmp; | ||
| 335 | } | ||
| 336 | 1083 | ||
| 337 | /* Caller must hold vmlist_lock */ | 1084 | va = find_vmap_area((unsigned long)addr); |
| 338 | static struct vm_struct *__remove_vm_area(const void *addr) | 1085 | if (va && va->flags & VM_VM_AREA) |
| 339 | { | 1086 | return va->private; |
| 340 | struct vm_struct **p, *tmp; | ||
| 341 | 1087 | ||
| 342 | for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) { | ||
| 343 | if (tmp->addr == addr) | ||
| 344 | goto found; | ||
| 345 | } | ||
| 346 | return NULL; | 1088 | return NULL; |
| 347 | |||
| 348 | found: | ||
| 349 | unmap_vm_area(tmp); | ||
| 350 | *p = tmp->next; | ||
| 351 | |||
| 352 | /* | ||
| 353 | * Remove the guard page. | ||
| 354 | */ | ||
| 355 | tmp->size -= PAGE_SIZE; | ||
| 356 | return tmp; | ||
| 357 | } | 1089 | } |
| 358 | 1090 | ||
| 359 | /** | 1091 | /** |
| @@ -366,11 +1098,24 @@ found: | |||
| 366 | */ | 1098 | */ |
| 367 | struct vm_struct *remove_vm_area(const void *addr) | 1099 | struct vm_struct *remove_vm_area(const void *addr) |
| 368 | { | 1100 | { |
| 369 | struct vm_struct *v; | 1101 | struct vmap_area *va; |
| 370 | write_lock(&vmlist_lock); | 1102 | |
| 371 | v = __remove_vm_area(addr); | 1103 | va = find_vmap_area((unsigned long)addr); |
| 372 | write_unlock(&vmlist_lock); | 1104 | if (va && va->flags & VM_VM_AREA) { |
| 373 | return v; | 1105 | struct vm_struct *vm = va->private; |
| 1106 | struct vm_struct *tmp, **p; | ||
| 1107 | free_unmap_vmap_area(va); | ||
| 1108 | vm->size -= PAGE_SIZE; | ||
| 1109 | |||
| 1110 | write_lock(&vmlist_lock); | ||
| 1111 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) | ||
| 1112 | ; | ||
| 1113 | *p = tmp->next; | ||
| 1114 | write_unlock(&vmlist_lock); | ||
| 1115 | |||
| 1116 | return vm; | ||
| 1117 | } | ||
| 1118 | return NULL; | ||
| 374 | } | 1119 | } |
| 375 | 1120 | ||
| 376 | static void __vunmap(const void *addr, int deallocate_pages) | 1121 | static void __vunmap(const void *addr, int deallocate_pages) |
| @@ -480,6 +1225,8 @@ void *vmap(struct page **pages, unsigned int count, | |||
| 480 | } | 1225 | } |
| 481 | EXPORT_SYMBOL(vmap); | 1226 | EXPORT_SYMBOL(vmap); |
| 482 | 1227 | ||
| 1228 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | ||
| 1229 | int node, void *caller); | ||
| 483 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1230 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
| 484 | pgprot_t prot, int node, void *caller) | 1231 | pgprot_t prot, int node, void *caller) |
| 485 | { | 1232 | { |
| @@ -606,10 +1353,8 @@ void *vmalloc_user(unsigned long size) | |||
| 606 | 1353 | ||
| 607 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); | 1354 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); |
| 608 | if (ret) { | 1355 | if (ret) { |
| 609 | write_lock(&vmlist_lock); | 1356 | area = find_vm_area(ret); |
| 610 | area = __find_vm_area(ret); | ||
| 611 | area->flags |= VM_USERMAP; | 1357 | area->flags |= VM_USERMAP; |
| 612 | write_unlock(&vmlist_lock); | ||
| 613 | } | 1358 | } |
| 614 | return ret; | 1359 | return ret; |
| 615 | } | 1360 | } |
| @@ -689,10 +1434,8 @@ void *vmalloc_32_user(unsigned long size) | |||
| 689 | 1434 | ||
| 690 | ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); | 1435 | ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); |
| 691 | if (ret) { | 1436 | if (ret) { |
| 692 | write_lock(&vmlist_lock); | 1437 | area = find_vm_area(ret); |
| 693 | area = __find_vm_area(ret); | ||
| 694 | area->flags |= VM_USERMAP; | 1438 | area->flags |= VM_USERMAP; |
| 695 | write_unlock(&vmlist_lock); | ||
| 696 | } | 1439 | } |
| 697 | return ret; | 1440 | return ret; |
| 698 | } | 1441 | } |
| @@ -793,26 +1536,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
| 793 | struct vm_struct *area; | 1536 | struct vm_struct *area; |
| 794 | unsigned long uaddr = vma->vm_start; | 1537 | unsigned long uaddr = vma->vm_start; |
| 795 | unsigned long usize = vma->vm_end - vma->vm_start; | 1538 | unsigned long usize = vma->vm_end - vma->vm_start; |
| 796 | int ret; | ||
| 797 | 1539 | ||
| 798 | if ((PAGE_SIZE-1) & (unsigned long)addr) | 1540 | if ((PAGE_SIZE-1) & (unsigned long)addr) |
| 799 | return -EINVAL; | 1541 | return -EINVAL; |
| 800 | 1542 | ||
| 801 | read_lock(&vmlist_lock); | 1543 | area = find_vm_area(addr); |
| 802 | area = __find_vm_area(addr); | ||
| 803 | if (!area) | 1544 | if (!area) |
| 804 | goto out_einval_locked; | 1545 | return -EINVAL; |
| 805 | 1546 | ||
| 806 | if (!(area->flags & VM_USERMAP)) | 1547 | if (!(area->flags & VM_USERMAP)) |
| 807 | goto out_einval_locked; | 1548 | return -EINVAL; |
| 808 | 1549 | ||
| 809 | if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) | 1550 | if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) |
| 810 | goto out_einval_locked; | 1551 | return -EINVAL; |
| 811 | read_unlock(&vmlist_lock); | ||
| 812 | 1552 | ||
| 813 | addr += pgoff << PAGE_SHIFT; | 1553 | addr += pgoff << PAGE_SHIFT; |
| 814 | do { | 1554 | do { |
| 815 | struct page *page = vmalloc_to_page(addr); | 1555 | struct page *page = vmalloc_to_page(addr); |
| 1556 | int ret; | ||
| 1557 | |||
| 816 | ret = vm_insert_page(vma, uaddr, page); | 1558 | ret = vm_insert_page(vma, uaddr, page); |
| 817 | if (ret) | 1559 | if (ret) |
| 818 | return ret; | 1560 | return ret; |
| @@ -825,11 +1567,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
| 825 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ | 1567 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ |
| 826 | vma->vm_flags |= VM_RESERVED; | 1568 | vma->vm_flags |= VM_RESERVED; |
| 827 | 1569 | ||
| 828 | return ret; | 1570 | return 0; |
| 829 | |||
| 830 | out_einval_locked: | ||
| 831 | read_unlock(&vmlist_lock); | ||
| 832 | return -EINVAL; | ||
| 833 | } | 1571 | } |
| 834 | EXPORT_SYMBOL(remap_vmalloc_range); | 1572 | EXPORT_SYMBOL(remap_vmalloc_range); |
| 835 | 1573 | ||
| @@ -989,11 +1727,41 @@ static int s_show(struct seq_file *m, void *p) | |||
| 989 | return 0; | 1727 | return 0; |
| 990 | } | 1728 | } |
| 991 | 1729 | ||
| 992 | const struct seq_operations vmalloc_op = { | 1730 | static const struct seq_operations vmalloc_op = { |
| 993 | .start = s_start, | 1731 | .start = s_start, |
| 994 | .next = s_next, | 1732 | .next = s_next, |
| 995 | .stop = s_stop, | 1733 | .stop = s_stop, |
| 996 | .show = s_show, | 1734 | .show = s_show, |
| 997 | }; | 1735 | }; |
| 1736 | |||
| 1737 | static int vmalloc_open(struct inode *inode, struct file *file) | ||
| 1738 | { | ||
| 1739 | unsigned int *ptr = NULL; | ||
| 1740 | int ret; | ||
| 1741 | |||
| 1742 | if (NUMA_BUILD) | ||
| 1743 | ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); | ||
| 1744 | ret = seq_open(file, &vmalloc_op); | ||
| 1745 | if (!ret) { | ||
| 1746 | struct seq_file *m = file->private_data; | ||
| 1747 | m->private = ptr; | ||
| 1748 | } else | ||
| 1749 | kfree(ptr); | ||
| 1750 | return ret; | ||
| 1751 | } | ||
| 1752 | |||
| 1753 | static const struct file_operations proc_vmalloc_operations = { | ||
| 1754 | .open = vmalloc_open, | ||
| 1755 | .read = seq_read, | ||
| 1756 | .llseek = seq_lseek, | ||
| 1757 | .release = seq_release_private, | ||
| 1758 | }; | ||
| 1759 | |||
| 1760 | static int __init proc_vmalloc_init(void) | ||
| 1761 | { | ||
| 1762 | proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations); | ||
| 1763 | return 0; | ||
| 1764 | } | ||
| 1765 | module_init(proc_vmalloc_init); | ||
| 998 | #endif | 1766 | #endif |
| 999 | 1767 | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index 1ff1a58e7c10..c141b3e78071 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -39,6 +39,7 @@ | |||
| 39 | #include <linux/freezer.h> | 39 | #include <linux/freezer.h> |
| 40 | #include <linux/memcontrol.h> | 40 | #include <linux/memcontrol.h> |
| 41 | #include <linux/delayacct.h> | 41 | #include <linux/delayacct.h> |
| 42 | #include <linux/sysctl.h> | ||
| 42 | 43 | ||
| 43 | #include <asm/tlbflush.h> | 44 | #include <asm/tlbflush.h> |
| 44 | #include <asm/div64.h> | 45 | #include <asm/div64.h> |
| @@ -78,7 +79,7 @@ struct scan_control { | |||
| 78 | unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, | 79 | unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, |
| 79 | unsigned long *scanned, int order, int mode, | 80 | unsigned long *scanned, int order, int mode, |
| 80 | struct zone *z, struct mem_cgroup *mem_cont, | 81 | struct zone *z, struct mem_cgroup *mem_cont, |
| 81 | int active); | 82 | int active, int file); |
| 82 | }; | 83 | }; |
| 83 | 84 | ||
| 84 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 85 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
| @@ -470,6 +471,85 @@ int remove_mapping(struct address_space *mapping, struct page *page) | |||
| 470 | return 0; | 471 | return 0; |
| 471 | } | 472 | } |
| 472 | 473 | ||
| 474 | /** | ||
| 475 | * putback_lru_page - put previously isolated page onto appropriate LRU list | ||
| 476 | * @page: page to be put back to appropriate lru list | ||
| 477 | * | ||
| 478 | * Add previously isolated @page to appropriate LRU list. | ||
| 479 | * Page may still be unevictable for other reasons. | ||
| 480 | * | ||
| 481 | * lru_lock must not be held, interrupts must be enabled. | ||
| 482 | */ | ||
| 483 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 484 | void putback_lru_page(struct page *page) | ||
| 485 | { | ||
| 486 | int lru; | ||
| 487 | int active = !!TestClearPageActive(page); | ||
| 488 | int was_unevictable = PageUnevictable(page); | ||
| 489 | |||
| 490 | VM_BUG_ON(PageLRU(page)); | ||
| 491 | |||
| 492 | redo: | ||
| 493 | ClearPageUnevictable(page); | ||
| 494 | |||
| 495 | if (page_evictable(page, NULL)) { | ||
| 496 | /* | ||
| 497 | * For evictable pages, we can use the cache. | ||
| 498 | * In event of a race, worst case is we end up with an | ||
| 499 | * unevictable page on [in]active list. | ||
| 500 | * We know how to handle that. | ||
| 501 | */ | ||
| 502 | lru = active + page_is_file_cache(page); | ||
| 503 | lru_cache_add_lru(page, lru); | ||
| 504 | } else { | ||
| 505 | /* | ||
| 506 | * Put unevictable pages directly on zone's unevictable | ||
| 507 | * list. | ||
| 508 | */ | ||
| 509 | lru = LRU_UNEVICTABLE; | ||
| 510 | add_page_to_unevictable_list(page); | ||
| 511 | } | ||
| 512 | mem_cgroup_move_lists(page, lru); | ||
| 513 | |||
| 514 | /* | ||
| 515 | * page's status can change while we move it among lru. If an evictable | ||
| 516 | * page is on unevictable list, it never be freed. To avoid that, | ||
| 517 | * check after we added it to the list, again. | ||
| 518 | */ | ||
| 519 | if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { | ||
| 520 | if (!isolate_lru_page(page)) { | ||
| 521 | put_page(page); | ||
| 522 | goto redo; | ||
| 523 | } | ||
| 524 | /* This means someone else dropped this page from LRU | ||
| 525 | * So, it will be freed or putback to LRU again. There is | ||
| 526 | * nothing to do here. | ||
| 527 | */ | ||
| 528 | } | ||
| 529 | |||
| 530 | if (was_unevictable && lru != LRU_UNEVICTABLE) | ||
| 531 | count_vm_event(UNEVICTABLE_PGRESCUED); | ||
| 532 | else if (!was_unevictable && lru == LRU_UNEVICTABLE) | ||
| 533 | count_vm_event(UNEVICTABLE_PGCULLED); | ||
| 534 | |||
| 535 | put_page(page); /* drop ref from isolate */ | ||
| 536 | } | ||
| 537 | |||
| 538 | #else /* CONFIG_UNEVICTABLE_LRU */ | ||
| 539 | |||
| 540 | void putback_lru_page(struct page *page) | ||
| 541 | { | ||
| 542 | int lru; | ||
| 543 | VM_BUG_ON(PageLRU(page)); | ||
| 544 | |||
| 545 | lru = !!TestClearPageActive(page) + page_is_file_cache(page); | ||
| 546 | lru_cache_add_lru(page, lru); | ||
| 547 | mem_cgroup_move_lists(page, lru); | ||
| 548 | put_page(page); | ||
| 549 | } | ||
| 550 | #endif /* CONFIG_UNEVICTABLE_LRU */ | ||
| 551 | |||
| 552 | |||
| 473 | /* | 553 | /* |
| 474 | * shrink_page_list() returns the number of reclaimed pages | 554 | * shrink_page_list() returns the number of reclaimed pages |
| 475 | */ | 555 | */ |
| @@ -503,6 +583,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 503 | 583 | ||
| 504 | sc->nr_scanned++; | 584 | sc->nr_scanned++; |
| 505 | 585 | ||
| 586 | if (unlikely(!page_evictable(page, NULL))) | ||
| 587 | goto cull_mlocked; | ||
| 588 | |||
| 506 | if (!sc->may_swap && page_mapped(page)) | 589 | if (!sc->may_swap && page_mapped(page)) |
| 507 | goto keep_locked; | 590 | goto keep_locked; |
| 508 | 591 | ||
| @@ -539,9 +622,19 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 539 | * Anonymous process memory has backing store? | 622 | * Anonymous process memory has backing store? |
| 540 | * Try to allocate it some swap space here. | 623 | * Try to allocate it some swap space here. |
| 541 | */ | 624 | */ |
| 542 | if (PageAnon(page) && !PageSwapCache(page)) | 625 | if (PageAnon(page) && !PageSwapCache(page)) { |
| 626 | switch (try_to_munlock(page)) { | ||
| 627 | case SWAP_FAIL: /* shouldn't happen */ | ||
| 628 | case SWAP_AGAIN: | ||
| 629 | goto keep_locked; | ||
| 630 | case SWAP_MLOCK: | ||
| 631 | goto cull_mlocked; | ||
| 632 | case SWAP_SUCCESS: | ||
| 633 | ; /* fall through; add to swap cache */ | ||
| 634 | } | ||
| 543 | if (!add_to_swap(page, GFP_ATOMIC)) | 635 | if (!add_to_swap(page, GFP_ATOMIC)) |
| 544 | goto activate_locked; | 636 | goto activate_locked; |
| 637 | } | ||
| 545 | #endif /* CONFIG_SWAP */ | 638 | #endif /* CONFIG_SWAP */ |
| 546 | 639 | ||
| 547 | mapping = page_mapping(page); | 640 | mapping = page_mapping(page); |
| @@ -556,6 +649,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 556 | goto activate_locked; | 649 | goto activate_locked; |
| 557 | case SWAP_AGAIN: | 650 | case SWAP_AGAIN: |
| 558 | goto keep_locked; | 651 | goto keep_locked; |
| 652 | case SWAP_MLOCK: | ||
| 653 | goto cull_mlocked; | ||
| 559 | case SWAP_SUCCESS: | 654 | case SWAP_SUCCESS: |
| 560 | ; /* try to free the page below */ | 655 | ; /* try to free the page below */ |
| 561 | } | 656 | } |
| @@ -602,7 +697,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 602 | * possible for a page to have PageDirty set, but it is actually | 697 | * possible for a page to have PageDirty set, but it is actually |
| 603 | * clean (all its buffers are clean). This happens if the | 698 | * clean (all its buffers are clean). This happens if the |
| 604 | * buffers were written out directly, with submit_bh(). ext3 | 699 | * buffers were written out directly, with submit_bh(). ext3 |
| 605 | * will do this, as well as the blockdev mapping. | 700 | * will do this, as well as the blockdev mapping. |
| 606 | * try_to_release_page() will discover that cleanness and will | 701 | * try_to_release_page() will discover that cleanness and will |
| 607 | * drop the buffers and mark the page clean - it can be freed. | 702 | * drop the buffers and mark the page clean - it can be freed. |
| 608 | * | 703 | * |
| @@ -637,7 +732,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 637 | if (!mapping || !__remove_mapping(mapping, page)) | 732 | if (!mapping || !__remove_mapping(mapping, page)) |
| 638 | goto keep_locked; | 733 | goto keep_locked; |
| 639 | 734 | ||
| 640 | unlock_page(page); | 735 | /* |
| 736 | * At this point, we have no other references and there is | ||
| 737 | * no way to pick any more up (removed from LRU, removed | ||
| 738 | * from pagecache). Can use non-atomic bitops now (and | ||
| 739 | * we obviously don't have to worry about waking up a process | ||
| 740 | * waiting on the page lock, because there are no references). | ||
| 741 | */ | ||
| 742 | __clear_page_locked(page); | ||
| 641 | free_it: | 743 | free_it: |
| 642 | nr_reclaimed++; | 744 | nr_reclaimed++; |
| 643 | if (!pagevec_add(&freed_pvec, page)) { | 745 | if (!pagevec_add(&freed_pvec, page)) { |
| @@ -646,14 +748,23 @@ free_it: | |||
| 646 | } | 748 | } |
| 647 | continue; | 749 | continue; |
| 648 | 750 | ||
| 751 | cull_mlocked: | ||
| 752 | unlock_page(page); | ||
| 753 | putback_lru_page(page); | ||
| 754 | continue; | ||
| 755 | |||
| 649 | activate_locked: | 756 | activate_locked: |
| 757 | /* Not a candidate for swapping, so reclaim swap space. */ | ||
| 758 | if (PageSwapCache(page) && vm_swap_full()) | ||
| 759 | remove_exclusive_swap_page_ref(page); | ||
| 760 | VM_BUG_ON(PageActive(page)); | ||
| 650 | SetPageActive(page); | 761 | SetPageActive(page); |
| 651 | pgactivate++; | 762 | pgactivate++; |
| 652 | keep_locked: | 763 | keep_locked: |
| 653 | unlock_page(page); | 764 | unlock_page(page); |
| 654 | keep: | 765 | keep: |
| 655 | list_add(&page->lru, &ret_pages); | 766 | list_add(&page->lru, &ret_pages); |
| 656 | VM_BUG_ON(PageLRU(page)); | 767 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); |
| 657 | } | 768 | } |
| 658 | list_splice(&ret_pages, page_list); | 769 | list_splice(&ret_pages, page_list); |
| 659 | if (pagevec_count(&freed_pvec)) | 770 | if (pagevec_count(&freed_pvec)) |
| @@ -677,7 +788,7 @@ keep: | |||
| 677 | * | 788 | * |
| 678 | * returns 0 on success, -ve errno on failure. | 789 | * returns 0 on success, -ve errno on failure. |
| 679 | */ | 790 | */ |
| 680 | int __isolate_lru_page(struct page *page, int mode) | 791 | int __isolate_lru_page(struct page *page, int mode, int file) |
| 681 | { | 792 | { |
| 682 | int ret = -EINVAL; | 793 | int ret = -EINVAL; |
| 683 | 794 | ||
| @@ -693,6 +804,17 @@ int __isolate_lru_page(struct page *page, int mode) | |||
| 693 | if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) | 804 | if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) |
| 694 | return ret; | 805 | return ret; |
| 695 | 806 | ||
| 807 | if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) | ||
| 808 | return ret; | ||
| 809 | |||
| 810 | /* | ||
| 811 | * When this function is being called for lumpy reclaim, we | ||
| 812 | * initially look into all LRU pages, active, inactive and | ||
| 813 | * unevictable; only give shrink_page_list evictable pages. | ||
| 814 | */ | ||
| 815 | if (PageUnevictable(page)) | ||
| 816 | return ret; | ||
| 817 | |||
| 696 | ret = -EBUSY; | 818 | ret = -EBUSY; |
| 697 | if (likely(get_page_unless_zero(page))) { | 819 | if (likely(get_page_unless_zero(page))) { |
| 698 | /* | 820 | /* |
| @@ -723,12 +845,13 @@ int __isolate_lru_page(struct page *page, int mode) | |||
| 723 | * @scanned: The number of pages that were scanned. | 845 | * @scanned: The number of pages that were scanned. |
| 724 | * @order: The caller's attempted allocation order | 846 | * @order: The caller's attempted allocation order |
| 725 | * @mode: One of the LRU isolation modes | 847 | * @mode: One of the LRU isolation modes |
| 848 | * @file: True [1] if isolating file [!anon] pages | ||
| 726 | * | 849 | * |
| 727 | * returns how many pages were moved onto *@dst. | 850 | * returns how many pages were moved onto *@dst. |
| 728 | */ | 851 | */ |
| 729 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | 852 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
| 730 | struct list_head *src, struct list_head *dst, | 853 | struct list_head *src, struct list_head *dst, |
| 731 | unsigned long *scanned, int order, int mode) | 854 | unsigned long *scanned, int order, int mode, int file) |
| 732 | { | 855 | { |
| 733 | unsigned long nr_taken = 0; | 856 | unsigned long nr_taken = 0; |
| 734 | unsigned long scan; | 857 | unsigned long scan; |
| @@ -745,7 +868,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
| 745 | 868 | ||
| 746 | VM_BUG_ON(!PageLRU(page)); | 869 | VM_BUG_ON(!PageLRU(page)); |
| 747 | 870 | ||
| 748 | switch (__isolate_lru_page(page, mode)) { | 871 | switch (__isolate_lru_page(page, mode, file)) { |
| 749 | case 0: | 872 | case 0: |
| 750 | list_move(&page->lru, dst); | 873 | list_move(&page->lru, dst); |
| 751 | nr_taken++; | 874 | nr_taken++; |
| @@ -788,10 +911,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
| 788 | break; | 911 | break; |
| 789 | 912 | ||
| 790 | cursor_page = pfn_to_page(pfn); | 913 | cursor_page = pfn_to_page(pfn); |
| 914 | |||
| 791 | /* Check that we have not crossed a zone boundary. */ | 915 | /* Check that we have not crossed a zone boundary. */ |
| 792 | if (unlikely(page_zone_id(cursor_page) != zone_id)) | 916 | if (unlikely(page_zone_id(cursor_page) != zone_id)) |
| 793 | continue; | 917 | continue; |
| 794 | switch (__isolate_lru_page(cursor_page, mode)) { | 918 | switch (__isolate_lru_page(cursor_page, mode, file)) { |
| 795 | case 0: | 919 | case 0: |
| 796 | list_move(&cursor_page->lru, dst); | 920 | list_move(&cursor_page->lru, dst); |
| 797 | nr_taken++; | 921 | nr_taken++; |
| @@ -802,7 +926,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
| 802 | /* else it is being freed elsewhere */ | 926 | /* else it is being freed elsewhere */ |
| 803 | list_move(&cursor_page->lru, src); | 927 | list_move(&cursor_page->lru, src); |
| 804 | default: | 928 | default: |
| 805 | break; | 929 | break; /* ! on LRU or wrong list */ |
| 806 | } | 930 | } |
| 807 | } | 931 | } |
| 808 | } | 932 | } |
| @@ -816,40 +940,93 @@ static unsigned long isolate_pages_global(unsigned long nr, | |||
| 816 | unsigned long *scanned, int order, | 940 | unsigned long *scanned, int order, |
| 817 | int mode, struct zone *z, | 941 | int mode, struct zone *z, |
| 818 | struct mem_cgroup *mem_cont, | 942 | struct mem_cgroup *mem_cont, |
| 819 | int active) | 943 | int active, int file) |
| 820 | { | 944 | { |
| 945 | int lru = LRU_BASE; | ||
| 821 | if (active) | 946 | if (active) |
| 822 | return isolate_lru_pages(nr, &z->active_list, dst, | 947 | lru += LRU_ACTIVE; |
| 823 | scanned, order, mode); | 948 | if (file) |
| 824 | else | 949 | lru += LRU_FILE; |
| 825 | return isolate_lru_pages(nr, &z->inactive_list, dst, | 950 | return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, |
| 826 | scanned, order, mode); | 951 | mode, !!file); |
| 827 | } | 952 | } |
| 828 | 953 | ||
| 829 | /* | 954 | /* |
| 830 | * clear_active_flags() is a helper for shrink_active_list(), clearing | 955 | * clear_active_flags() is a helper for shrink_active_list(), clearing |
| 831 | * any active bits from the pages in the list. | 956 | * any active bits from the pages in the list. |
| 832 | */ | 957 | */ |
| 833 | static unsigned long clear_active_flags(struct list_head *page_list) | 958 | static unsigned long clear_active_flags(struct list_head *page_list, |
| 959 | unsigned int *count) | ||
| 834 | { | 960 | { |
| 835 | int nr_active = 0; | 961 | int nr_active = 0; |
| 962 | int lru; | ||
| 836 | struct page *page; | 963 | struct page *page; |
| 837 | 964 | ||
| 838 | list_for_each_entry(page, page_list, lru) | 965 | list_for_each_entry(page, page_list, lru) { |
| 966 | lru = page_is_file_cache(page); | ||
| 839 | if (PageActive(page)) { | 967 | if (PageActive(page)) { |
| 968 | lru += LRU_ACTIVE; | ||
| 840 | ClearPageActive(page); | 969 | ClearPageActive(page); |
| 841 | nr_active++; | 970 | nr_active++; |
| 842 | } | 971 | } |
| 972 | count[lru]++; | ||
| 973 | } | ||
| 843 | 974 | ||
| 844 | return nr_active; | 975 | return nr_active; |
| 845 | } | 976 | } |
| 846 | 977 | ||
| 978 | /** | ||
| 979 | * isolate_lru_page - tries to isolate a page from its LRU list | ||
| 980 | * @page: page to isolate from its LRU list | ||
| 981 | * | ||
| 982 | * Isolates a @page from an LRU list, clears PageLRU and adjusts the | ||
| 983 | * vmstat statistic corresponding to whatever LRU list the page was on. | ||
| 984 | * | ||
| 985 | * Returns 0 if the page was removed from an LRU list. | ||
| 986 | * Returns -EBUSY if the page was not on an LRU list. | ||
| 987 | * | ||
| 988 | * The returned page will have PageLRU() cleared. If it was found on | ||
| 989 | * the active list, it will have PageActive set. If it was found on | ||
| 990 | * the unevictable list, it will have the PageUnevictable bit set. That flag | ||
| 991 | * may need to be cleared by the caller before letting the page go. | ||
| 992 | * | ||
| 993 | * The vmstat statistic corresponding to the list on which the page was | ||
| 994 | * found will be decremented. | ||
| 995 | * | ||
| 996 | * Restrictions: | ||
| 997 | * (1) Must be called with an elevated refcount on the page. This is a | ||
| 998 | * fundamental difference from isolate_lru_pages (which is called | ||
| 999 | * without a stable reference). | ||
| 1000 | * (2) the lru_lock must not be held. | ||
| 1001 | * (3) interrupts must be enabled. | ||
| 1002 | */ | ||
| 1003 | int isolate_lru_page(struct page *page) | ||
| 1004 | { | ||
| 1005 | int ret = -EBUSY; | ||
| 1006 | |||
| 1007 | if (PageLRU(page)) { | ||
| 1008 | struct zone *zone = page_zone(page); | ||
| 1009 | |||
| 1010 | spin_lock_irq(&zone->lru_lock); | ||
| 1011 | if (PageLRU(page) && get_page_unless_zero(page)) { | ||
| 1012 | int lru = page_lru(page); | ||
| 1013 | ret = 0; | ||
| 1014 | ClearPageLRU(page); | ||
| 1015 | |||
| 1016 | del_page_from_lru_list(zone, page, lru); | ||
| 1017 | } | ||
| 1018 | spin_unlock_irq(&zone->lru_lock); | ||
| 1019 | } | ||
| 1020 | return ret; | ||
| 1021 | } | ||
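A minimal sketch of the isolate/putback pairing these helpers are built for (hypothetical caller, illustrative only): the caller already holds a page reference, per restriction (1) above, and putback_lru_page() drops the extra reference that isolate_lru_page() takes.

	/*
	 * Hypothetical example, illustrative only.  The caller holds a
	 * reference on the page (restriction 1); lru_lock is not held and
	 * interrupts are enabled (restrictions 2 and 3).
	 */
	static void requeue_on_lru(struct page *page)
	{
		if (isolate_lru_page(page))
			return;	/* -EBUSY: the page was not on any LRU list */

		/*
		 * The page is now off the LRU; PageActive or PageUnevictable
		 * may be set, and putback_lru_page() sorts that out while
		 * dropping the reference isolate_lru_page() took.
		 */
		putback_lru_page(page);
	}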
| 1022 | |||
| 847 | /* | 1023 | /* |
| 848 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number | 1024 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number |
| 849 | * of reclaimed pages | 1025 | * of reclaimed pages |
| 850 | */ | 1026 | */ |
| 851 | static unsigned long shrink_inactive_list(unsigned long max_scan, | 1027 | static unsigned long shrink_inactive_list(unsigned long max_scan, |
| 852 | struct zone *zone, struct scan_control *sc) | 1028 | struct zone *zone, struct scan_control *sc, |
| 1029 | int priority, int file) | ||
| 853 | { | 1030 | { |
| 854 | LIST_HEAD(page_list); | 1031 | LIST_HEAD(page_list); |
| 855 | struct pagevec pvec; | 1032 | struct pagevec pvec; |
| @@ -866,20 +1043,43 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 866 | unsigned long nr_scan; | 1043 | unsigned long nr_scan; |
| 867 | unsigned long nr_freed; | 1044 | unsigned long nr_freed; |
| 868 | unsigned long nr_active; | 1045 | unsigned long nr_active; |
| 1046 | unsigned int count[NR_LRU_LISTS] = { 0, }; | ||
| 1047 | int mode = ISOLATE_INACTIVE; | ||
| 1048 | |||
| 1049 | /* | ||
| 1050 | * If we need a large contiguous chunk of memory, or have | ||
| 1051 | * trouble getting a small set of contiguous pages, we | ||
| 1052 | * will reclaim both active and inactive pages. | ||
| 1053 | * | ||
| 1054 | * We use the same threshold as pageout congestion_wait below. | ||
| 1055 | */ | ||
| 1056 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
| 1057 | mode = ISOLATE_BOTH; | ||
| 1058 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
| 1059 | mode = ISOLATE_BOTH; | ||
| 869 | 1060 | ||
| 870 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, | 1061 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, |
| 871 | &page_list, &nr_scan, sc->order, | 1062 | &page_list, &nr_scan, sc->order, mode, |
| 872 | (sc->order > PAGE_ALLOC_COSTLY_ORDER)? | 1063 | zone, sc->mem_cgroup, 0, file); |
| 873 | ISOLATE_BOTH : ISOLATE_INACTIVE, | 1064 | nr_active = clear_active_flags(&page_list, count); |
| 874 | zone, sc->mem_cgroup, 0); | ||
| 875 | nr_active = clear_active_flags(&page_list); | ||
| 876 | __count_vm_events(PGDEACTIVATE, nr_active); | 1065 | __count_vm_events(PGDEACTIVATE, nr_active); |
| 877 | 1066 | ||
| 878 | __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); | 1067 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, |
| 879 | __mod_zone_page_state(zone, NR_INACTIVE, | 1068 | -count[LRU_ACTIVE_FILE]); |
| 880 | -(nr_taken - nr_active)); | 1069 | __mod_zone_page_state(zone, NR_INACTIVE_FILE, |
| 881 | if (scan_global_lru(sc)) | 1070 | -count[LRU_INACTIVE_FILE]); |
| 1071 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, | ||
| 1072 | -count[LRU_ACTIVE_ANON]); | ||
| 1073 | __mod_zone_page_state(zone, NR_INACTIVE_ANON, | ||
| 1074 | -count[LRU_INACTIVE_ANON]); | ||
| 1075 | |||
| 1076 | if (scan_global_lru(sc)) { | ||
| 882 | zone->pages_scanned += nr_scan; | 1077 | zone->pages_scanned += nr_scan; |
| 1078 | zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; | ||
| 1079 | zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; | ||
| 1080 | zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; | ||
| 1081 | zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; | ||
| 1082 | } | ||
| 883 | spin_unlock_irq(&zone->lru_lock); | 1083 | spin_unlock_irq(&zone->lru_lock); |
| 884 | 1084 | ||
| 885 | nr_scanned += nr_scan; | 1085 | nr_scanned += nr_scan; |
| @@ -899,7 +1099,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 899 | * The attempt at page out may have made some | 1099 | * The attempt at page out may have made some |
| 900 | * of the pages active, mark them inactive again. | 1100 | * of the pages active, mark them inactive again. |
| 901 | */ | 1101 | */ |
| 902 | nr_active = clear_active_flags(&page_list); | 1102 | nr_active = clear_active_flags(&page_list, count); |
| 903 | count_vm_events(PGDEACTIVATE, nr_active); | 1103 | count_vm_events(PGDEACTIVATE, nr_active); |
| 904 | 1104 | ||
| 905 | nr_freed += shrink_page_list(&page_list, sc, | 1105 | nr_freed += shrink_page_list(&page_list, sc, |
| @@ -924,14 +1124,24 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 924 | * Put back any unfreeable pages. | 1124 | * Put back any unfreeable pages. |
| 925 | */ | 1125 | */ |
| 926 | while (!list_empty(&page_list)) { | 1126 | while (!list_empty(&page_list)) { |
| 1127 | int lru; | ||
| 927 | page = lru_to_page(&page_list); | 1128 | page = lru_to_page(&page_list); |
| 928 | VM_BUG_ON(PageLRU(page)); | 1129 | VM_BUG_ON(PageLRU(page)); |
| 929 | SetPageLRU(page); | ||
| 930 | list_del(&page->lru); | 1130 | list_del(&page->lru); |
| 931 | if (PageActive(page)) | 1131 | if (unlikely(!page_evictable(page, NULL))) { |
| 932 | add_page_to_active_list(zone, page); | 1132 | spin_unlock_irq(&zone->lru_lock); |
| 933 | else | 1133 | putback_lru_page(page); |
| 934 | add_page_to_inactive_list(zone, page); | 1134 | spin_lock_irq(&zone->lru_lock); |
| 1135 | continue; | ||
| 1136 | } | ||
| 1137 | SetPageLRU(page); | ||
| 1138 | lru = page_lru(page); | ||
| 1139 | add_page_to_lru_list(zone, page, lru); | ||
| 1140 | mem_cgroup_move_lists(page, lru); | ||
| 1141 | if (PageActive(page) && scan_global_lru(sc)) { | ||
| 1142 | int file = !!page_is_file_cache(page); | ||
| 1143 | zone->recent_rotated[file]++; | ||
| 1144 | } | ||
| 935 | if (!pagevec_add(&pvec, page)) { | 1145 | if (!pagevec_add(&pvec, page)) { |
| 936 | spin_unlock_irq(&zone->lru_lock); | 1146 | spin_unlock_irq(&zone->lru_lock); |
| 937 | __pagevec_release(&pvec); | 1147 | __pagevec_release(&pvec); |
| @@ -962,115 +1172,7 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) | |||
| 962 | 1172 | ||
| 963 | static inline int zone_is_near_oom(struct zone *zone) | 1173 | static inline int zone_is_near_oom(struct zone *zone) |
| 964 | { | 1174 | { |
| 965 | return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) | 1175 | return zone->pages_scanned >= (zone_lru_pages(zone) * 3); |
| 966 | + zone_page_state(zone, NR_INACTIVE))*3; | ||
| 967 | } | ||
| 968 | |||
| 969 | /* | ||
| 970 | * Determine we should try to reclaim mapped pages. | ||
| 971 | * This is called only when sc->mem_cgroup is NULL. | ||
| 972 | */ | ||
| 973 | static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, | ||
| 974 | int priority) | ||
| 975 | { | ||
| 976 | long mapped_ratio; | ||
| 977 | long distress; | ||
| 978 | long swap_tendency; | ||
| 979 | long imbalance; | ||
| 980 | int reclaim_mapped = 0; | ||
| 981 | int prev_priority; | ||
| 982 | |||
| 983 | if (scan_global_lru(sc) && zone_is_near_oom(zone)) | ||
| 984 | return 1; | ||
| 985 | /* | ||
| 986 | * `distress' is a measure of how much trouble we're having | ||
| 987 | * reclaiming pages. 0 -> no problems. 100 -> great trouble. | ||
| 988 | */ | ||
| 989 | if (scan_global_lru(sc)) | ||
| 990 | prev_priority = zone->prev_priority; | ||
| 991 | else | ||
| 992 | prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup); | ||
| 993 | |||
| 994 | distress = 100 >> min(prev_priority, priority); | ||
| 995 | |||
| 996 | /* | ||
| 997 | * The point of this algorithm is to decide when to start | ||
| 998 | * reclaiming mapped memory instead of just pagecache. Work out | ||
| 999 | * how much memory | ||
| 1000 | * is mapped. | ||
| 1001 | */ | ||
| 1002 | if (scan_global_lru(sc)) | ||
| 1003 | mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + | ||
| 1004 | global_page_state(NR_ANON_PAGES)) * 100) / | ||
| 1005 | vm_total_pages; | ||
| 1006 | else | ||
| 1007 | mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup); | ||
| 1008 | |||
| 1009 | /* | ||
| 1010 | * Now decide how much we really want to unmap some pages. The | ||
| 1011 | * mapped ratio is downgraded - just because there's a lot of | ||
| 1012 | * mapped memory doesn't necessarily mean that page reclaim | ||
| 1013 | * isn't succeeding. | ||
| 1014 | * | ||
| 1015 | * The distress ratio is important - we don't want to start | ||
| 1016 | * going oom. | ||
| 1017 | * | ||
| 1018 | * A 100% value of vm_swappiness overrides this algorithm | ||
| 1019 | * altogether. | ||
| 1020 | */ | ||
| 1021 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; | ||
| 1022 | |||
| 1023 | /* | ||
| 1024 | * If there's huge imbalance between active and inactive | ||
| 1025 | * (think active 100 times larger than inactive) we should | ||
| 1026 | * become more permissive, or the system will take too much | ||
| 1027 | * cpu before it start swapping during memory pressure. | ||
| 1028 | * Distress is about avoiding early-oom, this is about | ||
| 1029 | * making swappiness graceful despite setting it to low | ||
| 1030 | * values. | ||
| 1031 | * | ||
| 1032 | * Avoid div by zero with nr_inactive+1, and max resulting | ||
| 1033 | * value is vm_total_pages. | ||
| 1034 | */ | ||
| 1035 | if (scan_global_lru(sc)) { | ||
| 1036 | imbalance = zone_page_state(zone, NR_ACTIVE); | ||
| 1037 | imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; | ||
| 1038 | } else | ||
| 1039 | imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup); | ||
| 1040 | |||
| 1041 | /* | ||
| 1042 | * Reduce the effect of imbalance if swappiness is low, | ||
| 1043 | * this means for a swappiness very low, the imbalance | ||
| 1044 | * must be much higher than 100 for this logic to make | ||
| 1045 | * the difference. | ||
| 1046 | * | ||
| 1047 | * Max temporary value is vm_total_pages*100. | ||
| 1048 | */ | ||
| 1049 | imbalance *= (vm_swappiness + 1); | ||
| 1050 | imbalance /= 100; | ||
| 1051 | |||
| 1052 | /* | ||
| 1053 | * If not much of the ram is mapped, makes the imbalance | ||
| 1054 | * less relevant, it's high priority we refill the inactive | ||
| 1055 | * list with mapped pages only in presence of high ratio of | ||
| 1056 | * mapped pages. | ||
| 1057 | * | ||
| 1058 | * Max temporary value is vm_total_pages*100. | ||
| 1059 | */ | ||
| 1060 | imbalance *= mapped_ratio; | ||
| 1061 | imbalance /= 100; | ||
| 1062 | |||
| 1063 | /* apply imbalance feedback to swap_tendency */ | ||
| 1064 | swap_tendency += imbalance; | ||
| 1065 | |||
| 1066 | /* | ||
| 1067 | * Now use this metric to decide whether to start moving mapped | ||
| 1068 | * memory onto the inactive list. | ||
| 1069 | */ | ||
| 1070 | if (swap_tendency >= 100) | ||
| 1071 | reclaim_mapped = 1; | ||
| 1072 | |||
| 1073 | return reclaim_mapped; | ||
| 1074 | } | 1176 | } |
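For contrast with get_scan_ratio() further down, the heuristic deleted here boiled down to a handful of integer operations around a single threshold. A throwaway userspace model with invented inputs (priority, mapped ratio, imbalance) shows how swap_tendency had to reach 100 before mapped pages became eligible for reclaim:

    #include <stdio.h>

    int main(void)
    {
            int priority = 10, prev_priority = 12, swappiness = 60;
            long mapped_ratio = 40;     /* invented: % of RAM that is mapped */
            long imbalance    = 3;      /* invented: active/inactive ratio   */

            /* distress: 0 when reclaim is easy, up to 100 near OOM */
            long distress = 100 >> (prev_priority < priority ?
                                    prev_priority : priority);

            long swap_tendency = mapped_ratio / 2 + distress + swappiness;

            /* imbalance feedback, damped by swappiness and mapped_ratio */
            imbalance = imbalance * (swappiness + 1) / 100;
            imbalance = imbalance * mapped_ratio / 100;
            swap_tendency += imbalance;

            printf("swap_tendency=%ld -> reclaim_mapped=%d\n",
                   swap_tendency, swap_tendency >= 100);
            return 0;
    }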
| 1075 | 1177 | ||
| 1076 | /* | 1178 | /* |
| @@ -1093,53 +1195,71 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, | |||
| 1093 | 1195 | ||
| 1094 | 1196 | ||
| 1095 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | 1197 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, |
| 1096 | struct scan_control *sc, int priority) | 1198 | struct scan_control *sc, int priority, int file) |
| 1097 | { | 1199 | { |
| 1098 | unsigned long pgmoved; | 1200 | unsigned long pgmoved; |
| 1099 | int pgdeactivate = 0; | 1201 | int pgdeactivate = 0; |
| 1100 | unsigned long pgscanned; | 1202 | unsigned long pgscanned; |
| 1101 | LIST_HEAD(l_hold); /* The pages which were snipped off */ | 1203 | LIST_HEAD(l_hold); /* The pages which were snipped off */ |
| 1102 | LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ | 1204 | LIST_HEAD(l_inactive); |
| 1103 | LIST_HEAD(l_active); /* Pages to go onto the active_list */ | ||
| 1104 | struct page *page; | 1205 | struct page *page; |
| 1105 | struct pagevec pvec; | 1206 | struct pagevec pvec; |
| 1106 | int reclaim_mapped = 0; | 1207 | enum lru_list lru; |
| 1107 | |||
| 1108 | if (sc->may_swap) | ||
| 1109 | reclaim_mapped = calc_reclaim_mapped(sc, zone, priority); | ||
| 1110 | 1208 | ||
| 1111 | lru_add_drain(); | 1209 | lru_add_drain(); |
| 1112 | spin_lock_irq(&zone->lru_lock); | 1210 | spin_lock_irq(&zone->lru_lock); |
| 1113 | pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, | 1211 | pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, |
| 1114 | ISOLATE_ACTIVE, zone, | 1212 | ISOLATE_ACTIVE, zone, |
| 1115 | sc->mem_cgroup, 1); | 1213 | sc->mem_cgroup, 1, file); |
| 1116 | /* | 1214 | /* |
| 1117 | * zone->pages_scanned is used for detect zone's oom | 1215 | * zone->pages_scanned is used for detect zone's oom |
| 1118 | * mem_cgroup remembers nr_scan by itself. | 1216 | * mem_cgroup remembers nr_scan by itself. |
| 1119 | */ | 1217 | */ |
| 1120 | if (scan_global_lru(sc)) | 1218 | if (scan_global_lru(sc)) { |
| 1121 | zone->pages_scanned += pgscanned; | 1219 | zone->pages_scanned += pgscanned; |
| 1220 | zone->recent_scanned[!!file] += pgmoved; | ||
| 1221 | } | ||
| 1122 | 1222 | ||
| 1123 | __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); | 1223 | if (file) |
| 1224 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); | ||
| 1225 | else | ||
| 1226 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); | ||
| 1124 | spin_unlock_irq(&zone->lru_lock); | 1227 | spin_unlock_irq(&zone->lru_lock); |
| 1125 | 1228 | ||
| 1229 | pgmoved = 0; | ||
| 1126 | while (!list_empty(&l_hold)) { | 1230 | while (!list_empty(&l_hold)) { |
| 1127 | cond_resched(); | 1231 | cond_resched(); |
| 1128 | page = lru_to_page(&l_hold); | 1232 | page = lru_to_page(&l_hold); |
| 1129 | list_del(&page->lru); | 1233 | list_del(&page->lru); |
| 1130 | if (page_mapped(page)) { | 1234 | |
| 1131 | if (!reclaim_mapped || | 1235 | if (unlikely(!page_evictable(page, NULL))) { |
| 1132 | (total_swap_pages == 0 && PageAnon(page)) || | 1236 | putback_lru_page(page); |
| 1133 | page_referenced(page, 0, sc->mem_cgroup)) { | 1237 | continue; |
| 1134 | list_add(&page->lru, &l_active); | ||
| 1135 | continue; | ||
| 1136 | } | ||
| 1137 | } | 1238 | } |
| 1239 | |||
| 1240 | /* page_referenced clears PageReferenced */ | ||
| 1241 | if (page_mapping_inuse(page) && | ||
| 1242 | page_referenced(page, 0, sc->mem_cgroup)) | ||
| 1243 | pgmoved++; | ||
| 1244 | |||
| 1138 | list_add(&page->lru, &l_inactive); | 1245 | list_add(&page->lru, &l_inactive); |
| 1139 | } | 1246 | } |
| 1140 | 1247 | ||
| 1248 | /* | ||
| 1249 | * Count referenced pages from currently used mappings as | ||
| 1250 | * rotated, even though they are moved to the inactive list. | ||
| 1251 | * This helps balance scan pressure between file and anonymous | ||
| 1252 | * pages in get_scan_ratio. | ||
| 1253 | */ | ||
| 1254 | zone->recent_rotated[!!file] += pgmoved; | ||
| 1255 | |||
| 1256 | /* | ||
| 1257 | * Move the pages to the [file or anon] inactive list. | ||
| 1258 | */ | ||
| 1141 | pagevec_init(&pvec, 1); | 1259 | pagevec_init(&pvec, 1); |
| 1260 | |||
| 1142 | pgmoved = 0; | 1261 | pgmoved = 0; |
| 1262 | lru = LRU_BASE + file * LRU_FILE; | ||
| 1143 | spin_lock_irq(&zone->lru_lock); | 1263 | spin_lock_irq(&zone->lru_lock); |
| 1144 | while (!list_empty(&l_inactive)) { | 1264 | while (!list_empty(&l_inactive)) { |
| 1145 | page = lru_to_page(&l_inactive); | 1265 | page = lru_to_page(&l_inactive); |
| @@ -1149,11 +1269,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 1149 | VM_BUG_ON(!PageActive(page)); | 1269 | VM_BUG_ON(!PageActive(page)); |
| 1150 | ClearPageActive(page); | 1270 | ClearPageActive(page); |
| 1151 | 1271 | ||
| 1152 | list_move(&page->lru, &zone->inactive_list); | 1272 | list_move(&page->lru, &zone->lru[lru].list); |
| 1153 | mem_cgroup_move_lists(page, false); | 1273 | mem_cgroup_move_lists(page, lru); |
| 1154 | pgmoved++; | 1274 | pgmoved++; |
| 1155 | if (!pagevec_add(&pvec, page)) { | 1275 | if (!pagevec_add(&pvec, page)) { |
| 1156 | __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); | 1276 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); |
| 1157 | spin_unlock_irq(&zone->lru_lock); | 1277 | spin_unlock_irq(&zone->lru_lock); |
| 1158 | pgdeactivate += pgmoved; | 1278 | pgdeactivate += pgmoved; |
| 1159 | pgmoved = 0; | 1279 | pgmoved = 0; |
| @@ -1163,104 +1283,189 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 1163 | spin_lock_irq(&zone->lru_lock); | 1283 | spin_lock_irq(&zone->lru_lock); |
| 1164 | } | 1284 | } |
| 1165 | } | 1285 | } |
| 1166 | __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); | 1286 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); |
| 1167 | pgdeactivate += pgmoved; | 1287 | pgdeactivate += pgmoved; |
| 1168 | if (buffer_heads_over_limit) { | 1288 | if (buffer_heads_over_limit) { |
| 1169 | spin_unlock_irq(&zone->lru_lock); | 1289 | spin_unlock_irq(&zone->lru_lock); |
| 1170 | pagevec_strip(&pvec); | 1290 | pagevec_strip(&pvec); |
| 1171 | spin_lock_irq(&zone->lru_lock); | 1291 | spin_lock_irq(&zone->lru_lock); |
| 1172 | } | 1292 | } |
| 1173 | |||
| 1174 | pgmoved = 0; | ||
| 1175 | while (!list_empty(&l_active)) { | ||
| 1176 | page = lru_to_page(&l_active); | ||
| 1177 | prefetchw_prev_lru_page(page, &l_active, flags); | ||
| 1178 | VM_BUG_ON(PageLRU(page)); | ||
| 1179 | SetPageLRU(page); | ||
| 1180 | VM_BUG_ON(!PageActive(page)); | ||
| 1181 | |||
| 1182 | list_move(&page->lru, &zone->active_list); | ||
| 1183 | mem_cgroup_move_lists(page, true); | ||
| 1184 | pgmoved++; | ||
| 1185 | if (!pagevec_add(&pvec, page)) { | ||
| 1186 | __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); | ||
| 1187 | pgmoved = 0; | ||
| 1188 | spin_unlock_irq(&zone->lru_lock); | ||
| 1189 | __pagevec_release(&pvec); | ||
| 1190 | spin_lock_irq(&zone->lru_lock); | ||
| 1191 | } | ||
| 1192 | } | ||
| 1193 | __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); | ||
| 1194 | |||
| 1195 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | 1293 | __count_zone_vm_events(PGREFILL, zone, pgscanned); |
| 1196 | __count_vm_events(PGDEACTIVATE, pgdeactivate); | 1294 | __count_vm_events(PGDEACTIVATE, pgdeactivate); |
| 1197 | spin_unlock_irq(&zone->lru_lock); | 1295 | spin_unlock_irq(&zone->lru_lock); |
| 1296 | if (vm_swap_full()) | ||
| 1297 | pagevec_swap_free(&pvec); | ||
| 1198 | 1298 | ||
| 1199 | pagevec_release(&pvec); | 1299 | pagevec_release(&pvec); |
| 1200 | } | 1300 | } |
| 1201 | 1301 | ||
| 1302 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | ||
| 1303 | struct zone *zone, struct scan_control *sc, int priority) | ||
| 1304 | { | ||
| 1305 | int file = is_file_lru(lru); | ||
| 1306 | |||
| 1307 | if (lru == LRU_ACTIVE_FILE) { | ||
| 1308 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | ||
| 1309 | return 0; | ||
| 1310 | } | ||
| 1311 | |||
| 1312 | if (lru == LRU_ACTIVE_ANON && | ||
| 1313 | (!scan_global_lru(sc) || inactive_anon_is_low(zone))) { | ||
| 1314 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | ||
| 1315 | return 0; | ||
| 1316 | } | ||
| 1317 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); | ||
| 1318 | } | ||
| 1319 | |||
| 1320 | /* | ||
| 1321 | * Determine how aggressively the anon and file LRU lists should be | ||
| 1322 | * scanned. The relative value of each set of LRU lists is determined | ||
| 1323 | * by looking at the fraction of the pages scanned we did rotate back | ||
| 1324 | * onto the active list instead of evicting. | ||
| 1325 | * | ||
| 1326 | * percent[0] specifies how much pressure to put on ram/swap backed | ||
| 1327 | * memory, while percent[1] determines pressure on the file LRUs. | ||
| 1328 | */ | ||
| 1329 | static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | ||
| 1330 | unsigned long *percent) | ||
| 1331 | { | ||
| 1332 | unsigned long anon, file, free; | ||
| 1333 | unsigned long anon_prio, file_prio; | ||
| 1334 | unsigned long ap, fp; | ||
| 1335 | |||
| 1336 | anon = zone_page_state(zone, NR_ACTIVE_ANON) + | ||
| 1337 | zone_page_state(zone, NR_INACTIVE_ANON); | ||
| 1338 | file = zone_page_state(zone, NR_ACTIVE_FILE) + | ||
| 1339 | zone_page_state(zone, NR_INACTIVE_FILE); | ||
| 1340 | free = zone_page_state(zone, NR_FREE_PAGES); | ||
| 1341 | |||
| 1342 | /* If we have no swap space, do not bother scanning anon pages. */ | ||
| 1343 | if (nr_swap_pages <= 0) { | ||
| 1344 | percent[0] = 0; | ||
| 1345 | percent[1] = 100; | ||
| 1346 | return; | ||
| 1347 | } | ||
| 1348 | |||
| 1349 | /* If we have very few page cache pages, force-scan anon pages. */ | ||
| 1350 | if (unlikely(file + free <= zone->pages_high)) { | ||
| 1351 | percent[0] = 100; | ||
| 1352 | percent[1] = 0; | ||
| 1353 | return; | ||
| 1354 | } | ||
| 1355 | |||
| 1356 | /* | ||
| 1357 | * OK, so we have swap space and a fair amount of page cache | ||
| 1358 | * pages. We use the recently rotated / recently scanned | ||
| 1359 | * ratios to determine how valuable each cache is. | ||
| 1360 | * | ||
| 1361 | * Because workloads change over time (and to avoid overflow) | ||
| 1362 | * we keep these statistics as a floating average, which ends | ||
| 1363 | * up weighing recent references more than old ones. | ||
| 1364 | * | ||
| 1365 | * anon in [0], file in [1] | ||
| 1366 | */ | ||
| 1367 | if (unlikely(zone->recent_scanned[0] > anon / 4)) { | ||
| 1368 | spin_lock_irq(&zone->lru_lock); | ||
| 1369 | zone->recent_scanned[0] /= 2; | ||
| 1370 | zone->recent_rotated[0] /= 2; | ||
| 1371 | spin_unlock_irq(&zone->lru_lock); | ||
| 1372 | } | ||
| 1373 | |||
| 1374 | if (unlikely(zone->recent_scanned[1] > file / 4)) { | ||
| 1375 | spin_lock_irq(&zone->lru_lock); | ||
| 1376 | zone->recent_scanned[1] /= 2; | ||
| 1377 | zone->recent_rotated[1] /= 2; | ||
| 1378 | spin_unlock_irq(&zone->lru_lock); | ||
| 1379 | } | ||
| 1380 | |||
| 1381 | /* | ||
| 1382 | * With swappiness at 100, anonymous and file have the same priority. | ||
| 1383 | * This scanning priority is essentially the inverse of IO cost. | ||
| 1384 | */ | ||
| 1385 | anon_prio = sc->swappiness; | ||
| 1386 | file_prio = 200 - sc->swappiness; | ||
| 1387 | |||
| 1388 | /* | ||
| 1389 | * anon recent_rotated[0] | ||
| 1390 | * %anon = 100 * ----------- / ----------------- * IO cost | ||
| 1391 | * anon + file rotate_sum | ||
| 1392 | */ | ||
| 1393 | ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); | ||
| 1394 | ap /= zone->recent_rotated[0] + 1; | ||
| 1395 | |||
| 1396 | fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); | ||
| 1397 | fp /= zone->recent_rotated[1] + 1; | ||
| 1398 | |||
| 1399 | /* Normalize to percentages */ | ||
| 1400 | percent[0] = 100 * ap / (ap + fp + 1); | ||
| 1401 | percent[1] = 100 - percent[0]; | ||
| 1402 | } | ||
| 1403 | |||
| 1404 | |||
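The ap/fp arithmetic above is easier to follow with numbers plugged in. A small userspace model of the normalization; the scanned/rotated counts are invented, and swappiness 60 is simply the usual default:

    #include <stdio.h>

    int main(void)
    {
            unsigned long swappiness = 60;
            unsigned long anon_prio  = swappiness;          /*  60 */
            unsigned long file_prio  = 200 - swappiness;    /* 140 */

            /* invented per-zone history: [0] = anon, [1] = file */
            unsigned long recent_scanned[2] = { 1000, 4000 };
            unsigned long recent_rotated[2] = {  800,  400 };

            unsigned long ap = (anon_prio + 1) * (recent_scanned[0] + 1)
                                    / (recent_rotated[0] + 1);
            unsigned long fp = (file_prio + 1) * (recent_scanned[1] + 1)
                                    / (recent_rotated[1] + 1);

            unsigned long percent_anon = 100 * ap / (ap + fp + 1);

            printf("anon %lu%%, file %lu%%\n", percent_anon, 100 - percent_anon);
            return 0;
    }

With these inputs the anon list, most of whose scanned pages were rotated back (reused), ends up with only a few percent of the scan pressure, while the rarely reused file pages absorb the rest.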
| 1202 | /* | 1405 | /* |
| 1203 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1406 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
| 1204 | */ | 1407 | */ |
| 1205 | static unsigned long shrink_zone(int priority, struct zone *zone, | 1408 | static unsigned long shrink_zone(int priority, struct zone *zone, |
| 1206 | struct scan_control *sc) | 1409 | struct scan_control *sc) |
| 1207 | { | 1410 | { |
| 1208 | unsigned long nr_active; | 1411 | unsigned long nr[NR_LRU_LISTS]; |
| 1209 | unsigned long nr_inactive; | ||
| 1210 | unsigned long nr_to_scan; | 1412 | unsigned long nr_to_scan; |
| 1211 | unsigned long nr_reclaimed = 0; | 1413 | unsigned long nr_reclaimed = 0; |
| 1414 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ | ||
| 1415 | enum lru_list l; | ||
| 1212 | 1416 | ||
| 1213 | if (scan_global_lru(sc)) { | 1417 | get_scan_ratio(zone, sc, percent); |
| 1214 | /* | ||
| 1215 | * Add one to nr_to_scan just to make sure that the kernel | ||
| 1216 | * will slowly sift through the active list. | ||
| 1217 | */ | ||
| 1218 | zone->nr_scan_active += | ||
| 1219 | (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; | ||
| 1220 | nr_active = zone->nr_scan_active; | ||
| 1221 | zone->nr_scan_inactive += | ||
| 1222 | (zone_page_state(zone, NR_INACTIVE) >> priority) + 1; | ||
| 1223 | nr_inactive = zone->nr_scan_inactive; | ||
| 1224 | if (nr_inactive >= sc->swap_cluster_max) | ||
| 1225 | zone->nr_scan_inactive = 0; | ||
| 1226 | else | ||
| 1227 | nr_inactive = 0; | ||
| 1228 | |||
| 1229 | if (nr_active >= sc->swap_cluster_max) | ||
| 1230 | zone->nr_scan_active = 0; | ||
| 1231 | else | ||
| 1232 | nr_active = 0; | ||
| 1233 | } else { | ||
| 1234 | /* | ||
| 1235 | * This reclaim occurs not because zone memory shortage but | ||
| 1236 | * because memory controller hits its limit. | ||
| 1237 | * Then, don't modify zone reclaim related data. | ||
| 1238 | */ | ||
| 1239 | nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup, | ||
| 1240 | zone, priority); | ||
| 1241 | |||
| 1242 | nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup, | ||
| 1243 | zone, priority); | ||
| 1244 | } | ||
| 1245 | 1418 | ||
| 1419 | for_each_evictable_lru(l) { | ||
| 1420 | if (scan_global_lru(sc)) { | ||
| 1421 | int file = is_file_lru(l); | ||
| 1422 | int scan; | ||
| 1246 | 1423 | ||
| 1247 | while (nr_active || nr_inactive) { | 1424 | scan = zone_page_state(zone, NR_LRU_BASE + l); |
| 1248 | if (nr_active) { | 1425 | if (priority) { |
| 1249 | nr_to_scan = min(nr_active, | 1426 | scan >>= priority; |
| 1250 | (unsigned long)sc->swap_cluster_max); | 1427 | scan = (scan * percent[file]) / 100; |
| 1251 | nr_active -= nr_to_scan; | 1428 | } |
| 1252 | shrink_active_list(nr_to_scan, zone, sc, priority); | 1429 | zone->lru[l].nr_scan += scan; |
| 1430 | nr[l] = zone->lru[l].nr_scan; | ||
| 1431 | if (nr[l] >= sc->swap_cluster_max) | ||
| 1432 | zone->lru[l].nr_scan = 0; | ||
| 1433 | else | ||
| 1434 | nr[l] = 0; | ||
| 1435 | } else { | ||
| 1436 | /* | ||
| 1437 | * This reclaim occurs not because zone memory shortage | ||
| 1438 | * but because memory controller hits its limit. | ||
| 1439 | * Don't modify zone reclaim related data. | ||
| 1440 | */ | ||
| 1441 | nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone, | ||
| 1442 | priority, l); | ||
| 1253 | } | 1443 | } |
| 1444 | } | ||
| 1254 | 1445 | ||
| 1255 | if (nr_inactive) { | 1446 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
| 1256 | nr_to_scan = min(nr_inactive, | 1447 | nr[LRU_INACTIVE_FILE]) { |
| 1448 | for_each_evictable_lru(l) { | ||
| 1449 | if (nr[l]) { | ||
| 1450 | nr_to_scan = min(nr[l], | ||
| 1257 | (unsigned long)sc->swap_cluster_max); | 1451 | (unsigned long)sc->swap_cluster_max); |
| 1258 | nr_inactive -= nr_to_scan; | 1452 | nr[l] -= nr_to_scan; |
| 1259 | nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, | 1453 | |
| 1260 | sc); | 1454 | nr_reclaimed += shrink_list(l, nr_to_scan, |
| 1455 | zone, sc, priority); | ||
| 1456 | } | ||
| 1261 | } | 1457 | } |
| 1262 | } | 1458 | } |
| 1263 | 1459 | ||
| 1460 | /* | ||
| 1461 | * Even if we did not try to evict anon pages at all, we want to | ||
| 1462 | * rebalance the anon lru active/inactive ratio. | ||
| 1463 | */ | ||
| 1464 | if (!scan_global_lru(sc) || inactive_anon_is_low(zone)) | ||
| 1465 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | ||
| 1466 | else if (!scan_global_lru(sc)) | ||
| 1467 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | ||
| 1468 | |||
| 1264 | throttle_vm_writeout(sc->gfp_mask); | 1469 | throttle_vm_writeout(sc->gfp_mask); |
| 1265 | return nr_reclaimed; | 1470 | return nr_reclaimed; |
| 1266 | } | 1471 | } |
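To get a feel for the scan targets computed at the top of the new shrink_zone(), here is the per-list arithmetic pulled out on its own, with made-up numbers; the result is accumulated in zone->lru[l].nr_scan and only acted upon once it reaches sc->swap_cluster_max:

    /* Illustration of the per-LRU scan target in shrink_zone() above. */
    static unsigned long pages_to_scan(unsigned long lru_size, int priority,
                                       unsigned long percent)
    {
            unsigned long scan = lru_size;

            if (priority) {
                    scan >>= priority;              /* DEF_PRIORITY (12) down to 0 */
                    scan = scan * percent / 100;    /* anon vs. file split         */
            }
            return scan;
    }

    /* e.g. pages_to_scan(262144, 12, 35) == 22, so several passes accumulate
     * in nr_scan before a batch of swap_cluster_max pages is actually scanned. */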
| @@ -1321,7 +1526,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | |||
| 1321 | 1526 | ||
| 1322 | return nr_reclaimed; | 1527 | return nr_reclaimed; |
| 1323 | } | 1528 | } |
| 1324 | 1529 | ||
| 1325 | /* | 1530 | /* |
| 1326 | * This is the main entry point to direct page reclaim. | 1531 | * This is the main entry point to direct page reclaim. |
| 1327 | * | 1532 | * |
| @@ -1364,8 +1569,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
| 1364 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1569 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
| 1365 | continue; | 1570 | continue; |
| 1366 | 1571 | ||
| 1367 | lru_pages += zone_page_state(zone, NR_ACTIVE) | 1572 | lru_pages += zone_lru_pages(zone); |
| 1368 | + zone_page_state(zone, NR_INACTIVE); | ||
| 1369 | } | 1573 | } |
| 1370 | } | 1574 | } |
| 1371 | 1575 | ||
| @@ -1555,6 +1759,14 @@ loop_again: | |||
| 1555 | priority != DEF_PRIORITY) | 1759 | priority != DEF_PRIORITY) |
| 1556 | continue; | 1760 | continue; |
| 1557 | 1761 | ||
| 1762 | /* | ||
| 1763 | * Do some background aging of the anon list, to give | ||
| 1764 | * pages a chance to be referenced before reclaiming. | ||
| 1765 | */ | ||
| 1766 | if (inactive_anon_is_low(zone)) | ||
| 1767 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | ||
| 1768 | &sc, priority, 0); | ||
| 1769 | |||
| 1558 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1770 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
| 1559 | 0, 0)) { | 1771 | 0, 0)) { |
| 1560 | end_zone = i; | 1772 | end_zone = i; |
| @@ -1567,8 +1779,7 @@ loop_again: | |||
| 1567 | for (i = 0; i <= end_zone; i++) { | 1779 | for (i = 0; i <= end_zone; i++) { |
| 1568 | struct zone *zone = pgdat->node_zones + i; | 1780 | struct zone *zone = pgdat->node_zones + i; |
| 1569 | 1781 | ||
| 1570 | lru_pages += zone_page_state(zone, NR_ACTIVE) | 1782 | lru_pages += zone_lru_pages(zone); |
| 1571 | + zone_page_state(zone, NR_INACTIVE); | ||
| 1572 | } | 1783 | } |
| 1573 | 1784 | ||
| 1574 | /* | 1785 | /* |
| @@ -1612,8 +1823,7 @@ loop_again: | |||
| 1612 | if (zone_is_all_unreclaimable(zone)) | 1823 | if (zone_is_all_unreclaimable(zone)) |
| 1613 | continue; | 1824 | continue; |
| 1614 | if (nr_slab == 0 && zone->pages_scanned >= | 1825 | if (nr_slab == 0 && zone->pages_scanned >= |
| 1615 | (zone_page_state(zone, NR_ACTIVE) | 1826 | (zone_lru_pages(zone) * 6)) |
| 1616 | + zone_page_state(zone, NR_INACTIVE)) * 6) | ||
| 1617 | zone_set_flag(zone, | 1827 | zone_set_flag(zone, |
| 1618 | ZONE_ALL_UNRECLAIMABLE); | 1828 | ZONE_ALL_UNRECLAIMABLE); |
| 1619 | /* | 1829 | /* |
| @@ -1667,7 +1877,7 @@ out: | |||
| 1667 | 1877 | ||
| 1668 | /* | 1878 | /* |
| 1669 | * The background pageout daemon, started as a kernel thread | 1879 | * The background pageout daemon, started as a kernel thread |
| 1670 | * from the init process. | 1880 | * from the init process. |
| 1671 | * | 1881 | * |
| 1672 | * This basically trickles out pages so that we have _some_ | 1882 | * This basically trickles out pages so that we have _some_ |
| 1673 | * free memory available even if there is no other activity | 1883 | * free memory available even if there is no other activity |
| @@ -1761,6 +1971,14 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
| 1761 | wake_up_interruptible(&pgdat->kswapd_wait); | 1971 | wake_up_interruptible(&pgdat->kswapd_wait); |
| 1762 | } | 1972 | } |
| 1763 | 1973 | ||
| 1974 | unsigned long global_lru_pages(void) | ||
| 1975 | { | ||
| 1976 | return global_page_state(NR_ACTIVE_ANON) | ||
| 1977 | + global_page_state(NR_ACTIVE_FILE) | ||
| 1978 | + global_page_state(NR_INACTIVE_ANON) | ||
| 1979 | + global_page_state(NR_INACTIVE_FILE); | ||
| 1980 | } | ||
| 1981 | |||
| 1764 | #ifdef CONFIG_PM | 1982 | #ifdef CONFIG_PM |
| 1765 | /* | 1983 | /* |
| 1766 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages | 1984 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages |
| @@ -1774,6 +1992,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, | |||
| 1774 | { | 1992 | { |
| 1775 | struct zone *zone; | 1993 | struct zone *zone; |
| 1776 | unsigned long nr_to_scan, ret = 0; | 1994 | unsigned long nr_to_scan, ret = 0; |
| 1995 | enum lru_list l; | ||
| 1777 | 1996 | ||
| 1778 | for_each_zone(zone) { | 1997 | for_each_zone(zone) { |
| 1779 | 1998 | ||
| @@ -1783,38 +2002,31 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, | |||
| 1783 | if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) | 2002 | if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) |
| 1784 | continue; | 2003 | continue; |
| 1785 | 2004 | ||
| 1786 | /* For pass = 0 we don't shrink the active list */ | 2005 | for_each_evictable_lru(l) { |
| 1787 | if (pass > 0) { | 2006 | /* For pass = 0, we don't shrink the active list */ |
| 1788 | zone->nr_scan_active += | 2007 | if (pass == 0 && |
| 1789 | (zone_page_state(zone, NR_ACTIVE) >> prio) + 1; | 2008 | (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE)) |
| 1790 | if (zone->nr_scan_active >= nr_pages || pass > 3) { | 2009 | continue; |
| 1791 | zone->nr_scan_active = 0; | 2010 | |
| 2011 | zone->lru[l].nr_scan += | ||
| 2012 | (zone_page_state(zone, NR_LRU_BASE + l) | ||
| 2013 | >> prio) + 1; | ||
| 2014 | if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { | ||
| 2015 | zone->lru[l].nr_scan = 0; | ||
| 1792 | nr_to_scan = min(nr_pages, | 2016 | nr_to_scan = min(nr_pages, |
| 1793 | zone_page_state(zone, NR_ACTIVE)); | 2017 | zone_page_state(zone, |
| 1794 | shrink_active_list(nr_to_scan, zone, sc, prio); | 2018 | NR_LRU_BASE + l)); |
| 2019 | ret += shrink_list(l, nr_to_scan, zone, | ||
| 2020 | sc, prio); | ||
| 2021 | if (ret >= nr_pages) | ||
| 2022 | return ret; | ||
| 1795 | } | 2023 | } |
| 1796 | } | 2024 | } |
| 1797 | |||
| 1798 | zone->nr_scan_inactive += | ||
| 1799 | (zone_page_state(zone, NR_INACTIVE) >> prio) + 1; | ||
| 1800 | if (zone->nr_scan_inactive >= nr_pages || pass > 3) { | ||
| 1801 | zone->nr_scan_inactive = 0; | ||
| 1802 | nr_to_scan = min(nr_pages, | ||
| 1803 | zone_page_state(zone, NR_INACTIVE)); | ||
| 1804 | ret += shrink_inactive_list(nr_to_scan, zone, sc); | ||
| 1805 | if (ret >= nr_pages) | ||
| 1806 | return ret; | ||
| 1807 | } | ||
| 1808 | } | 2025 | } |
| 1809 | 2026 | ||
| 1810 | return ret; | 2027 | return ret; |
| 1811 | } | 2028 | } |
| 1812 | 2029 | ||
| 1813 | static unsigned long count_lru_pages(void) | ||
| 1814 | { | ||
| 1815 | return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE); | ||
| 1816 | } | ||
| 1817 | |||
| 1818 | /* | 2030 | /* |
| 1819 | * Try to free `nr_pages' of memory, system-wide, and return the number of | 2031 | * Try to free `nr_pages' of memory, system-wide, and return the number of |
| 1820 | * freed pages. | 2032 | * freed pages. |
| @@ -1840,7 +2052,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
| 1840 | 2052 | ||
| 1841 | current->reclaim_state = &reclaim_state; | 2053 | current->reclaim_state = &reclaim_state; |
| 1842 | 2054 | ||
| 1843 | lru_pages = count_lru_pages(); | 2055 | lru_pages = global_lru_pages(); |
| 1844 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); | 2056 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); |
| 1845 | /* If slab caches are huge, it's better to hit them first */ | 2057 | /* If slab caches are huge, it's better to hit them first */ |
| 1846 | while (nr_slab >= lru_pages) { | 2058 | while (nr_slab >= lru_pages) { |
| @@ -1883,7 +2095,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
| 1883 | 2095 | ||
| 1884 | reclaim_state.reclaimed_slab = 0; | 2096 | reclaim_state.reclaimed_slab = 0; |
| 1885 | shrink_slab(sc.nr_scanned, sc.gfp_mask, | 2097 | shrink_slab(sc.nr_scanned, sc.gfp_mask, |
| 1886 | count_lru_pages()); | 2098 | global_lru_pages()); |
| 1887 | ret += reclaim_state.reclaimed_slab; | 2099 | ret += reclaim_state.reclaimed_slab; |
| 1888 | if (ret >= nr_pages) | 2100 | if (ret >= nr_pages) |
| 1889 | goto out; | 2101 | goto out; |
| @@ -1900,7 +2112,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
| 1900 | if (!ret) { | 2112 | if (!ret) { |
| 1901 | do { | 2113 | do { |
| 1902 | reclaim_state.reclaimed_slab = 0; | 2114 | reclaim_state.reclaimed_slab = 0; |
| 1903 | shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); | 2115 | shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); |
| 1904 | ret += reclaim_state.reclaimed_slab; | 2116 | ret += reclaim_state.reclaimed_slab; |
| 1905 | } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); | 2117 | } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); |
| 1906 | } | 2118 | } |
| @@ -2128,3 +2340,250 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2128 | return ret; | 2340 | return ret; |
| 2129 | } | 2341 | } |
| 2130 | #endif | 2342 | #endif |
| 2343 | |||
| 2344 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 2345 | /* | ||
| 2346 | * page_evictable - test whether a page is evictable | ||
| 2347 | * @page: the page to test | ||
| 2348 | * @vma: the VMA in which the page is or will be mapped, may be NULL | ||
| 2349 | * | ||
| 2350 | * Test whether page is evictable--i.e., should be placed on active/inactive | ||
| 2351 | * lists vs unevictable list. The vma argument is !NULL when called from the | ||
| 2352 | * fault path to determine how to instantiate a new page. | ||
| 2353 | * | ||
| 2354 | * Reasons page might not be evictable: | ||
| 2355 | * (1) page's mapping marked unevictable | ||
| 2356 | * (2) page is part of an mlocked VMA | ||
| 2357 | * | ||
| 2358 | */ | ||
| 2359 | int page_evictable(struct page *page, struct vm_area_struct *vma) | ||
| 2360 | { | ||
| 2361 | |||
| 2362 | if (mapping_unevictable(page_mapping(page))) | ||
| 2363 | return 0; | ||
| 2364 | |||
| 2365 | if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) | ||
| 2366 | return 0; | ||
| 2367 | |||
| 2368 | return 1; | ||
| 2369 | } | ||
| 2370 | |||
| 2371 | /** | ||
| 2372 | * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list | ||
| 2373 | * @page: page to check evictability and move to appropriate lru list | ||
| 2374 | * @zone: zone page is in | ||
| 2375 | * | ||
| 2376 | * Checks a page for evictability and moves the page to the appropriate | ||
| 2377 | * zone lru list. | ||
| 2378 | * | ||
| 2379 | * Restrictions: zone->lru_lock must be held, page must be on LRU and must | ||
| 2380 | * have PageUnevictable set. | ||
| 2381 | */ | ||
| 2382 | static void check_move_unevictable_page(struct page *page, struct zone *zone) | ||
| 2383 | { | ||
| 2384 | VM_BUG_ON(PageActive(page)); | ||
| 2385 | |||
| 2386 | retry: | ||
| 2387 | ClearPageUnevictable(page); | ||
| 2388 | if (page_evictable(page, NULL)) { | ||
| 2389 | enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); | ||
| 2390 | |||
| 2391 | __dec_zone_state(zone, NR_UNEVICTABLE); | ||
| 2392 | list_move(&page->lru, &zone->lru[l].list); | ||
| 2393 | __inc_zone_state(zone, NR_INACTIVE_ANON + l); | ||
| 2394 | __count_vm_event(UNEVICTABLE_PGRESCUED); | ||
| 2395 | } else { | ||
| 2396 | /* | ||
| 2397 | * rotate unevictable list | ||
| 2398 | */ | ||
| 2399 | SetPageUnevictable(page); | ||
| 2400 | list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); | ||
| 2401 | if (page_evictable(page, NULL)) | ||
| 2402 | goto retry; | ||
| 2403 | } | ||
| 2404 | } | ||
| 2405 | |||
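One non-obvious detail in check_move_unevictable_page() is the second page_evictable() test after the page has been put back on the unevictable list. An annotated restatement of that branch (no new logic, comments only) spells out the window it closes:

    SetPageUnevictable(page);
    list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
    /*
     * The page may have become evictable (for example, munlocked) between
     * the first page_evictable() test and the list_move().  Without this
     * re-test it could sit stranded on the unevictable list until some
     * later rescan; with it, we loop back and rescue the page immediately.
     */
    if (page_evictable(page, NULL))
            goto retry;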
| 2406 | /** | ||
| 2407 | * scan_mapping_unevictable_pages - scan an address space for evictable pages | ||
| 2408 | * @mapping: struct address_space to scan for evictable pages | ||
| 2409 | * | ||
| 2410 | * Scan all pages in mapping. Check unevictable pages for | ||
| 2411 | * evictability and move them to the appropriate zone lru list. | ||
| 2412 | */ | ||
| 2413 | void scan_mapping_unevictable_pages(struct address_space *mapping) | ||
| 2414 | { | ||
| 2415 | pgoff_t next = 0; | ||
| 2416 | pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> | ||
| 2417 | PAGE_CACHE_SHIFT; | ||
| 2418 | struct zone *zone; | ||
| 2419 | struct pagevec pvec; | ||
| 2420 | |||
| 2421 | if (mapping->nrpages == 0) | ||
| 2422 | return; | ||
| 2423 | |||
| 2424 | pagevec_init(&pvec, 0); | ||
| 2425 | while (next < end && | ||
| 2426 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | ||
| 2427 | int i; | ||
| 2428 | int pg_scanned = 0; | ||
| 2429 | |||
| 2430 | zone = NULL; | ||
| 2431 | |||
| 2432 | for (i = 0; i < pagevec_count(&pvec); i++) { | ||
| 2433 | struct page *page = pvec.pages[i]; | ||
| 2434 | pgoff_t page_index = page->index; | ||
| 2435 | struct zone *pagezone = page_zone(page); | ||
| 2436 | |||
| 2437 | pg_scanned++; | ||
| 2438 | if (page_index > next) | ||
| 2439 | next = page_index; | ||
| 2440 | next++; | ||
| 2441 | |||
| 2442 | if (pagezone != zone) { | ||
| 2443 | if (zone) | ||
| 2444 | spin_unlock_irq(&zone->lru_lock); | ||
| 2445 | zone = pagezone; | ||
| 2446 | spin_lock_irq(&zone->lru_lock); | ||
| 2447 | } | ||
| 2448 | |||
| 2449 | if (PageLRU(page) && PageUnevictable(page)) | ||
| 2450 | check_move_unevictable_page(page, zone); | ||
| 2451 | } | ||
| 2452 | if (zone) | ||
| 2453 | spin_unlock_irq(&zone->lru_lock); | ||
| 2454 | pagevec_release(&pvec); | ||
| 2455 | |||
| 2456 | count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); | ||
| 2457 | } | ||
| 2458 | |||
| 2459 | } | ||
| 2460 | |||
| 2461 | /** | ||
| 2462 | * scan_zone_unevictable_pages - check unevictable list for evictable pages | ||
| 2463 | * @zone: zone whose unevictable list is to be scanned | ||
| 2464 | * | ||
| 2465 | * Scan @zone's unevictable LRU lists to check for pages that have become | ||
| 2466 | * evictable. Move those that have to @zone's inactive list where they | ||
| 2467 | * become candidates for reclaim, unless shrink_inactive_zone() decides | ||
| 2468 | * to reactivate them. Pages that are still unevictable are rotated | ||
| 2469 | * back onto @zone's unevictable list. | ||
| 2470 | */ | ||
| 2471 | #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ | ||
| 2472 | void scan_zone_unevictable_pages(struct zone *zone) | ||
| 2473 | { | ||
| 2474 | struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; | ||
| 2475 | unsigned long scan; | ||
| 2476 | unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); | ||
| 2477 | |||
| 2478 | while (nr_to_scan > 0) { | ||
| 2479 | unsigned long batch_size = min(nr_to_scan, | ||
| 2480 | SCAN_UNEVICTABLE_BATCH_SIZE); | ||
| 2481 | |||
| 2482 | spin_lock_irq(&zone->lru_lock); | ||
| 2483 | for (scan = 0; scan < batch_size; scan++) { | ||
| 2484 | struct page *page = lru_to_page(l_unevictable); | ||
| 2485 | |||
| 2486 | if (!trylock_page(page)) | ||
| 2487 | continue; | ||
| 2488 | |||
| 2489 | prefetchw_prev_lru_page(page, l_unevictable, flags); | ||
| 2490 | |||
| 2491 | if (likely(PageLRU(page) && PageUnevictable(page))) | ||
| 2492 | check_move_unevictable_page(page, zone); | ||
| 2493 | |||
| 2494 | unlock_page(page); | ||
| 2495 | } | ||
| 2496 | spin_unlock_irq(&zone->lru_lock); | ||
| 2497 | |||
| 2498 | nr_to_scan -= batch_size; | ||
| 2499 | } | ||
| 2500 | } | ||
| 2501 | |||
| 2502 | |||
| 2503 | /** | ||
| 2504 | * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages | ||
| 2505 | * | ||
| 2506 | * A really big hammer: scan all zones' unevictable LRU lists to check for | ||
| 2507 | * pages that have become evictable. Move those back to the zones' | ||
| 2508 | * inactive list where they become candidates for reclaim. | ||
| 2509 | * This occurs when, e.g., we have unswappable pages on the unevictable lists, | ||
| 2510 | * and we add swap to the system. As such, it runs in the context of a task | ||
| 2511 | * that has possibly/probably made some previously unevictable pages | ||
| 2512 | * evictable. | ||
| 2513 | */ | ||
| 2514 | void scan_all_zones_unevictable_pages(void) | ||
| 2515 | { | ||
| 2516 | struct zone *zone; | ||
| 2517 | |||
| 2518 | for_each_zone(zone) { | ||
| 2519 | scan_zone_unevictable_pages(zone); | ||
| 2520 | } | ||
| 2521 | } | ||
| 2522 | |||
| 2523 | /* | ||
| 2524 | * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of | ||
| 2525 | * all nodes' unevictable lists for evictable pages | ||
| 2526 | */ | ||
| 2527 | unsigned long scan_unevictable_pages; | ||
| 2528 | |||
| 2529 | int scan_unevictable_handler(struct ctl_table *table, int write, | ||
| 2530 | struct file *file, void __user *buffer, | ||
| 2531 | size_t *length, loff_t *ppos) | ||
| 2532 | { | ||
| 2533 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | ||
| 2534 | |||
| 2535 | if (write && *(unsigned long *)table->data) | ||
| 2536 | scan_all_zones_unevictable_pages(); | ||
| 2537 | |||
| 2538 | scan_unevictable_pages = 0; | ||
| 2539 | return 0; | ||
| 2540 | } | ||
| 2541 | |||
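Assuming the handler above is wired into the usual vm sysctl table (done outside this patch), writing any non-zero value to /proc/sys/vm/scan_unevictable_pages forces a system-wide rescan. A tiny userspace trigger, with that path taken as an assumption:

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            /* Path assumes the customary sysctl wiring for this knob. */
            int fd = open("/proc/sys/vm/scan_unevictable_pages", O_WRONLY);
            if (fd < 0)
                    return 1;
            if (write(fd, "1\n", 2) != 2) {
                    close(fd);
                    return 1;
            }
            close(fd);
            return 0;
    }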
| 2542 | /* | ||
| 2543 | * per node 'scan_unevictable_pages' attribute. On demand re-scan of | ||
| 2544 | * a specified node's per zone unevictable lists for evictable pages. | ||
| 2545 | */ | ||
| 2546 | |||
| 2547 | static ssize_t read_scan_unevictable_node(struct sys_device *dev, | ||
| 2548 | struct sysdev_attribute *attr, | ||
| 2549 | char *buf) | ||
| 2550 | { | ||
| 2551 | return sprintf(buf, "0\n"); /* always zero; should fit... */ | ||
| 2552 | } | ||
| 2553 | |||
| 2554 | static ssize_t write_scan_unevictable_node(struct sys_device *dev, | ||
| 2555 | struct sysdev_attribute *attr, | ||
| 2556 | const char *buf, size_t count) | ||
| 2557 | { | ||
| 2558 | struct zone *node_zones = NODE_DATA(dev->id)->node_zones; | ||
| 2559 | struct zone *zone; | ||
| 2560 | unsigned long res; | ||
| 2561 | unsigned long req = strict_strtoul(buf, 10, &res); | ||
| 2562 | |||
| 2563 | if (!req) | ||
| 2564 | return 1; /* zero is no-op */ | ||
| 2565 | |||
| 2566 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
| 2567 | if (!populated_zone(zone)) | ||
| 2568 | continue; | ||
| 2569 | scan_zone_unevictable_pages(zone); | ||
| 2570 | } | ||
| 2571 | return 1; | ||
| 2572 | } | ||
| 2573 | |||
| 2574 | |||
| 2575 | static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, | ||
| 2576 | read_scan_unevictable_node, | ||
| 2577 | write_scan_unevictable_node); | ||
| 2578 | |||
| 2579 | int scan_unevictable_register_node(struct node *node) | ||
| 2580 | { | ||
| 2581 | return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages); | ||
| 2582 | } | ||
| 2583 | |||
| 2584 | void scan_unevictable_unregister_node(struct node *node) | ||
| 2585 | { | ||
| 2586 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); | ||
| 2587 | } | ||
| 2588 | |||
| 2589 | #endif | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index d7826af2fb07..c3ccfda23adc 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -8,7 +8,7 @@ | |||
| 8 | * Copyright (C) 2006 Silicon Graphics, Inc., | 8 | * Copyright (C) 2006 Silicon Graphics, Inc., |
| 9 | * Christoph Lameter <christoph@lameter.com> | 9 | * Christoph Lameter <christoph@lameter.com> |
| 10 | */ | 10 | */ |
| 11 | 11 | #include <linux/fs.h> | |
| 12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
| 13 | #include <linux/err.h> | 13 | #include <linux/err.h> |
| 14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| @@ -384,7 +384,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) | |||
| 384 | #endif | 384 | #endif |
| 385 | 385 | ||
| 386 | #ifdef CONFIG_PROC_FS | 386 | #ifdef CONFIG_PROC_FS |
| 387 | 387 | #include <linux/proc_fs.h> | |
| 388 | #include <linux/seq_file.h> | 388 | #include <linux/seq_file.h> |
| 389 | 389 | ||
| 390 | static char * const migratetype_names[MIGRATE_TYPES] = { | 390 | static char * const migratetype_names[MIGRATE_TYPES] = { |
| @@ -581,20 +581,44 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg) | |||
| 581 | return 0; | 581 | return 0; |
| 582 | } | 582 | } |
| 583 | 583 | ||
| 584 | const struct seq_operations fragmentation_op = { | 584 | static const struct seq_operations fragmentation_op = { |
| 585 | .start = frag_start, | 585 | .start = frag_start, |
| 586 | .next = frag_next, | 586 | .next = frag_next, |
| 587 | .stop = frag_stop, | 587 | .stop = frag_stop, |
| 588 | .show = frag_show, | 588 | .show = frag_show, |
| 589 | }; | 589 | }; |
| 590 | 590 | ||
| 591 | const struct seq_operations pagetypeinfo_op = { | 591 | static int fragmentation_open(struct inode *inode, struct file *file) |
| 592 | { | ||
| 593 | return seq_open(file, &fragmentation_op); | ||
| 594 | } | ||
| 595 | |||
| 596 | static const struct file_operations fragmentation_file_operations = { | ||
| 597 | .open = fragmentation_open, | ||
| 598 | .read = seq_read, | ||
| 599 | .llseek = seq_lseek, | ||
| 600 | .release = seq_release, | ||
| 601 | }; | ||
| 602 | |||
| 603 | static const struct seq_operations pagetypeinfo_op = { | ||
| 592 | .start = frag_start, | 604 | .start = frag_start, |
| 593 | .next = frag_next, | 605 | .next = frag_next, |
| 594 | .stop = frag_stop, | 606 | .stop = frag_stop, |
| 595 | .show = pagetypeinfo_show, | 607 | .show = pagetypeinfo_show, |
| 596 | }; | 608 | }; |
| 597 | 609 | ||
| 610 | static int pagetypeinfo_open(struct inode *inode, struct file *file) | ||
| 611 | { | ||
| 612 | return seq_open(file, &pagetypeinfo_op); | ||
| 613 | } | ||
| 614 | |||
| 615 | static const struct file_operations pagetypeinfo_file_ops = { | ||
| 616 | .open = pagetypeinfo_open, | ||
| 617 | .read = seq_read, | ||
| 618 | .llseek = seq_lseek, | ||
| 619 | .release = seq_release, | ||
| 620 | }; | ||
| 621 | |||
| 598 | #ifdef CONFIG_ZONE_DMA | 622 | #ifdef CONFIG_ZONE_DMA |
| 599 | #define TEXT_FOR_DMA(xx) xx "_dma", | 623 | #define TEXT_FOR_DMA(xx) xx "_dma", |
| 600 | #else | 624 | #else |
| @@ -619,8 +643,14 @@ const struct seq_operations pagetypeinfo_op = { | |||
| 619 | static const char * const vmstat_text[] = { | 643 | static const char * const vmstat_text[] = { |
| 620 | /* Zoned VM counters */ | 644 | /* Zoned VM counters */ |
| 621 | "nr_free_pages", | 645 | "nr_free_pages", |
| 622 | "nr_inactive", | 646 | "nr_inactive_anon", |
| 623 | "nr_active", | 647 | "nr_active_anon", |
| 648 | "nr_inactive_file", | ||
| 649 | "nr_active_file", | ||
| 650 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 651 | "nr_unevictable", | ||
| 652 | "nr_mlock", | ||
| 653 | #endif | ||
| 624 | "nr_anon_pages", | 654 | "nr_anon_pages", |
| 625 | "nr_mapped", | 655 | "nr_mapped", |
| 626 | "nr_file_pages", | 656 | "nr_file_pages", |
| @@ -675,6 +705,16 @@ static const char * const vmstat_text[] = { | |||
| 675 | "htlb_buddy_alloc_success", | 705 | "htlb_buddy_alloc_success", |
| 676 | "htlb_buddy_alloc_fail", | 706 | "htlb_buddy_alloc_fail", |
| 677 | #endif | 707 | #endif |
| 708 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 709 | "unevictable_pgs_culled", | ||
| 710 | "unevictable_pgs_scanned", | ||
| 711 | "unevictable_pgs_rescued", | ||
| 712 | "unevictable_pgs_mlocked", | ||
| 713 | "unevictable_pgs_munlocked", | ||
| 714 | "unevictable_pgs_cleared", | ||
| 715 | "unevictable_pgs_stranded", | ||
| 716 | "unevictable_pgs_mlockfreed", | ||
| 717 | #endif | ||
| 678 | #endif | 718 | #endif |
| 679 | }; | 719 | }; |
| 680 | 720 | ||
| @@ -688,7 +728,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
| 688 | "\n min %lu" | 728 | "\n min %lu" |
| 689 | "\n low %lu" | 729 | "\n low %lu" |
| 690 | "\n high %lu" | 730 | "\n high %lu" |
| 691 | "\n scanned %lu (a: %lu i: %lu)" | 731 | "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" |
| 692 | "\n spanned %lu" | 732 | "\n spanned %lu" |
| 693 | "\n present %lu", | 733 | "\n present %lu", |
| 694 | zone_page_state(zone, NR_FREE_PAGES), | 734 | zone_page_state(zone, NR_FREE_PAGES), |
| @@ -696,7 +736,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
| 696 | zone->pages_low, | 736 | zone->pages_low, |
| 697 | zone->pages_high, | 737 | zone->pages_high, |
| 698 | zone->pages_scanned, | 738 | zone->pages_scanned, |
| 699 | zone->nr_scan_active, zone->nr_scan_inactive, | 739 | zone->lru[LRU_ACTIVE_ANON].nr_scan, |
| 740 | zone->lru[LRU_INACTIVE_ANON].nr_scan, | ||
| 741 | zone->lru[LRU_ACTIVE_FILE].nr_scan, | ||
| 742 | zone->lru[LRU_INACTIVE_FILE].nr_scan, | ||
| 700 | zone->spanned_pages, | 743 | zone->spanned_pages, |
| 701 | zone->present_pages); | 744 | zone->present_pages); |
| 702 | 745 | ||
| @@ -733,10 +776,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
| 733 | seq_printf(m, | 776 | seq_printf(m, |
| 734 | "\n all_unreclaimable: %u" | 777 | "\n all_unreclaimable: %u" |
| 735 | "\n prev_priority: %i" | 778 | "\n prev_priority: %i" |
| 736 | "\n start_pfn: %lu", | 779 | "\n start_pfn: %lu" |
| 780 | "\n inactive_ratio: %u", | ||
| 737 | zone_is_all_unreclaimable(zone), | 781 | zone_is_all_unreclaimable(zone), |
| 738 | zone->prev_priority, | 782 | zone->prev_priority, |
| 739 | zone->zone_start_pfn); | 783 | zone->zone_start_pfn, |
| 784 | zone->inactive_ratio); | ||
| 740 | seq_putc(m, '\n'); | 785 | seq_putc(m, '\n'); |
| 741 | } | 786 | } |
| 742 | 787 | ||
| @@ -750,7 +795,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
| 750 | return 0; | 795 | return 0; |
| 751 | } | 796 | } |
| 752 | 797 | ||
| 753 | const struct seq_operations zoneinfo_op = { | 798 | static const struct seq_operations zoneinfo_op = { |
| 754 | .start = frag_start, /* iterate over all zones. The same as in | 799 | .start = frag_start, /* iterate over all zones. The same as in |
| 755 | * fragmentation. */ | 800 | * fragmentation. */ |
| 756 | .next = frag_next, | 801 | .next = frag_next, |
| @@ -758,6 +803,18 @@ const struct seq_operations zoneinfo_op = { | |||
| 758 | .show = zoneinfo_show, | 803 | .show = zoneinfo_show, |
| 759 | }; | 804 | }; |
| 760 | 805 | ||
| 806 | static int zoneinfo_open(struct inode *inode, struct file *file) | ||
| 807 | { | ||
| 808 | return seq_open(file, &zoneinfo_op); | ||
| 809 | } | ||
| 810 | |||
| 811 | static const struct file_operations proc_zoneinfo_file_operations = { | ||
| 812 | .open = zoneinfo_open, | ||
| 813 | .read = seq_read, | ||
| 814 | .llseek = seq_lseek, | ||
| 815 | .release = seq_release, | ||
| 816 | }; | ||
| 817 | |||
| 761 | static void *vmstat_start(struct seq_file *m, loff_t *pos) | 818 | static void *vmstat_start(struct seq_file *m, loff_t *pos) |
| 762 | { | 819 | { |
| 763 | unsigned long *v; | 820 | unsigned long *v; |
| @@ -813,13 +870,24 @@ static void vmstat_stop(struct seq_file *m, void *arg) | |||
| 813 | m->private = NULL; | 870 | m->private = NULL; |
| 814 | } | 871 | } |
| 815 | 872 | ||
| 816 | const struct seq_operations vmstat_op = { | 873 | static const struct seq_operations vmstat_op = { |
| 817 | .start = vmstat_start, | 874 | .start = vmstat_start, |
| 818 | .next = vmstat_next, | 875 | .next = vmstat_next, |
| 819 | .stop = vmstat_stop, | 876 | .stop = vmstat_stop, |
| 820 | .show = vmstat_show, | 877 | .show = vmstat_show, |
| 821 | }; | 878 | }; |
| 822 | 879 | ||
| 880 | static int vmstat_open(struct inode *inode, struct file *file) | ||
| 881 | { | ||
| 882 | return seq_open(file, &vmstat_op); | ||
| 883 | } | ||
| 884 | |||
| 885 | static const struct file_operations proc_vmstat_file_operations = { | ||
| 886 | .open = vmstat_open, | ||
| 887 | .read = seq_read, | ||
| 888 | .llseek = seq_lseek, | ||
| 889 | .release = seq_release, | ||
| 890 | }; | ||
| 823 | #endif /* CONFIG_PROC_FS */ | 891 | #endif /* CONFIG_PROC_FS */ |
| 824 | 892 | ||
| 825 | #ifdef CONFIG_SMP | 893 | #ifdef CONFIG_SMP |
| @@ -877,9 +945,11 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
| 877 | 945 | ||
| 878 | static struct notifier_block __cpuinitdata vmstat_notifier = | 946 | static struct notifier_block __cpuinitdata vmstat_notifier = |
| 879 | { &vmstat_cpuup_callback, NULL, 0 }; | 947 | { &vmstat_cpuup_callback, NULL, 0 }; |
| 948 | #endif | ||
| 880 | 949 | ||
| 881 | static int __init setup_vmstat(void) | 950 | static int __init setup_vmstat(void) |
| 882 | { | 951 | { |
| 952 | #ifdef CONFIG_SMP | ||
| 883 | int cpu; | 953 | int cpu; |
| 884 | 954 | ||
| 885 | refresh_zone_stat_thresholds(); | 955 | refresh_zone_stat_thresholds(); |
| @@ -887,7 +957,13 @@ static int __init setup_vmstat(void) | |||
| 887 | 957 | ||
| 888 | for_each_online_cpu(cpu) | 958 | for_each_online_cpu(cpu) |
| 889 | start_cpu_timer(cpu); | 959 | start_cpu_timer(cpu); |
| 960 | #endif | ||
| 961 | #ifdef CONFIG_PROC_FS | ||
| 962 | proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); | ||
| 963 | proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); | ||
| 964 | proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); | ||
| 965 | proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); | ||
| 966 | #endif | ||
| 890 | return 0; | 967 | return 0; |
| 891 | } | 968 | } |
| 892 | module_init(setup_vmstat) | 969 | module_init(setup_vmstat) |
| 893 | #endif | ||
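The vmstat.c hunks above stop exporting the seq_operations tables and instead give each proc file its own open routine and file_operations, registered with proc_create(). A minimal self-contained sketch of that registration pattern for a 2.6.2x kernel; the entry name seqfile_demo is made up, and single_open() is used because this toy file has a single record, whereas /proc/vmstat keeps full seq_operations so it can iterate over many counters:

    #include <linux/module.h>
    #include <linux/fs.h>
    #include <linux/proc_fs.h>
    #include <linux/seq_file.h>

    static int demo_show(struct seq_file *m, void *v)
    {
            seq_printf(m, "hello from a seq_file-backed proc entry\n");
            return 0;
    }

    static int demo_open(struct inode *inode, struct file *file)
    {
            return single_open(file, demo_show, NULL);
    }

    static const struct file_operations demo_fops = {
            .owner   = THIS_MODULE,
            .open    = demo_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release,
    };

    static int __init demo_init(void)
    {
            proc_create("seqfile_demo", S_IRUGO, NULL, &demo_fops);
            return 0;
    }

    static void __exit demo_exit(void)
    {
            remove_proc_entry("seqfile_demo", NULL);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");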
