author     Linus Torvalds <torvalds@linux-foundation.org>   2016-07-28 19:36:48 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-07-28 19:36:48 -0400
commit     1c88e19b0f6a8471ee50d5062721ba30b8fd4ba9 (patch)
tree       6d227487ca2cf391589c73af1c40ec7b7126feec
parent     6039b80eb50a893476fea7d56e86ed2d19290054 (diff)
parent     c3486f5376696034d0fcbef8ba70c70cfcb26f51 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
"The rest of MM"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (101 commits)
mm, compaction: simplify contended compaction handling
mm, compaction: introduce direct compaction priority
mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations
mm, page_alloc: make THP-specific decisions more generic
mm, page_alloc: restructure direct compaction handling in slowpath
mm, page_alloc: don't retry initial attempt in slowpath
mm, page_alloc: set alloc_flags only once in slowpath
lib/stackdepot.c: use __GFP_NOWARN for stack allocations
mm, kasan: switch SLUB to stackdepot, enable memory quarantine for SLUB
mm, kasan: account for object redzone in SLUB's nearest_obj()
mm: fix use-after-free if memory allocation failed in vma_adjust()
zsmalloc: Delete an unnecessary check before the function call "iput"
mm/memblock.c: fix index adjustment error in __next_mem_range_rev()
mem-hotplug: alloc new page from a nearest neighbor node when mem-offline
mm: optimize copy_page_to/from_iter_iovec
mm: add cond_resched() to generic_swapfile_activate()
Revert "mm, mempool: only set __GFP_NOMEMALLOC if there are free elements"
mm, compaction: don't isolate PageWriteback pages in MIGRATE_SYNC_LIGHT mode
mm: hwpoison: remove incorrect comments
make __section_nr() more efficient
...
90 files changed, 2517 insertions, 1978 deletions
diff --git a/Documentation/cgroup-v1/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.txt
index 8870b0212150..78a8c2963b38 100644
--- a/Documentation/cgroup-v1/memcg_test.txt
+++ b/Documentation/cgroup-v1/memcg_test.txt
@@ -107,9 +107,9 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.

 8. LRU
 Each memcg has its own private LRU. Now, its handling is under global
-VM's control (means that it's handled under global zone->lru_lock).
+VM's control (means that it's handled under global zone_lru_lock).
 Almost all routines around memcg's LRU is called by global LRU's
-list management functions under zone->lru_lock().
+list management functions under zone_lru_lock().

 A special function is mem_cgroup_isolate_pages(). This scans
 memcg's private LRU and call __isolate_lru_page() to extract a page
diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt
index b14abf217239..946e69103cdd 100644
--- a/Documentation/cgroup-v1/memory.txt
+++ b/Documentation/cgroup-v1/memory.txt
@@ -267,11 +267,11 @@ When oom event notifier is registered, event will be delivered.
 Other lock order is following:
 PG_locked.
 mm->page_table_lock
-zone->lru_lock
+zone_lru_lock
 lock_page_cgroup.
 In many cases, just lock_page_cgroup() is called.
 per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
-zone->lru_lock, it has no lock of its own.
+zone_lru_lock, it has no lock of its own.

 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 2ade7a6a10a7..bbb7ee76e319 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -224,7 +224,7 @@ void __init arm64_memblock_init(void)
 * via the linear mapping.
 */
 if (memory_limit != (phys_addr_t)ULLONG_MAX) {
-memblock_enforce_memory_limit(memory_limit);
+memblock_mem_limit_remove_map(memory_limit);
 memblock_add(__pa(_text), (u64)(_end - _text));
 }

diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index edcf2a706942..598df5708501 100644
--- a/arch/s390/appldata/appldata_mem.c
+++ b/arch/s390/appldata/appldata_mem.c
@@ -102,7 +102,7 @@ static void appldata_get_mem_data(void *data)
 mem_data->totalhigh = P2K(val.totalhigh);
 mem_data->freehigh = P2K(val.freehigh);
 mem_data->bufferram = P2K(val.bufferram);
-mem_data->cached = P2K(global_page_state(NR_FILE_PAGES)
+mem_data->cached = P2K(global_node_page_state(NR_FILE_PAGES)
 - val.bufferram);

 si_swapinfo(&val);
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index c4d5bf841a7f..7cc6ee7f1a58 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -45,20 +45,20 @@ void show_mem(unsigned int filter)
 struct zone *zone;

 pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu pagecache:%lu swap:%lu\n",
-(global_page_state(NR_ACTIVE_ANON) +
-global_page_state(NR_ACTIVE_FILE)),
-(global_page_state(NR_INACTIVE_ANON) +
-global_page_state(NR_INACTIVE_FILE)),
-global_page_state(NR_FILE_DIRTY),
-global_page_state(NR_WRITEBACK),
-global_page_state(NR_UNSTABLE_NFS),
+(global_node_page_state(NR_ACTIVE_ANON) +
+global_node_page_state(NR_ACTIVE_FILE)),
+(global_node_page_state(NR_INACTIVE_ANON) +
+global_node_page_state(NR_INACTIVE_FILE)),
+global_node_page_state(NR_FILE_DIRTY),
+global_node_page_state(NR_WRITEBACK),
+global_node_page_state(NR_UNSTABLE_NFS),
 global_page_state(NR_FREE_PAGES),
 (global_page_state(NR_SLAB_RECLAIMABLE) +
 global_page_state(NR_SLAB_UNRECLAIMABLE)),
-global_page_state(NR_FILE_MAPPED),
+global_node_page_state(NR_FILE_MAPPED),
 global_page_state(NR_PAGETABLE),
 global_page_state(NR_BOUNCE),
-global_page_state(NR_FILE_PAGES),
+global_node_page_state(NR_FILE_PAGES),
 get_nr_swap_pages());

 for_each_zone(zone) {
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 51c7db2c4ee2..29cd96661b30 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -56,6 +56,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 {
 int n;
 int nid = dev->id;
+struct pglist_data *pgdat = NODE_DATA(nid);
 struct sysinfo i;

 si_meminfo_node(&i, nid);
@@ -74,16 +75,16 @@ static ssize_t node_read_meminfo(struct device *dev,
 nid, K(i.totalram),
 nid, K(i.freeram),
 nid, K(i.totalram - i.freeram),
-nid, K(node_page_state(nid, NR_ACTIVE_ANON) +
-node_page_state(nid, NR_ACTIVE_FILE)),
-nid, K(node_page_state(nid, NR_INACTIVE_ANON) +
-node_page_state(nid, NR_INACTIVE_FILE)),
-nid, K(node_page_state(nid, NR_ACTIVE_ANON)),
-nid, K(node_page_state(nid, NR_INACTIVE_ANON)),
-nid, K(node_page_state(nid, NR_ACTIVE_FILE)),
-nid, K(node_page_state(nid, NR_INACTIVE_FILE)),
-nid, K(node_page_state(nid, NR_UNEVICTABLE)),
-nid, K(node_page_state(nid, NR_MLOCK)));
+nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) +
+node_page_state(pgdat, NR_ACTIVE_FILE)),
+nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) +
+node_page_state(pgdat, NR_INACTIVE_FILE)),
+nid, K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+nid, K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+nid, K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+nid, K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+nid, K(node_page_state(pgdat, NR_UNEVICTABLE)),
+nid, K(sum_zone_node_page_state(nid, NR_MLOCK)));

 #ifdef CONFIG_HIGHMEM
 n += sprintf(buf + n,
@@ -117,31 +118,30 @@ static ssize_t node_read_meminfo(struct device *dev,
 "Node %d ShmemPmdMapped: %8lu kB\n"
 #endif
 ,
-nid, K(node_page_state(nid, NR_FILE_DIRTY)),
-nid, K(node_page_state(nid, NR_WRITEBACK)),
-nid, K(node_page_state(nid, NR_FILE_PAGES)),
-nid, K(node_page_state(nid, NR_FILE_MAPPED)),
-nid, K(node_page_state(nid, NR_ANON_PAGES)),
+nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
+nid, K(node_page_state(pgdat, NR_WRITEBACK)),
+nid, K(node_page_state(pgdat, NR_FILE_PAGES)),
+nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
+nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
 nid, K(i.sharedram),
-nid, node_page_state(nid, NR_KERNEL_STACK) *
-THREAD_SIZE / 1024,
-nid, K(node_page_state(nid, NR_PAGETABLE)),
-nid, K(node_page_state(nid, NR_UNSTABLE_NFS)),
-nid, K(node_page_state(nid, NR_BOUNCE)),
-nid, K(node_page_state(nid, NR_WRITEBACK_TEMP)),
-nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) +
-node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
-nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)),
+nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB),
+nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
+nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
+nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
+nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE) +
+sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
+nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE)),
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
-nid, K(node_page_state(nid, NR_ANON_THPS) *
+nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
+nid, K(node_page_state(pgdat, NR_ANON_THPS) *
 HPAGE_PMD_NR),
-nid, K(node_page_state(nid, NR_SHMEM_THPS) *
+nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
 HPAGE_PMD_NR),
-nid, K(node_page_state(nid, NR_SHMEM_PMDMAPPED) *
+nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
 HPAGE_PMD_NR));
 #else
-nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)));
+nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)));
 #endif
 n += hugetlb_report_node_meminfo(nid, buf + n);
 return n;
@@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev,
 "interleave_hit %lu\n"
 "local_node %lu\n"
 "other_node %lu\n",
-node_page_state(dev->id, NUMA_HIT),
-node_page_state(dev->id, NUMA_MISS),
-node_page_state(dev->id, NUMA_FOREIGN),
-node_page_state(dev->id, NUMA_INTERLEAVE_HIT),
-node_page_state(dev->id, NUMA_LOCAL),
-node_page_state(dev->id, NUMA_OTHER));
+sum_zone_node_page_state(dev->id, NUMA_HIT),
+sum_zone_node_page_state(dev->id, NUMA_MISS),
+sum_zone_node_page_state(dev->id, NUMA_FOREIGN),
+sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT),
+sum_zone_node_page_state(dev->id, NUMA_LOCAL),
+sum_zone_node_page_state(dev->id, NUMA_OTHER));
 }
 static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);

@@ -173,12 +173,18 @@ static ssize_t node_read_vmstat(struct device *dev,
 struct device_attribute *attr, char *buf)
 {
 int nid = dev->id;
+struct pglist_data *pgdat = NODE_DATA(nid);
 int i;
 int n = 0;

 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
 n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
-node_page_state(nid, i));
+sum_zone_node_page_state(nid, i));
+
+for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+n += sprintf(buf+n, "%s %lu\n",
+vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+node_page_state(pgdat, i));

 return n;
 }
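
Illustrative sketch (not part of the patch): after this split, statistics that moved to the node level are read through the pgdat with node_page_state(), while counters that remain per-zone are summed across a node's zones with sum_zone_node_page_state(). A minimal kernel-style example of the two accessors, assuming the post-series stat placement of NR_FILE_DIRTY (node) and NR_MLOCK (zone):

#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/vmstat.h>

/* Sketch only: report a node's dirty file pages plus its mlocked pages. */
static unsigned long example_node_dirty_plus_mlocked(int nid)
{
	struct pglist_data *pgdat = NODE_DATA(nid);

	return node_page_state(pgdat, NR_FILE_DIRTY) +	/* per-node counter */
	       sum_zone_node_page_state(nid, NR_MLOCK);	/* still per-zone */
}
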
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index 24d2745e9437..45a1b4ec4ca3 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -72,10 +72,10 @@ static unsigned long lowmem_deathpending_timeout;
 static unsigned long lowmem_count(struct shrinker *s,
 struct shrink_control *sc)
 {
-return global_page_state(NR_ACTIVE_ANON) +
-global_page_state(NR_ACTIVE_FILE) +
-global_page_state(NR_INACTIVE_ANON) +
-global_page_state(NR_INACTIVE_FILE);
+return global_node_page_state(NR_ACTIVE_ANON) +
+global_node_page_state(NR_ACTIVE_FILE) +
+global_node_page_state(NR_INACTIVE_ANON) +
+global_node_page_state(NR_INACTIVE_FILE);
 }

 static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
@@ -91,8 +91,8 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
 short selected_oom_score_adj;
 int array_size = ARRAY_SIZE(lowmem_adj);
 int other_free = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
-int other_file = global_page_state(NR_FILE_PAGES) -
-global_page_state(NR_SHMEM) -
+int other_file = global_node_page_state(NR_FILE_PAGES) -
+global_node_page_state(NR_SHMEM) -
 total_swapcache_pages();

 if (lowmem_adj_size < array_size)
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c
index d1a7d6beee60..d011135802d5 100644
--- a/drivers/staging/lustre/lustre/osc/osc_cache.c
+++ b/drivers/staging/lustre/lustre/osc/osc_cache.c
@@ -1864,7 +1864,8 @@ void osc_dec_unstable_pages(struct ptlrpc_request *req)
 LASSERT(page_count >= 0);

 for (i = 0; i < page_count; i++)
-dec_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS);
+dec_node_page_state(desc->bd_iov[i].kiov_page,
+NR_UNSTABLE_NFS);

 atomic_sub(page_count, &cli->cl_cache->ccc_unstable_nr);
 LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0);
@@ -1898,7 +1899,8 @@ void osc_inc_unstable_pages(struct ptlrpc_request *req)
 LASSERT(page_count >= 0);

 for (i = 0; i < page_count; i++)
-inc_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS);
+inc_node_page_state(desc->bd_iov[i].kiov_page,
+NR_UNSTABLE_NFS);

 LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0);
 atomic_add(page_count, &cli->cl_cache->ccc_unstable_nr);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6f9c9f6f5157..56c8fda436c0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1807,8 +1807,8 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
 */
 static unsigned long get_nr_dirty_pages(void)
 {
-return global_page_state(NR_FILE_DIRTY) +
-global_page_state(NR_UNSTABLE_NFS) +
+return global_node_page_state(NR_FILE_DIRTY) +
+global_node_page_state(NR_UNSTABLE_NFS) +
 get_nr_dirty_inodes();
 }

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9154f8679024..2382f22a2a8b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1452,7 +1452,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 list_del(&req->writepages_entry);
 for (i = 0; i < req->num_pages; i++) {
 dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
+dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP);
 wb_writeout_inc(&bdi->wb);
 }
 wake_up(&fi->page_waitq);
@@ -1642,7 +1642,7 @@ static int fuse_writepage_locked(struct page *page)
 req->inode = inode;

 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
-inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);

 spin_lock(&fc->lock);
 list_add(&req->writepages_entry, &fi->writepages);
@@ -1756,7 +1756,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
 spin_unlock(&fc->lock);

 dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-dec_zone_page_state(page, NR_WRITEBACK_TEMP);
+dec_node_page_state(page, NR_WRITEBACK_TEMP);
 wb_writeout_inc(&bdi->wb);
 fuse_writepage_free(fc, new_req);
 fuse_request_free(new_req);
@@ -1855,7 +1855,7 @@ static int fuse_writepages_fill(struct page *page,
 req->page_descs[req->num_pages].length = PAGE_SIZE;

 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
-inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);

 err = 0;
 if (is_writeback && fuse_writepage_in_flight(req, page)) {
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5154fa65a2f2..5ea04d87fc65 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -623,7 +623,7 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
 if (!cinfo->dreq) {
 struct inode *inode = page_file_mapping(page)->host;

-inc_zone_page_state(page, NR_UNSTABLE_NFS);
+inc_node_page_state(page, NR_UNSTABLE_NFS);
 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e1c74d3db64d..593fa21a02c0 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -898,7 +898,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
 static void
 nfs_clear_page_commit(struct page *page)
 {
-dec_zone_page_state(page, NR_UNSTABLE_NFS);
+dec_node_page_state(page, NR_UNSTABLE_NFS);
 dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
 WB_RECLAIMABLE);
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a11eb7196ec8..31370da2ee7c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1024,23 +1024,107 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
 char buffer[PROC_NUMBUF];
 int oom_adj = OOM_ADJUST_MIN;
 size_t len;
-unsigned long flags;

 if (!task)
 return -ESRCH;
-if (lock_task_sighand(task, &flags)) {
-if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
-oom_adj = OOM_ADJUST_MAX;
-else
-oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
-OOM_SCORE_ADJ_MAX;
-unlock_task_sighand(task, &flags);
-}
+if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
+oom_adj = OOM_ADJUST_MAX;
+else
+oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
+OOM_SCORE_ADJ_MAX;
 put_task_struct(task);
 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
 return simple_read_from_buffer(buf, count, ppos, buffer, len);
 }

+static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
+{
+static DEFINE_MUTEX(oom_adj_mutex);
+struct mm_struct *mm = NULL;
+struct task_struct *task;
+int err = 0;
+
+task = get_proc_task(file_inode(file));
+if (!task)
+return -ESRCH;
+
+mutex_lock(&oom_adj_mutex);
+if (legacy) {
+if (oom_adj < task->signal->oom_score_adj &&
+!capable(CAP_SYS_RESOURCE)) {
+err = -EACCES;
+goto err_unlock;
+}
+/*
+* /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+* /proc/pid/oom_score_adj instead.
+*/
+pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+current->comm, task_pid_nr(current), task_pid_nr(task),
+task_pid_nr(task));
+} else {
+if ((short)oom_adj < task->signal->oom_score_adj_min &&
+!capable(CAP_SYS_RESOURCE)) {
+err = -EACCES;
+goto err_unlock;
+}
+}
+
+/*
+* Make sure we will check other processes sharing the mm if this is
+* not vfrok which wants its own oom_score_adj.
+* pin the mm so it doesn't go away and get reused after task_unlock
+*/
+if (!task->vfork_done) {
+struct task_struct *p = find_lock_task_mm(task);
+
+if (p) {
+if (atomic_read(&p->mm->mm_users) > 1) {
+mm = p->mm;
+atomic_inc(&mm->mm_count);
+}
+task_unlock(p);
+}
+}
+
+task->signal->oom_score_adj = oom_adj;
+if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+task->signal->oom_score_adj_min = (short)oom_adj;
+trace_oom_score_adj_update(task);
+
+if (mm) {
+struct task_struct *p;
+
+rcu_read_lock();
+for_each_process(p) {
+if (same_thread_group(task, p))
+continue;
+
+/* do not touch kernel threads or the global init */
+if (p->flags & PF_KTHREAD || is_global_init(p))
+continue;
+
+task_lock(p);
+if (!p->vfork_done && process_shares_mm(p, mm)) {
+pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
+task_pid_nr(p), p->comm,
+p->signal->oom_score_adj, oom_adj,
+task_pid_nr(task), task->comm);
+p->signal->oom_score_adj = oom_adj;
+if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+p->signal->oom_score_adj_min = (short)oom_adj;
+}
+task_unlock(p);
+}
+rcu_read_unlock();
+mmdrop(mm);
+}
+err_unlock:
+mutex_unlock(&oom_adj_mutex);
+put_task_struct(task);
+return err;
+}
+
 /*
 * /proc/pid/oom_adj exists solely for backwards compatibility with previous
 * kernels. The effective policy is defined by oom_score_adj, which has a
@@ -1054,10 +1138,8 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
 static ssize_t oom_adj_write(struct file *file, const char __user *buf,
 size_t count, loff_t *ppos)
 {
-struct task_struct *task;
 char buffer[PROC_NUMBUF];
 int oom_adj;
-unsigned long flags;
 int err;

 memset(buffer, 0, sizeof(buffer));
@@ -1077,23 +1159,6 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
 goto out;
 }

-task = get_proc_task(file_inode(file));
-if (!task) {
-err = -ESRCH;
-goto out;
-}
-
-task_lock(task);
-if (!task->mm) {
-err = -EINVAL;
-goto err_task_lock;
-}
-
-if (!lock_task_sighand(task, &flags)) {
-err = -ESRCH;
-goto err_task_lock;
-}
-
 /*
 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
 * value is always attainable.
@@ -1103,27 +1168,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
 else
 oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;

-if (oom_adj < task->signal->oom_score_adj &&
-!capable(CAP_SYS_RESOURCE)) {
-err = -EACCES;
-goto err_sighand;
-}
-
-/*
-* /proc/pid/oom_adj is provided for legacy purposes, ask users to use
-* /proc/pid/oom_score_adj instead.
-*/
-pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
-current->comm, task_pid_nr(current), task_pid_nr(task),
-task_pid_nr(task));
-
-task->signal->oom_score_adj = oom_adj;
-trace_oom_score_adj_update(task);
-err_sighand:
-unlock_task_sighand(task, &flags);
-err_task_lock:
-task_unlock(task);
-put_task_struct(task);
+err = __set_oom_adj(file, oom_adj, true);
 out:
 return err < 0 ? err : count;
 }
@@ -1140,15 +1185,11 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 struct task_struct *task = get_proc_task(file_inode(file));
 char buffer[PROC_NUMBUF];
 short oom_score_adj = OOM_SCORE_ADJ_MIN;
-unsigned long flags;
 size_t len;

 if (!task)
 return -ESRCH;
-if (lock_task_sighand(task, &flags)) {
-oom_score_adj = task->signal->oom_score_adj;
-unlock_task_sighand(task, &flags);
-}
+oom_score_adj = task->signal->oom_score_adj;
 put_task_struct(task);
 len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
 return simple_read_from_buffer(buf, count, ppos, buffer, len);
@@ -1157,9 +1198,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 size_t count, loff_t *ppos)
 {
-struct task_struct *task;
 char buffer[PROC_NUMBUF];
-unsigned long flags;
 int oom_score_adj;
 int err;

@@ -1180,39 +1219,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 goto out;
 }

-task = get_proc_task(file_inode(file));
-if (!task) {
-err = -ESRCH;
-goto out;
-}
-
-task_lock(task);
-if (!task->mm) {
-err = -EINVAL;
-goto err_task_lock;
-}
-
-if (!lock_task_sighand(task, &flags)) {
-err = -ESRCH;
-goto err_task_lock;
-}
-
-if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
-!capable(CAP_SYS_RESOURCE)) {
-err = -EACCES;
-goto err_sighand;
-}
-
-task->signal->oom_score_adj = (short)oom_score_adj;
-if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
-task->signal->oom_score_adj_min = (short)oom_score_adj;
-trace_oom_score_adj_update(task);
-
-err_sighand:
-unlock_task_sighand(task, &flags);
-err_task_lock:
-task_unlock(task);
-put_task_struct(task);
+err = __set_oom_adj(file, oom_score_adj, false);
 out:
 return err < 0 ? err : count;
 }
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index cf301a9ef512..09e18fdf61e5 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 si_swapinfo(&i);
 committed = percpu_counter_read_positive(&vm_committed_as);

-cached = global_page_state(NR_FILE_PAGES) -
+cached = global_node_page_state(NR_FILE_PAGES) -
 total_swapcache_pages() - i.bufferram;
 if (cached < 0)
 cached = 0;
@@ -138,23 +138,23 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #endif
 K(i.totalswap),
 K(i.freeswap),
-K(global_page_state(NR_FILE_DIRTY)),
-K(global_page_state(NR_WRITEBACK)),
-K(global_page_state(NR_ANON_PAGES)),
-K(global_page_state(NR_FILE_MAPPED)),
+K(global_node_page_state(NR_FILE_DIRTY)),
+K(global_node_page_state(NR_WRITEBACK)),
+K(global_node_page_state(NR_ANON_MAPPED)),
+K(global_node_page_state(NR_FILE_MAPPED)),
 K(i.sharedram),
 K(global_page_state(NR_SLAB_RECLAIMABLE) +
 global_page_state(NR_SLAB_UNRECLAIMABLE)),
 K(global_page_state(NR_SLAB_RECLAIMABLE)),
 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
-global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
+global_page_state(NR_KERNEL_STACK_KB),
 K(global_page_state(NR_PAGETABLE)),
 #ifdef CONFIG_QUICKLIST
 K(quicklist_total_size()),
 #endif
-K(global_page_state(NR_UNSTABLE_NFS)),
+K(global_node_page_state(NR_UNSTABLE_NFS)),
 K(global_page_state(NR_BOUNCE)),
-K(global_page_state(NR_WRITEBACK_TEMP)),
+K(global_node_page_state(NR_WRITEBACK_TEMP)),
 K(vm_commit_limit()),
 K(committed),
 (unsigned long)VMALLOC_TOTAL >> 10,
@@ -164,9 +164,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-, K(global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR)
-, K(global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR)
-, K(global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)
+, K(global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR)
+, K(global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR)
+, K(global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)
 #endif
 #ifdef CONFIG_CMA
 , K(totalcma_pages)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c82794f20110..491a91717788 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -197,7 +197,7 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 }

 long congestion_wait(int sync, long timeout);
-long wait_iff_congested(struct zone *zone, int sync, long timeout);
+long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout);
 int pdflush_proc_obsolete(struct ctl_table *table, int write,
 void __user *buffer, size_t *lenp, loff_t *ppos);

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 1a02dab16646..d4e106b5dc27 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -1,6 +1,18 @@
 #ifndef _LINUX_COMPACTION_H
 #define _LINUX_COMPACTION_H

+/*
+* Determines how hard direct compaction should try to succeed.
+* Lower value means higher priority, analogically to reclaim priority.
+*/
+enum compact_priority {
+COMPACT_PRIO_SYNC_LIGHT,
+MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
+DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
+COMPACT_PRIO_ASYNC,
+INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC
+};
+
 /* Return values for compact_zone() and try_to_compact_pages() */
 /* When adding new states, please adjust include/trace/events/compaction.h */
 enum compact_result {
@@ -43,14 +55,6 @@ enum compact_result {
 COMPACT_PARTIAL,
 };

-/* Used to signal whether compaction detected need_sched() or lock contention */
-/* No contention detected */
-#define COMPACT_CONTENDED_NONE 0
-/* Either need_sched() was true or fatal signal pending */
-#define COMPACT_CONTENDED_SCHED 1
-/* Zone lock or lru_lock was contended in async compaction */
-#define COMPACT_CONTENDED_LOCK 2
-
 struct alloc_context; /* in mm/internal.h */

 #ifdef CONFIG_COMPACTION
@@ -64,9 +68,8 @@ extern int sysctl_compact_unevictable_allowed;

 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
-unsigned int order,
-unsigned int alloc_flags, const struct alloc_context *ac,
-enum migrate_mode mode, int *contended);
+unsigned int order, unsigned int alloc_flags,
+const struct alloc_context *ac, enum compact_priority prio);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern enum compact_result compaction_suitable(struct zone *zone, int order,
@@ -151,14 +154,6 @@ extern void kcompactd_stop(int nid);
 extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);

 #else
-static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask,
-unsigned int order, int alloc_flags,
-const struct alloc_context *ac,
-enum migrate_mode mode, int *contended)
-{
-return COMPACT_CONTINUE;
-}
-
 static inline void compact_pgdat(pg_data_t *pgdat, int order)
 {
 }
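
Illustrative sketch (not part of the patch): the new enum replaces the migrate_mode/contended pair with a single priority that callers can step toward harder effort; lower values mean trying harder, per the comment in the hunk above. The callback below is hypothetical and only stands in for one try_to_compact_pages() round:

#include <linux/compaction.h>

/* Sketch only: escalate from the initial (async) priority to the minimum
 * (hardest) priority until one attempt succeeds. try_one_round() is a
 * hypothetical stand-in for a single direct-compaction attempt. */
static bool example_compact_escalate(bool (*try_one_round)(enum compact_priority))
{
	enum compact_priority prio = INIT_COMPACT_PRIORITY;

	for (;;) {
		if (try_one_round(prio))
			return true;
		if (prio == MIN_COMPACT_PRIORITY)
			return false;
		prio--;	/* lower value == higher priority */
	}
}
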
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c29e9d347bc6..f8041f9de31e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -237,9 +237,11 @@ struct vm_area_struct;
 * are expected to be movable via page reclaim or page migration. Typically,
 * pages on the LRU would also be allocated with GFP_HIGHUSER_MOVABLE.
 *
-* GFP_TRANSHUGE is used for THP allocations. They are compound allocations
-* that will fail quickly if memory is not available and will not wake
-* kswapd on failure.
+* GFP_TRANSHUGE and GFP_TRANSHUGE_LIGHT are used for THP allocations. They are
+* compound allocations that will generally fail quickly if memory is not
+* available and will not wake kswapd/kcompactd on failure. The _LIGHT
+* version does not attempt reclaim/compaction at all and is by default used
+* in page fault path, while the non-light is used by khugepaged.
 */
 #define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
 #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
@@ -254,9 +256,9 @@ struct vm_area_struct;
 #define GFP_DMA32 __GFP_DMA32
 #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM)
 #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE)
-#define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
-__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \
-~__GFP_RECLAIM)
+#define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
+__GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
+#define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)

 /* Convert GFP flags to their corresponding migrate type */
 #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
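
Illustrative sketch (not part of the patch): with the new definitions, GFP_TRANSHUGE and GFP_TRANSHUGE_LIGHT differ only by __GFP_DIRECT_RECLAIM, so a caller picks how aggressive a THP-sized allocation should be. A minimal, hypothetical example of choosing between the two masks:

#include <linux/gfp.h>
#include <linux/huge_mm.h>

/* Sketch only: the fault path would prefer the _LIGHT mask (no reclaim or
 * compaction), while a background collapser can afford GFP_TRANSHUGE. */
static struct page *example_alloc_thp(bool lightweight)
{
	gfp_t gfp = lightweight ? GFP_TRANSHUGE_LIGHT : GFP_TRANSHUGE;

	return alloc_pages(gfp, HPAGE_PMD_ORDER);
}
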
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 92ce91c03cd0..6f14de45b5ce 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -11,7 +11,7 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 unsigned long addr,
 pmd_t *pmd,
 unsigned int flags);
-extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
+extern bool madvise_free_huge_pmd(struct mmu_gather *tlb,
 struct vm_area_struct *vma,
 pmd_t *pmd, unsigned long addr, unsigned long next);
 extern int zap_huge_pmd(struct mmu_gather *tlb,
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index ac4b3c46a84d..c9cf374445d8 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -77,6 +77,7 @@ void kasan_free_shadow(const struct vm_struct *vm);

 size_t ksize(const void *);
 static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); }
+size_t kasan_metadata_size(struct kmem_cache *cache);

 #else /* CONFIG_KASAN */

@@ -121,6 +122,7 @@ static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
 static inline void kasan_free_shadow(const struct vm_struct *vm) {}

 static inline void kasan_unpoison_slab(const void *ptr) { }
+static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }

 #endif /* CONFIG_KASAN */

diff --git a/include/linux/kdb.h b/include/linux/kdb.h
index a19bcf9e762e..410decacff8f 100644
--- a/include/linux/kdb.h
+++ b/include/linux/kdb.h
@@ -177,7 +177,7 @@ extern int kdb_get_kbd_char(void);
 static inline
 int kdb_process_cpu(const struct task_struct *p)
 {
-unsigned int cpu = task_thread_info(p)->cpu;
+unsigned int cpu = task_cpu(p);
 if (cpu > num_possible_cpus())
 cpu = 0;
 return cpu;
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 6c14b6179727..2925da23505d 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -332,6 +332,7 @@ phys_addr_t memblock_mem_size(unsigned long limit_pfn);
 phys_addr_t memblock_start_of_DRAM(void);
 phys_addr_t memblock_end_of_DRAM(void);
 void memblock_enforce_memory_limit(phys_addr_t memory_limit);
+void memblock_mem_limit_remove_map(phys_addr_t limit);
 bool memblock_is_memory(phys_addr_t addr);
 int memblock_is_map_memory(phys_addr_t addr);
 int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 71aff733a497..5d8ca6e02e39 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -52,7 +52,7 @@ enum mem_cgroup_stat_index {
 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
 MEM_CGROUP_STAT_NSTATS,
 /* default hierarchy stats */
-MEMCG_KERNEL_STACK = MEM_CGROUP_STAT_NSTATS,
+MEMCG_KERNEL_STACK_KB = MEM_CGROUP_STAT_NSTATS,
 MEMCG_SLAB_RECLAIMABLE,
 MEMCG_SLAB_UNRECLAIMABLE,
 MEMCG_SOCK,
@@ -60,7 +60,7 @@ enum mem_cgroup_stat_index {
 };

 struct mem_cgroup_reclaim_cookie {
-struct zone *zone;
+pg_data_t *pgdat;
 int priority;
 unsigned int generation;
 };
@@ -118,7 +118,7 @@ struct mem_cgroup_reclaim_iter {
 /*
 * per-zone information in memory controller.
 */
-struct mem_cgroup_per_zone {
+struct mem_cgroup_per_node {
 struct lruvec lruvec;
 unsigned long lru_size[NR_LRU_LISTS];

@@ -132,10 +132,6 @@ struct mem_cgroup_per_zone {
 /* use container_of */
 };

-struct mem_cgroup_per_node {
-struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
-
 struct mem_cgroup_threshold {
 struct eventfd_ctx *eventfd;
 unsigned long threshold;
@@ -314,8 +310,46 @@ void mem_cgroup_uncharge_list(struct list_head *page_list);

 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);

-struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
-struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
+static struct mem_cgroup_per_node *
+mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
+{
+return memcg->nodeinfo[nid];
+}
+
+/**
+* mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone
+* @node: node of the wanted lruvec
+* @memcg: memcg of the wanted lruvec
+*
+* Returns the lru list vector holding pages for a given @node or a given
+* @memcg and @zone. This can be the node lruvec, if the memory controller
+* is disabled.
+*/
+static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
+struct mem_cgroup *memcg)
+{
+struct mem_cgroup_per_node *mz;
+struct lruvec *lruvec;
+
+if (mem_cgroup_disabled()) {
+lruvec = node_lruvec(pgdat);
+goto out;
+}
+
+mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+lruvec = &mz->lruvec;
+out:
+/*
+* Since a node can be onlined after the mem_cgroup was created,
+* we have to be prepared to initialize lruvec->pgdat here;
+* and if offlined then reonlined, we need to reinitialize it.
+*/
+if (unlikely(lruvec->pgdat != pgdat))
+lruvec->pgdat = pgdat;
+return lruvec;
+}
+
+struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);

 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
@@ -404,9 +438,9 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 static inline
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
-struct mem_cgroup_per_zone *mz;
+struct mem_cgroup_per_node *mz;

-mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 return mz->lru_size[lru];
 }

@@ -477,7 +511,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 mem_cgroup_update_page_stat(page, idx, -1);
 }

-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 gfp_t gfp_mask,
 unsigned long *total_scanned);

@@ -568,16 +602,16 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new)
 {
 }

-static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
+static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
 struct mem_cgroup *memcg)
 {
-return &zone->lruvec;
+return node_lruvec(pgdat);
 }

 static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
-struct zone *zone)
+struct pglist_data *pgdat)
 {
-return &zone->lruvec;
+return &pgdat->lruvec;
 }

 static inline bool mm_match_cgroup(struct mm_struct *mm,
@@ -681,7 +715,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 }

 static inline
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 gfp_t gfp_mask,
 unsigned long *total_scanned)
 {
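
Illustrative sketch (not part of the patch): lruvec lookup is now keyed by node rather than zone. Combined with page_pgdat(), which this same series adds to include/linux/mm.h, a caller resolves a page's lruvec roughly like this:

#include <linux/memcontrol.h>
#include <linux/mm.h>

/* Sketch only: resolve the LRU vector that holds @page for @memcg. */
static struct lruvec *example_lruvec_of(struct page *page,
					struct mem_cgroup *memcg)
{
	return mem_cgroup_lruvec(page_pgdat(page), memcg);
}
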
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index bcaa634139a9..93416196ba64 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -26,7 +26,7 @@ struct vmem_altmap {
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
 void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);

-#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE)
+#ifdef CONFIG_ZONE_DEVICE
 struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start);
 #else
 static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
diff --git a/include/linux/mm.h b/include/linux/mm.h index 192c1bbe5fcd..08ed53eeedd5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -933,6 +933,11 @@ static inline struct zone *page_zone(const struct page *page) | |||
933 | return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; | 933 | return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; |
934 | } | 934 | } |
935 | 935 | ||
936 | static inline pg_data_t *page_pgdat(const struct page *page) | ||
937 | { | ||
938 | return NODE_DATA(page_to_nid(page)); | ||
939 | } | ||
940 | |||
936 | #ifdef SECTION_IN_PAGE_FLAGS | 941 | #ifdef SECTION_IN_PAGE_FLAGS |
937 | static inline void set_page_section(struct page *page, unsigned long section) | 942 | static inline void set_page_section(struct page *page, unsigned long section) |
938 | { | 943 | { |
@@ -973,11 +978,21 @@ static inline struct mem_cgroup *page_memcg(struct page *page) | |||
973 | { | 978 | { |
974 | return page->mem_cgroup; | 979 | return page->mem_cgroup; |
975 | } | 980 | } |
981 | static inline struct mem_cgroup *page_memcg_rcu(struct page *page) | ||
982 | { | ||
983 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
984 | return READ_ONCE(page->mem_cgroup); | ||
985 | } | ||
976 | #else | 986 | #else |
977 | static inline struct mem_cgroup *page_memcg(struct page *page) | 987 | static inline struct mem_cgroup *page_memcg(struct page *page) |
978 | { | 988 | { |
979 | return NULL; | 989 | return NULL; |
980 | } | 990 | } |
991 | static inline struct mem_cgroup *page_memcg_rcu(struct page *page) | ||
992 | { | ||
993 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
994 | return NULL; | ||
995 | } | ||
981 | #endif | 996 | #endif |
982 | 997 | ||
983 | /* | 998 | /* |
@@ -2284,6 +2299,8 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) | |||
2284 | } | 2299 | } |
2285 | #endif /* __HAVE_ARCH_GATE_AREA */ | 2300 | #endif /* __HAVE_ARCH_GATE_AREA */ |
2286 | 2301 | ||
2302 | extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); | ||
2303 | |||
2287 | #ifdef CONFIG_SYSCTL | 2304 | #ifdef CONFIG_SYSCTL |
2288 | extern int sysctl_drop_caches; | 2305 | extern int sysctl_drop_caches; |
2289 | int drop_caches_sysctl_handler(struct ctl_table *, int, | 2306 | int drop_caches_sysctl_handler(struct ctl_table *, int, |
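The page_memcg_rcu() accessor added above differs from page_memcg() only in that it insists on an RCU read-side critical section and reads the pointer with READ_ONCE(). A minimal usage sketch under that assumption; the caller name is made up for illustration.

    #include <linux/mm.h>
    #include <linux/rcupdate.h>

    /* Hypothetical caller: peek at the owning memcg without pinning it. */
    static bool page_has_memcg_sketch(struct page *page)
    {
            struct mem_cgroup *memcg;

            rcu_read_lock();
            memcg = page_memcg_rcu(page);   /* WARNs if RCU is not held */
            rcu_read_unlock();

            return memcg != NULL;
    }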
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 5bd29ba4f174..71613e8a720f 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h | |||
@@ -23,25 +23,30 @@ static inline int page_is_file_cache(struct page *page) | |||
23 | } | 23 | } |
24 | 24 | ||
25 | static __always_inline void __update_lru_size(struct lruvec *lruvec, | 25 | static __always_inline void __update_lru_size(struct lruvec *lruvec, |
26 | enum lru_list lru, int nr_pages) | 26 | enum lru_list lru, enum zone_type zid, |
27 | int nr_pages) | ||
27 | { | 28 | { |
28 | __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages); | 29 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); |
30 | |||
31 | __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages); | ||
32 | __mod_zone_page_state(&pgdat->node_zones[zid], | ||
33 | NR_ZONE_LRU_BASE + lru, nr_pages); | ||
29 | } | 34 | } |
30 | 35 | ||
31 | static __always_inline void update_lru_size(struct lruvec *lruvec, | 36 | static __always_inline void update_lru_size(struct lruvec *lruvec, |
32 | enum lru_list lru, int nr_pages) | 37 | enum lru_list lru, enum zone_type zid, |
38 | int nr_pages) | ||
33 | { | 39 | { |
40 | __update_lru_size(lruvec, lru, zid, nr_pages); | ||
34 | #ifdef CONFIG_MEMCG | 41 | #ifdef CONFIG_MEMCG |
35 | mem_cgroup_update_lru_size(lruvec, lru, nr_pages); | 42 | mem_cgroup_update_lru_size(lruvec, lru, nr_pages); |
36 | #else | ||
37 | __update_lru_size(lruvec, lru, nr_pages); | ||
38 | #endif | 43 | #endif |
39 | } | 44 | } |
40 | 45 | ||
41 | static __always_inline void add_page_to_lru_list(struct page *page, | 46 | static __always_inline void add_page_to_lru_list(struct page *page, |
42 | struct lruvec *lruvec, enum lru_list lru) | 47 | struct lruvec *lruvec, enum lru_list lru) |
43 | { | 48 | { |
44 | update_lru_size(lruvec, lru, hpage_nr_pages(page)); | 49 | update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page)); |
45 | list_add(&page->lru, &lruvec->lists[lru]); | 50 | list_add(&page->lru, &lruvec->lists[lru]); |
46 | } | 51 | } |
47 | 52 | ||
@@ -49,7 +54,7 @@ static __always_inline void del_page_from_lru_list(struct page *page, | |||
49 | struct lruvec *lruvec, enum lru_list lru) | 54 | struct lruvec *lruvec, enum lru_list lru) |
50 | { | 55 | { |
51 | list_del(&page->lru); | 56 | list_del(&page->lru); |
52 | update_lru_size(lruvec, lru, -hpage_nr_pages(page)); | 57 | update_lru_size(lruvec, lru, page_zonenum(page), -hpage_nr_pages(page)); |
53 | } | 58 | } |
54 | 59 | ||
55 | /** | 60 | /** |
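After this change every LRU addition or removal is charged twice: once to the node-wide counter that reclaim now works against, and once to the per-zone NR_ZONE_* shadow kept for compaction and allocator retry decisions. The following is a conceptual expansion of add_page_to_lru_list() under that reading, composed from the helpers shown above; it is a sketch, not a literal copy of the kernel code.

    #include <linux/mm.h>
    #include <linux/mm_inline.h>
    #include <linux/memcontrol.h>
    #include <linux/vmstat.h>

    static void lru_add_sketch(struct page *page, struct lruvec *lruvec,
                               enum lru_list lru)
    {
            struct pglist_data *pgdat = lruvec_pgdat(lruvec);
            int nr = hpage_nr_pages(page);  /* 1, or HPAGE_PMD_NR for THP */

            /* Node-wide count used by reclaim. */
            __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr);
            /* Per-zone shadow used by compaction and allocation retries. */
            __mod_zone_page_state(&pgdat->node_zones[page_zonenum(page)],
                                  NR_ZONE_LRU_BASE + lru, nr);
    #ifdef CONFIG_MEMCG
            mem_cgroup_update_lru_size(lruvec, lru, nr);
    #endif
            list_add(&page->lru, &lruvec->lists[lru]);
    }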
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 79472b22d23f..903200f4ec41 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -118,7 +118,7 @@ struct page { | |||
118 | */ | 118 | */ |
119 | union { | 119 | union { |
120 | struct list_head lru; /* Pageout list, eg. active_list | 120 | struct list_head lru; /* Pageout list, eg. active_list |
121 | * protected by zone->lru_lock ! | 121 | * protected by zone_lru_lock ! |
122 | * Can be used as a generic list | 122 | * Can be used as a generic list |
123 | * by the page owner. | 123 | * by the page owner. |
124 | */ | 124 | */ |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 19425e988bdc..f2e4e90621ec 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -93,7 +93,7 @@ struct free_area { | |||
93 | struct pglist_data; | 93 | struct pglist_data; |
94 | 94 | ||
95 | /* | 95 | /* |
96 | * zone->lock and zone->lru_lock are two of the hottest locks in the kernel. | 96 | * zone->lock and the zone lru_lock are two of the hottest locks in the kernel. |
97 | * So add a wild amount of padding here to ensure that they fall into separate | 97 | * So add a wild amount of padding here to ensure that they fall into separate |
98 | * cachelines. There are very few zone structures in the machine, so space | 98 | * cachelines. There are very few zone structures in the machine, so space |
99 | * consumption is not a concern here. | 99 | * consumption is not a concern here. |
@@ -110,36 +110,20 @@ struct zone_padding { | |||
110 | enum zone_stat_item { | 110 | enum zone_stat_item { |
111 | /* First 128 byte cacheline (assuming 64 bit words) */ | 111 | /* First 128 byte cacheline (assuming 64 bit words) */ |
112 | NR_FREE_PAGES, | 112 | NR_FREE_PAGES, |
113 | NR_ALLOC_BATCH, | 113 | NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ |
114 | NR_LRU_BASE, | 114 | NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, |
115 | NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ | 115 | NR_ZONE_ACTIVE_ANON, |
116 | NR_ACTIVE_ANON, /* " " " " " */ | 116 | NR_ZONE_INACTIVE_FILE, |
117 | NR_INACTIVE_FILE, /* " " " " " */ | 117 | NR_ZONE_ACTIVE_FILE, |
118 | NR_ACTIVE_FILE, /* " " " " " */ | 118 | NR_ZONE_UNEVICTABLE, |
119 | NR_UNEVICTABLE, /* " " " " " */ | 119 | NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ |
120 | NR_MLOCK, /* mlock()ed pages found and moved off LRU */ | 120 | NR_MLOCK, /* mlock()ed pages found and moved off LRU */ |
121 | NR_ANON_PAGES, /* Mapped anonymous pages */ | ||
122 | NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. | ||
123 | only modified from process context */ | ||
124 | NR_FILE_PAGES, | ||
125 | NR_FILE_DIRTY, | ||
126 | NR_WRITEBACK, | ||
127 | NR_SLAB_RECLAIMABLE, | 121 | NR_SLAB_RECLAIMABLE, |
128 | NR_SLAB_UNRECLAIMABLE, | 122 | NR_SLAB_UNRECLAIMABLE, |
129 | NR_PAGETABLE, /* used for pagetables */ | 123 | NR_PAGETABLE, /* used for pagetables */ |
130 | NR_KERNEL_STACK, | 124 | NR_KERNEL_STACK_KB, /* measured in KiB */ |
131 | /* Second 128 byte cacheline */ | 125 | /* Second 128 byte cacheline */ |
132 | NR_UNSTABLE_NFS, /* NFS unstable pages */ | ||
133 | NR_BOUNCE, | 126 | NR_BOUNCE, |
134 | NR_VMSCAN_WRITE, | ||
135 | NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ | ||
136 | NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ | ||
137 | NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ | ||
138 | NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ | ||
139 | NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ | ||
140 | NR_DIRTIED, /* page dirtyings since bootup */ | ||
141 | NR_WRITTEN, /* page writings since bootup */ | ||
142 | NR_PAGES_SCANNED, /* pages scanned since last reclaim */ | ||
143 | #if IS_ENABLED(CONFIG_ZSMALLOC) | 127 | #if IS_ENABLED(CONFIG_ZSMALLOC) |
144 | NR_ZSPAGES, /* allocated in zsmalloc */ | 128 | NR_ZSPAGES, /* allocated in zsmalloc */ |
145 | #endif | 129 | #endif |
@@ -151,14 +135,40 @@ enum zone_stat_item { | |||
151 | NUMA_LOCAL, /* allocation from local node */ | 135 | NUMA_LOCAL, /* allocation from local node */ |
152 | NUMA_OTHER, /* allocation from other node */ | 136 | NUMA_OTHER, /* allocation from other node */ |
153 | #endif | 137 | #endif |
138 | NR_FREE_CMA_PAGES, | ||
139 | NR_VM_ZONE_STAT_ITEMS }; | ||
140 | |||
141 | enum node_stat_item { | ||
142 | NR_LRU_BASE, | ||
143 | NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ | ||
144 | NR_ACTIVE_ANON, /* " " " " " */ | ||
145 | NR_INACTIVE_FILE, /* " " " " " */ | ||
146 | NR_ACTIVE_FILE, /* " " " " " */ | ||
147 | NR_UNEVICTABLE, /* " " " " " */ | ||
148 | NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ | ||
149 | NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ | ||
150 | NR_PAGES_SCANNED, /* pages scanned since last reclaim */ | ||
154 | WORKINGSET_REFAULT, | 151 | WORKINGSET_REFAULT, |
155 | WORKINGSET_ACTIVATE, | 152 | WORKINGSET_ACTIVATE, |
156 | WORKINGSET_NODERECLAIM, | 153 | WORKINGSET_NODERECLAIM, |
157 | NR_ANON_THPS, | 154 | NR_ANON_MAPPED, /* Mapped anonymous pages */ |
155 | NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. | ||
156 | only modified from process context */ | ||
157 | NR_FILE_PAGES, | ||
158 | NR_FILE_DIRTY, | ||
159 | NR_WRITEBACK, | ||
160 | NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ | ||
161 | NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ | ||
158 | NR_SHMEM_THPS, | 162 | NR_SHMEM_THPS, |
159 | NR_SHMEM_PMDMAPPED, | 163 | NR_SHMEM_PMDMAPPED, |
160 | NR_FREE_CMA_PAGES, | 164 | NR_ANON_THPS, |
161 | NR_VM_ZONE_STAT_ITEMS }; | 165 | NR_UNSTABLE_NFS, /* NFS unstable pages */ |
166 | NR_VMSCAN_WRITE, | ||
167 | NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ | ||
168 | NR_DIRTIED, /* page dirtyings since bootup */ | ||
169 | NR_WRITTEN, /* page writings since bootup */ | ||
170 | NR_VM_NODE_STAT_ITEMS | ||
171 | }; | ||
162 | 172 | ||
163 | /* | 173 | /* |
164 | * We do arithmetic on the LRU lists in various places in the code, | 174 | * We do arithmetic on the LRU lists in various places in the code, |
@@ -215,7 +225,7 @@ struct lruvec { | |||
215 | /* Evictions & activations on the inactive file list */ | 225 | /* Evictions & activations on the inactive file list */ |
216 | atomic_long_t inactive_age; | 226 | atomic_long_t inactive_age; |
217 | #ifdef CONFIG_MEMCG | 227 | #ifdef CONFIG_MEMCG |
218 | struct zone *zone; | 228 | struct pglist_data *pgdat; |
219 | #endif | 229 | #endif |
220 | }; | 230 | }; |
221 | 231 | ||
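With the stat items split above and the lruvec now carrying a pgdat back-pointer, reclaim reads node-wide counts while the allocator and compaction keep only a coarse per-zone view. A hedged sketch of reading both levels, assuming the node accessors declared later in this series' vmstat.h changes; the reporting function itself is illustrative.

    #include <linux/vmstat.h>
    #include <linux/printk.h>

    /* Illustration only: compare the node-wide count with one zone's share. */
    static void report_inactive_file_sketch(pg_data_t *pgdat, struct zone *zone)
    {
            unsigned long node_count = node_page_state(pgdat, NR_INACTIVE_FILE);
            unsigned long zone_count = zone_page_state(zone, NR_ZONE_INACTIVE_FILE);

            pr_info("node=%d inactive_file=%lu (zone share %lu)\n",
                    pgdat->node_id, node_count, zone_count);
    }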
@@ -267,6 +277,11 @@ struct per_cpu_pageset { | |||
267 | #endif | 277 | #endif |
268 | }; | 278 | }; |
269 | 279 | ||
280 | struct per_cpu_nodestat { | ||
281 | s8 stat_threshold; | ||
282 | s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; | ||
283 | }; | ||
284 | |||
270 | #endif /* !__GENERATING_BOUNDS.H */ | 285 | #endif /* !__GENERATING_BOUNDS.H */ |
271 | 286 | ||
272 | enum zone_type { | 287 | enum zone_type { |
@@ -348,22 +363,9 @@ struct zone { | |||
348 | #ifdef CONFIG_NUMA | 363 | #ifdef CONFIG_NUMA |
349 | int node; | 364 | int node; |
350 | #endif | 365 | #endif |
351 | |||
352 | /* | ||
353 | * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on | ||
354 | * this zone's LRU. Maintained by the pageout code. | ||
355 | */ | ||
356 | unsigned int inactive_ratio; | ||
357 | |||
358 | struct pglist_data *zone_pgdat; | 366 | struct pglist_data *zone_pgdat; |
359 | struct per_cpu_pageset __percpu *pageset; | 367 | struct per_cpu_pageset __percpu *pageset; |
360 | 368 | ||
361 | /* | ||
362 | * This is a per-zone reserve of pages that are not available | ||
363 | * to userspace allocations. | ||
364 | */ | ||
365 | unsigned long totalreserve_pages; | ||
366 | |||
367 | #ifndef CONFIG_SPARSEMEM | 369 | #ifndef CONFIG_SPARSEMEM |
368 | /* | 370 | /* |
369 | * Flags for a pageblock_nr_pages block. See pageblock-flags.h. | 371 | * Flags for a pageblock_nr_pages block. See pageblock-flags.h. |
@@ -372,14 +374,6 @@ struct zone { | |||
372 | unsigned long *pageblock_flags; | 374 | unsigned long *pageblock_flags; |
373 | #endif /* CONFIG_SPARSEMEM */ | 375 | #endif /* CONFIG_SPARSEMEM */ |
374 | 376 | ||
375 | #ifdef CONFIG_NUMA | ||
376 | /* | ||
377 | * zone reclaim becomes active if more unmapped pages exist. | ||
378 | */ | ||
379 | unsigned long min_unmapped_pages; | ||
380 | unsigned long min_slab_pages; | ||
381 | #endif /* CONFIG_NUMA */ | ||
382 | |||
383 | /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ | 377 | /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ |
384 | unsigned long zone_start_pfn; | 378 | unsigned long zone_start_pfn; |
385 | 379 | ||
@@ -472,24 +466,21 @@ struct zone { | |||
472 | unsigned long wait_table_hash_nr_entries; | 466 | unsigned long wait_table_hash_nr_entries; |
473 | unsigned long wait_table_bits; | 467 | unsigned long wait_table_bits; |
474 | 468 | ||
469 | /* Write-intensive fields used from the page allocator */ | ||
475 | ZONE_PADDING(_pad1_) | 470 | ZONE_PADDING(_pad1_) |
471 | |||
476 | /* free areas of different sizes */ | 472 | /* free areas of different sizes */ |
477 | struct free_area free_area[MAX_ORDER]; | 473 | struct free_area free_area[MAX_ORDER]; |
478 | 474 | ||
479 | /* zone flags, see below */ | 475 | /* zone flags, see below */ |
480 | unsigned long flags; | 476 | unsigned long flags; |
481 | 477 | ||
482 | /* Write-intensive fields used from the page allocator */ | 478 | /* Primarily protects free_area */ |
483 | spinlock_t lock; | 479 | spinlock_t lock; |
484 | 480 | ||
481 | /* Write-intensive fields used by compaction and vmstats. */ | ||
485 | ZONE_PADDING(_pad2_) | 482 | ZONE_PADDING(_pad2_) |
486 | 483 | ||
487 | /* Write-intensive fields used by page reclaim */ | ||
488 | |||
489 | /* Fields commonly accessed by the page reclaim scanner */ | ||
490 | spinlock_t lru_lock; | ||
491 | struct lruvec lruvec; | ||
492 | |||
493 | /* | 484 | /* |
494 | * When free pages are below this point, additional steps are taken | 485 | * When free pages are below this point, additional steps are taken |
495 | * when reading the number of free pages to avoid per-cpu counter | 486 | * when reading the number of free pages to avoid per-cpu counter |
@@ -527,19 +518,18 @@ struct zone { | |||
527 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; | 518 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; |
528 | } ____cacheline_internodealigned_in_smp; | 519 | } ____cacheline_internodealigned_in_smp; |
529 | 520 | ||
530 | enum zone_flags { | 521 | enum pgdat_flags { |
531 | ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ | 522 | PGDAT_CONGESTED, /* pgdat has many dirty pages backed by |
532 | ZONE_CONGESTED, /* zone has many dirty pages backed by | ||
533 | * a congested BDI | 523 | * a congested BDI |
534 | */ | 524 | */ |
535 | ZONE_DIRTY, /* reclaim scanning has recently found | 525 | PGDAT_DIRTY, /* reclaim scanning has recently found |
536 | * many dirty file pages at the tail | 526 | * many dirty file pages at the tail |
537 | * of the LRU. | 527 | * of the LRU. |
538 | */ | 528 | */ |
539 | ZONE_WRITEBACK, /* reclaim scanning has recently found | 529 | PGDAT_WRITEBACK, /* reclaim scanning has recently found |
540 | * many pages under writeback | 530 | * many pages under writeback |
541 | */ | 531 | */ |
542 | ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ | 532 | PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ |
543 | }; | 533 | }; |
544 | 534 | ||
545 | static inline unsigned long zone_end_pfn(const struct zone *zone) | 535 | static inline unsigned long zone_end_pfn(const struct zone *zone) |
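The ZONE_* reclaim hints become PGDAT_* bits, so congestion, dirty-tail and writeback throttling decisions are now recorded once per node. A sketch of how such bits are typically manipulated, assuming the non-atomic flags word added to pg_data_t further down in this file; the balancing policy itself is only gestured at.

    #include <linux/mmzone.h>
    #include <linux/bitops.h>

    /* Illustration: a reclaim pass might record trouble on the node, and
     * kswapd could clear the hints once the node looks balanced again. */
    static void update_node_hints_sketch(pg_data_t *pgdat, bool balanced)
    {
            if (balanced) {
                    clear_bit(PGDAT_CONGESTED, &pgdat->flags);
                    clear_bit(PGDAT_DIRTY, &pgdat->flags);
                    clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
            } else {
                    set_bit(PGDAT_DIRTY, &pgdat->flags);
            }
    }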
@@ -663,8 +653,9 @@ typedef struct pglist_data { | |||
663 | wait_queue_head_t pfmemalloc_wait; | 653 | wait_queue_head_t pfmemalloc_wait; |
664 | struct task_struct *kswapd; /* Protected by | 654 | struct task_struct *kswapd; /* Protected by |
665 | mem_hotplug_begin/end() */ | 655 | mem_hotplug_begin/end() */ |
666 | int kswapd_max_order; | 656 | int kswapd_order; |
667 | enum zone_type classzone_idx; | 657 | enum zone_type kswapd_classzone_idx; |
658 | |||
668 | #ifdef CONFIG_COMPACTION | 659 | #ifdef CONFIG_COMPACTION |
669 | int kcompactd_max_order; | 660 | int kcompactd_max_order; |
670 | enum zone_type kcompactd_classzone_idx; | 661 | enum zone_type kcompactd_classzone_idx; |
@@ -681,6 +672,23 @@ typedef struct pglist_data { | |||
681 | /* Number of pages migrated during the rate limiting time interval */ | 672 | /* Number of pages migrated during the rate limiting time interval */ |
682 | unsigned long numabalancing_migrate_nr_pages; | 673 | unsigned long numabalancing_migrate_nr_pages; |
683 | #endif | 674 | #endif |
675 | /* | ||
676 | * This is a per-node reserve of pages that are not available | ||
677 | * to userspace allocations. | ||
678 | */ | ||
679 | unsigned long totalreserve_pages; | ||
680 | |||
681 | #ifdef CONFIG_NUMA | ||
682 | /* | ||
683 | * zone reclaim becomes active if more unmapped pages exist. | ||
684 | */ | ||
685 | unsigned long min_unmapped_pages; | ||
686 | unsigned long min_slab_pages; | ||
687 | #endif /* CONFIG_NUMA */ | ||
688 | |||
689 | /* Write-intensive fields used by page reclaim */ | ||
690 | ZONE_PADDING(_pad1_) | ||
691 | spinlock_t lru_lock; | ||
684 | 692 | ||
685 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT | 693 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
686 | /* | 694 | /* |
@@ -695,6 +703,23 @@ typedef struct pglist_data { | |||
695 | struct list_head split_queue; | 703 | struct list_head split_queue; |
696 | unsigned long split_queue_len; | 704 | unsigned long split_queue_len; |
697 | #endif | 705 | #endif |
706 | |||
707 | /* Fields commonly accessed by the page reclaim scanner */ | ||
708 | struct lruvec lruvec; | ||
709 | |||
710 | /* | ||
711 | * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on | ||
712 | * this node's LRU. Maintained by the pageout code. | ||
713 | */ | ||
714 | unsigned int inactive_ratio; | ||
715 | |||
716 | unsigned long flags; | ||
717 | |||
718 | ZONE_PADDING(_pad2_) | ||
719 | |||
720 | /* Per-node vmstats */ | ||
721 | struct per_cpu_nodestat __percpu *per_cpu_nodestats; | ||
722 | atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; | ||
698 | } pg_data_t; | 723 | } pg_data_t; |
699 | 724 | ||
700 | #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) | 725 | #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) |
@@ -708,6 +733,15 @@ typedef struct pglist_data { | |||
708 | 733 | ||
709 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) | 734 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) |
710 | #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) | 735 | #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) |
736 | static inline spinlock_t *zone_lru_lock(struct zone *zone) | ||
737 | { | ||
738 | return &zone->zone_pgdat->lru_lock; | ||
739 | } | ||
740 | |||
741 | static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) | ||
742 | { | ||
743 | return &pgdat->lruvec; | ||
744 | } | ||
711 | 745 | ||
712 | static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) | 746 | static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) |
713 | { | 747 | { |
@@ -760,12 +794,12 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, | |||
760 | 794 | ||
761 | extern void lruvec_init(struct lruvec *lruvec); | 795 | extern void lruvec_init(struct lruvec *lruvec); |
762 | 796 | ||
763 | static inline struct zone *lruvec_zone(struct lruvec *lruvec) | 797 | static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) |
764 | { | 798 | { |
765 | #ifdef CONFIG_MEMCG | 799 | #ifdef CONFIG_MEMCG |
766 | return lruvec->zone; | 800 | return lruvec->pgdat; |
767 | #else | 801 | #else |
768 | return container_of(lruvec, struct zone, lruvec); | 802 | return container_of(lruvec, struct pglist_data, lruvec); |
769 | #endif | 803 | #endif |
770 | } | 804 | } |
771 | 805 | ||
diff --git a/include/linux/oom.h b/include/linux/oom.h index 606137b3b778..5bc0457ee3a8 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h | |||
@@ -73,9 +73,9 @@ static inline bool oom_task_origin(const struct task_struct *p) | |||
73 | extern void mark_oom_victim(struct task_struct *tsk); | 73 | extern void mark_oom_victim(struct task_struct *tsk); |
74 | 74 | ||
75 | #ifdef CONFIG_MMU | 75 | #ifdef CONFIG_MMU |
76 | extern void try_oom_reaper(struct task_struct *tsk); | 76 | extern void wake_oom_reaper(struct task_struct *tsk); |
77 | #else | 77 | #else |
78 | static inline void try_oom_reaper(struct task_struct *tsk) | 78 | static inline void wake_oom_reaper(struct task_struct *tsk) |
79 | { | 79 | { |
80 | } | 80 | } |
81 | #endif | 81 | #endif |
@@ -107,27 +107,7 @@ extern void oom_killer_enable(void); | |||
107 | 107 | ||
108 | extern struct task_struct *find_lock_task_mm(struct task_struct *p); | 108 | extern struct task_struct *find_lock_task_mm(struct task_struct *p); |
109 | 109 | ||
110 | static inline bool task_will_free_mem(struct task_struct *task) | 110 | bool task_will_free_mem(struct task_struct *task); |
111 | { | ||
112 | struct signal_struct *sig = task->signal; | ||
113 | |||
114 | /* | ||
115 | * A coredumping process may sleep for an extended period in exit_mm(), | ||
116 | * so the oom killer cannot assume that the process will promptly exit | ||
117 | * and release memory. | ||
118 | */ | ||
119 | if (sig->flags & SIGNAL_GROUP_COREDUMP) | ||
120 | return false; | ||
121 | |||
122 | if (!(task->flags & PF_EXITING)) | ||
123 | return false; | ||
124 | |||
125 | /* Make sure that the whole thread group is going down */ | ||
126 | if (!thread_group_empty(task) && !(sig->flags & SIGNAL_GROUP_EXIT)) | ||
127 | return false; | ||
128 | |||
129 | return true; | ||
130 | } | ||
131 | 111 | ||
132 | /* sysctls */ | 112 | /* sysctls */ |
133 | extern int sysctl_oom_dump_tasks; | 113 | extern int sysctl_oom_dump_tasks; |
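task_will_free_mem() stops being a header inline because, in this series, it also has to consider other processes sharing the mm, and the reaper hook is renamed to wake_oom_reaper(). A hedged sketch of the short-circuit these declarations enable in the OOM killer: if a victim is already on its way out, mark it and kick the reaper instead of selecting a new target. This is a simplification, not the literal oom_kill path.

    #include <linux/oom.h>
    #include <linux/sched.h>

    static bool skip_new_kill_sketch(struct task_struct *victim)
    {
            if (!task_will_free_mem(victim))
                    return false;

            mark_oom_victim(victim);
            wake_oom_reaper(victim);  /* reap the mm even if exit() stalls */
            return true;
    }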
diff --git a/include/linux/sched.h b/include/linux/sched.h index d99218a1e043..553af2923824 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -523,6 +523,7 @@ static inline int get_dumpable(struct mm_struct *mm) | |||
523 | #define MMF_HAS_UPROBES 19 /* has uprobes */ | 523 | #define MMF_HAS_UPROBES 19 /* has uprobes */ |
524 | #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ | 524 | #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ |
525 | #define MMF_OOM_REAPED 21 /* mm has been already reaped */ | 525 | #define MMF_OOM_REAPED 21 /* mm has been already reaped */ |
526 | #define MMF_OOM_NOT_REAPABLE 22 /* mm couldn't be reaped */ | ||
526 | 527 | ||
527 | #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) | 528 | #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) |
528 | 529 | ||
@@ -1949,6 +1950,32 @@ static inline int tsk_nr_cpus_allowed(struct task_struct *p) | |||
1949 | #define TNF_FAULT_LOCAL 0x08 | 1950 | #define TNF_FAULT_LOCAL 0x08 |
1950 | #define TNF_MIGRATE_FAIL 0x10 | 1951 | #define TNF_MIGRATE_FAIL 0x10 |
1951 | 1952 | ||
1953 | static inline bool in_vfork(struct task_struct *tsk) | ||
1954 | { | ||
1955 | bool ret; | ||
1956 | |||
1957 | /* | ||
1958 | * need RCU to access ->real_parent if CLONE_VM was used along with | ||
1959 | * CLONE_PARENT. | ||
1960 | * | ||
1961 | * We check real_parent->mm == tsk->mm because CLONE_VFORK does not | ||
1962 | * imply CLONE_VM | ||
1963 | * | ||
1964 | * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus | ||
1965 | * ->real_parent is not necessarily the task doing vfork(), so in | ||
1966 | * theory we can't rely on task_lock() if we want to dereference it. | ||
1967 | * | ||
1968 | * And in this case we can't trust the real_parent->mm == tsk->mm | ||
1969 | * check, it can be false negative. But we do not care, if init or | ||
1970 | * another oom-unkillable task does this it should blame itself. | ||
1971 | */ | ||
1972 | rcu_read_lock(); | ||
1973 | ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm; | ||
1974 | rcu_read_unlock(); | ||
1975 | |||
1976 | return ret; | ||
1977 | } | ||
1978 | |||
1952 | #ifdef CONFIG_NUMA_BALANCING | 1979 | #ifdef CONFIG_NUMA_BALANCING |
1953 | extern void task_numa_fault(int last_node, int node, int pages, int flags); | 1980 | extern void task_numa_fault(int last_node, int node, int pages, int flags); |
1954 | extern pid_t task_numa_group_id(struct task_struct *p); | 1981 | extern pid_t task_numa_group_id(struct task_struct *p); |
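in_vfork() answers one narrow question: is this task still borrowing its parent's mm via vfork()? One plausible use, shown here as illustration only and not asserted to be the exact call site, is to keep such tasks out of OOM victim selection, since killing them would release nothing.

    #include <linux/sched.h>

    /* Illustration only: a victim-selection loop might treat a task as
     * worthless to kill while it still runs on the vfork parent's mm. */
    static bool oom_candidate_sketch(struct task_struct *tsk)
    {
            return !in_vfork(tsk);
    }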
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 339ba027ade9..4ad2c5a26399 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h | |||
@@ -88,7 +88,8 @@ struct kmem_cache { | |||
88 | }; | 88 | }; |
89 | 89 | ||
90 | static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, | 90 | static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, |
91 | void *x) { | 91 | void *x) |
92 | { | ||
92 | void *object = x - (x - page->s_mem) % cache->size; | 93 | void *object = x - (x - page->s_mem) % cache->size; |
93 | void *last_object = page->s_mem + (cache->num - 1) * cache->size; | 94 | void *last_object = page->s_mem + (cache->num - 1) * cache->size; |
94 | 95 | ||
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 5624c1f3eb0a..75f56c2ef2d4 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h | |||
@@ -104,6 +104,10 @@ struct kmem_cache { | |||
104 | unsigned int *random_seq; | 104 | unsigned int *random_seq; |
105 | #endif | 105 | #endif |
106 | 106 | ||
107 | #ifdef CONFIG_KASAN | ||
108 | struct kasan_cache kasan_info; | ||
109 | #endif | ||
110 | |||
107 | struct kmem_cache_node *node[MAX_NUMNODES]; | 111 | struct kmem_cache_node *node[MAX_NUMNODES]; |
108 | }; | 112 | }; |
109 | 113 | ||
@@ -119,15 +123,17 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) | |||
119 | void object_err(struct kmem_cache *s, struct page *page, | 123 | void object_err(struct kmem_cache *s, struct page *page, |
120 | u8 *object, char *reason); | 124 | u8 *object, char *reason); |
121 | 125 | ||
126 | void *fixup_red_left(struct kmem_cache *s, void *p); | ||
127 | |||
122 | static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, | 128 | static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, |
123 | void *x) { | 129 | void *x) { |
124 | void *object = x - (x - page_address(page)) % cache->size; | 130 | void *object = x - (x - page_address(page)) % cache->size; |
125 | void *last_object = page_address(page) + | 131 | void *last_object = page_address(page) + |
126 | (page->objects - 1) * cache->size; | 132 | (page->objects - 1) * cache->size; |
127 | if (unlikely(object > last_object)) | 133 | void *result = (unlikely(object > last_object)) ? last_object : object; |
128 | return last_object; | 134 | |
129 | else | 135 | result = fixup_red_left(cache, result); |
130 | return object; | 136 | return result; |
131 | } | 137 | } |
132 | 138 | ||
133 | #endif /* _LINUX_SLUB_DEF_H */ | 139 | #endif /* _LINUX_SLUB_DEF_H */ |
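nearest_obj() now rounds its result through fixup_red_left(), whose definition lives in mm/slub.c rather than this header. The intent, sketched here from the surrounding context and hedged accordingly, is to step over the left red zone that SLUB debugging places in front of each object so that KASAN reports point at the object proper.

    /* Conceptual sketch of fixup_red_left(); the authoritative version is in
     * mm/slub.c, and kmem_cache_debug() is internal to SLUB, so this does not
     * build outside that file. */
    static void *fixup_red_left_sketch(struct kmem_cache *s, void *p)
    {
            if (kmem_cache_debug(s) && (s->flags & SLAB_RED_ZONE))
                    p += s->red_left_pad;   /* skip the left red zone */
            return p;
    }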
diff --git a/include/linux/swap.h b/include/linux/swap.h index 0af2bb2028fd..b17cc4830fa6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -157,15 +157,6 @@ enum { | |||
157 | #define SWAP_CLUSTER_MAX 32UL | 157 | #define SWAP_CLUSTER_MAX 32UL |
158 | #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX | 158 | #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX |
159 | 159 | ||
160 | /* | ||
161 | * Ratio between zone->managed_pages and the "gap" that above the per-zone | ||
162 | * "high_wmark". While balancing nodes, We allow kswapd to shrink zones that | ||
163 | * do not meet the (high_wmark + gap) watermark, even which already met the | ||
164 | * high_wmark, in order to provide better per-zone lru behavior. We are ok to | ||
165 | * spend not more than 1% of the memory for this zone balancing "gap". | ||
166 | */ | ||
167 | #define KSWAPD_ZONE_BALANCE_GAP_RATIO 100 | ||
168 | |||
169 | #define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */ | 160 | #define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */ |
170 | #define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */ | 161 | #define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */ |
171 | #define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ | 162 | #define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ |
@@ -317,6 +308,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page, | |||
317 | 308 | ||
318 | /* linux/mm/vmscan.c */ | 309 | /* linux/mm/vmscan.c */ |
319 | extern unsigned long zone_reclaimable_pages(struct zone *zone); | 310 | extern unsigned long zone_reclaimable_pages(struct zone *zone); |
311 | extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat); | ||
320 | extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | 312 | extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, |
321 | gfp_t gfp_mask, nodemask_t *mask); | 313 | gfp_t gfp_mask, nodemask_t *mask); |
322 | extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); | 314 | extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); |
@@ -324,9 +316,9 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
324 | unsigned long nr_pages, | 316 | unsigned long nr_pages, |
325 | gfp_t gfp_mask, | 317 | gfp_t gfp_mask, |
326 | bool may_swap); | 318 | bool may_swap); |
327 | extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | 319 | extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, |
328 | gfp_t gfp_mask, bool noswap, | 320 | gfp_t gfp_mask, bool noswap, |
329 | struct zone *zone, | 321 | pg_data_t *pgdat, |
330 | unsigned long *nr_scanned); | 322 | unsigned long *nr_scanned); |
331 | extern unsigned long shrink_all_memory(unsigned long nr_pages); | 323 | extern unsigned long shrink_all_memory(unsigned long nr_pages); |
332 | extern int vm_swappiness; | 324 | extern int vm_swappiness; |
@@ -334,13 +326,14 @@ extern int remove_mapping(struct address_space *mapping, struct page *page); | |||
334 | extern unsigned long vm_total_pages; | 326 | extern unsigned long vm_total_pages; |
335 | 327 | ||
336 | #ifdef CONFIG_NUMA | 328 | #ifdef CONFIG_NUMA |
337 | extern int zone_reclaim_mode; | 329 | extern int node_reclaim_mode; |
338 | extern int sysctl_min_unmapped_ratio; | 330 | extern int sysctl_min_unmapped_ratio; |
339 | extern int sysctl_min_slab_ratio; | 331 | extern int sysctl_min_slab_ratio; |
340 | extern int zone_reclaim(struct zone *, gfp_t, unsigned int); | 332 | extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); |
341 | #else | 333 | #else |
342 | #define zone_reclaim_mode 0 | 334 | #define node_reclaim_mode 0 |
343 | static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) | 335 | static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, |
336 | unsigned int order) | ||
344 | { | 337 | { |
345 | return 0; | 338 | return 0; |
346 | } | 339 | } |
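zone_reclaim() becomes node_reclaim() and takes the pgdat backing the zone that failed its watermark check. A hedged sketch of the shape of the call from the allocator's fallback path; the surrounding policy checks (reclaim distance, cached last failure) are omitted, and the wrapper name is illustrative.

    #include <linux/swap.h>
    #include <linux/mmzone.h>
    #include <linux/gfp.h>

    static int reclaim_for_zone_sketch(struct zone *zone, gfp_t gfp_mask,
                                       unsigned int order)
    {
            if (!node_reclaim_mode)
                    return 0;       /* feature disabled (or !CONFIG_NUMA) */

            return node_reclaim(zone->zone_pgdat, gfp_mask, order);
    }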
diff --git a/include/linux/topology.h b/include/linux/topology.h index afce69296ac0..cb0775e1ee4b 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h | |||
@@ -54,7 +54,7 @@ int arch_update_cpu_topology(void); | |||
54 | /* | 54 | /* |
55 | * If the distance between nodes in a system is larger than RECLAIM_DISTANCE | 55 | * If the distance between nodes in a system is larger than RECLAIM_DISTANCE |
56 | * (in whatever arch specific measurement units returned by node_distance()) | 56 | * (in whatever arch specific measurement units returned by node_distance()) |
57 | * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim() | 57 | * and node_reclaim_mode is enabled then the VM will only call node_reclaim() |
58 | * on nodes within this distance. | 58 | * on nodes within this distance. |
59 | */ | 59 | */ |
60 | #define RECLAIM_DISTANCE 30 | 60 | #define RECLAIM_DISTANCE 30 |
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 42604173f122..4d6ec58a8d45 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h | |||
@@ -23,21 +23,23 @@ | |||
23 | 23 | ||
24 | enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | 24 | enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, |
25 | FOR_ALL_ZONES(PGALLOC), | 25 | FOR_ALL_ZONES(PGALLOC), |
26 | FOR_ALL_ZONES(ALLOCSTALL), | ||
27 | FOR_ALL_ZONES(PGSCAN_SKIP), | ||
26 | PGFREE, PGACTIVATE, PGDEACTIVATE, | 28 | PGFREE, PGACTIVATE, PGDEACTIVATE, |
27 | PGFAULT, PGMAJFAULT, | 29 | PGFAULT, PGMAJFAULT, |
28 | PGLAZYFREED, | 30 | PGLAZYFREED, |
29 | FOR_ALL_ZONES(PGREFILL), | 31 | PGREFILL, |
30 | FOR_ALL_ZONES(PGSTEAL_KSWAPD), | 32 | PGSTEAL_KSWAPD, |
31 | FOR_ALL_ZONES(PGSTEAL_DIRECT), | 33 | PGSTEAL_DIRECT, |
32 | FOR_ALL_ZONES(PGSCAN_KSWAPD), | 34 | PGSCAN_KSWAPD, |
33 | FOR_ALL_ZONES(PGSCAN_DIRECT), | 35 | PGSCAN_DIRECT, |
34 | PGSCAN_DIRECT_THROTTLE, | 36 | PGSCAN_DIRECT_THROTTLE, |
35 | #ifdef CONFIG_NUMA | 37 | #ifdef CONFIG_NUMA |
36 | PGSCAN_ZONE_RECLAIM_FAILED, | 38 | PGSCAN_ZONE_RECLAIM_FAILED, |
37 | #endif | 39 | #endif |
38 | PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, | 40 | PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, |
39 | KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, | 41 | KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, |
40 | PAGEOUTRUN, ALLOCSTALL, PGROTATED, | 42 | PAGEOUTRUN, PGROTATED, |
41 | DROP_PAGECACHE, DROP_SLAB, | 43 | DROP_PAGECACHE, DROP_SLAB, |
42 | #ifdef CONFIG_NUMA_BALANCING | 44 | #ifdef CONFIG_NUMA_BALANCING |
43 | NUMA_PTE_UPDATES, | 45 | NUMA_PTE_UPDATES, |
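Most reclaim event counters (PGREFILL, PGSTEAL_*, PGSCAN_*) lose their per-zone breakdown because scanning is now node-wide, while ALLOCSTALL and the new PGSCAN_SKIP gain one. A hedged sketch of how such events might be counted after the change, using the __count_zid_vm_events() helper introduced in the vmstat.h hunk below; the call site is illustrative.

    #include <linux/vmstat.h>

    /* Illustration only: a node-wide event plus a per-zone-index breakdown. */
    static void count_reclaim_events_sketch(int classzone_idx,
                                            unsigned long nr_scanned)
    {
            count_vm_events(PGSCAN_KSWAPD, nr_scanned);
            __count_zid_vm_events(ALLOCSTALL, classzone_idx, 1);
    }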
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d2da8e053210..613771909b6e 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h | |||
@@ -101,25 +101,42 @@ static inline void vm_events_fold_cpu(int cpu) | |||
101 | #define count_vm_vmacache_event(x) do {} while (0) | 101 | #define count_vm_vmacache_event(x) do {} while (0) |
102 | #endif | 102 | #endif |
103 | 103 | ||
104 | #define __count_zone_vm_events(item, zone, delta) \ | 104 | #define __count_zid_vm_events(item, zid, delta) \ |
105 | __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ | 105 | __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) |
106 | zone_idx(zone), delta) | ||
107 | 106 | ||
108 | /* | 107 | /* |
109 | * Zone based page accounting with per cpu differentials. | 108 | * Zone and node-based page accounting with per cpu differentials. |
110 | */ | 109 | */ |
111 | extern atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; | 110 | extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; |
111 | extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; | ||
112 | 112 | ||
113 | static inline void zone_page_state_add(long x, struct zone *zone, | 113 | static inline void zone_page_state_add(long x, struct zone *zone, |
114 | enum zone_stat_item item) | 114 | enum zone_stat_item item) |
115 | { | 115 | { |
116 | atomic_long_add(x, &zone->vm_stat[item]); | 116 | atomic_long_add(x, &zone->vm_stat[item]); |
117 | atomic_long_add(x, &vm_stat[item]); | 117 | atomic_long_add(x, &vm_zone_stat[item]); |
118 | } | ||
119 | |||
120 | static inline void node_page_state_add(long x, struct pglist_data *pgdat, | ||
121 | enum node_stat_item item) | ||
122 | { | ||
123 | atomic_long_add(x, &pgdat->vm_stat[item]); | ||
124 | atomic_long_add(x, &vm_node_stat[item]); | ||
118 | } | 125 | } |
119 | 126 | ||
120 | static inline unsigned long global_page_state(enum zone_stat_item item) | 127 | static inline unsigned long global_page_state(enum zone_stat_item item) |
121 | { | 128 | { |
122 | long x = atomic_long_read(&vm_stat[item]); | 129 | long x = atomic_long_read(&vm_zone_stat[item]); |
130 | #ifdef CONFIG_SMP | ||
131 | if (x < 0) | ||
132 | x = 0; | ||
133 | #endif | ||
134 | return x; | ||
135 | } | ||
136 | |||
137 | static inline unsigned long global_node_page_state(enum node_stat_item item) | ||
138 | { | ||
139 | long x = atomic_long_read(&vm_node_stat[item]); | ||
123 | #ifdef CONFIG_SMP | 140 | #ifdef CONFIG_SMP |
124 | if (x < 0) | 141 | if (x < 0) |
125 | x = 0; | 142 | x = 0; |
@@ -160,32 +177,61 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, | |||
160 | return x; | 177 | return x; |
161 | } | 178 | } |
162 | 179 | ||
163 | #ifdef CONFIG_NUMA | 180 | static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat, |
181 | enum node_stat_item item) | ||
182 | { | ||
183 | long x = atomic_long_read(&pgdat->vm_stat[item]); | ||
164 | 184 | ||
165 | extern unsigned long node_page_state(int node, enum zone_stat_item item); | 185 | #ifdef CONFIG_SMP |
186 | int cpu; | ||
187 | for_each_online_cpu(cpu) | ||
188 | x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item]; | ||
166 | 189 | ||
167 | #else | 190 | if (x < 0) |
191 | x = 0; | ||
192 | #endif | ||
193 | return x; | ||
194 | } | ||
168 | 195 | ||
169 | #define node_page_state(node, item) global_page_state(item) | ||
170 | 196 | ||
197 | #ifdef CONFIG_NUMA | ||
198 | extern unsigned long sum_zone_node_page_state(int node, | ||
199 | enum zone_stat_item item); | ||
200 | extern unsigned long node_page_state(struct pglist_data *pgdat, | ||
201 | enum node_stat_item item); | ||
202 | #else | ||
203 | #define sum_zone_node_page_state(node, item) global_page_state(item) | ||
204 | #define node_page_state(node, item) global_node_page_state(item) | ||
171 | #endif /* CONFIG_NUMA */ | 205 | #endif /* CONFIG_NUMA */ |
172 | 206 | ||
173 | #define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d) | 207 | #define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d) |
174 | #define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d)) | 208 | #define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d)) |
209 | #define add_node_page_state(__p, __i, __d) mod_node_page_state(__p, __i, __d) | ||
210 | #define sub_node_page_state(__p, __i, __d) mod_node_page_state(__p, __i, -(__d)) | ||
175 | 211 | ||
176 | #ifdef CONFIG_SMP | 212 | #ifdef CONFIG_SMP |
177 | void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long); | 213 | void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long); |
178 | void __inc_zone_page_state(struct page *, enum zone_stat_item); | 214 | void __inc_zone_page_state(struct page *, enum zone_stat_item); |
179 | void __dec_zone_page_state(struct page *, enum zone_stat_item); | 215 | void __dec_zone_page_state(struct page *, enum zone_stat_item); |
180 | 216 | ||
217 | void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long); | ||
218 | void __inc_node_page_state(struct page *, enum node_stat_item); | ||
219 | void __dec_node_page_state(struct page *, enum node_stat_item); | ||
220 | |||
181 | void mod_zone_page_state(struct zone *, enum zone_stat_item, long); | 221 | void mod_zone_page_state(struct zone *, enum zone_stat_item, long); |
182 | void inc_zone_page_state(struct page *, enum zone_stat_item); | 222 | void inc_zone_page_state(struct page *, enum zone_stat_item); |
183 | void dec_zone_page_state(struct page *, enum zone_stat_item); | 223 | void dec_zone_page_state(struct page *, enum zone_stat_item); |
184 | 224 | ||
185 | extern void inc_zone_state(struct zone *, enum zone_stat_item); | 225 | void mod_node_page_state(struct pglist_data *, enum node_stat_item, long); |
226 | void inc_node_page_state(struct page *, enum node_stat_item); | ||
227 | void dec_node_page_state(struct page *, enum node_stat_item); | ||
228 | |||
229 | extern void inc_node_state(struct pglist_data *, enum node_stat_item); | ||
186 | extern void __inc_zone_state(struct zone *, enum zone_stat_item); | 230 | extern void __inc_zone_state(struct zone *, enum zone_stat_item); |
231 | extern void __inc_node_state(struct pglist_data *, enum node_stat_item); | ||
187 | extern void dec_zone_state(struct zone *, enum zone_stat_item); | 232 | extern void dec_zone_state(struct zone *, enum zone_stat_item); |
188 | extern void __dec_zone_state(struct zone *, enum zone_stat_item); | 233 | extern void __dec_zone_state(struct zone *, enum zone_stat_item); |
234 | extern void __dec_node_state(struct pglist_data *, enum node_stat_item); | ||
189 | 235 | ||
190 | void quiet_vmstat(void); | 236 | void quiet_vmstat(void); |
191 | void cpu_vm_stats_fold(int cpu); | 237 | void cpu_vm_stats_fold(int cpu); |
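node_page_state_snapshot() mirrors the existing zone snapshot helper: it folds in the not-yet-flushed per-cpu deltas so that slow paths which must not act on a stale count can pay for accuracy. A minimal usage sketch; the wrapper name is made up.

    #include <linux/vmstat.h>

    /* Sketch: prefer the (more expensive) snapshot when the decision is
     * sensitive to per-cpu drift, e.g. counting isolated pages. */
    static unsigned long isolated_file_now_sketch(pg_data_t *pgdat)
    {
            return node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);
    }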
@@ -213,16 +259,34 @@ static inline void __mod_zone_page_state(struct zone *zone, | |||
213 | zone_page_state_add(delta, zone, item); | 259 | zone_page_state_add(delta, zone, item); |
214 | } | 260 | } |
215 | 261 | ||
262 | static inline void __mod_node_page_state(struct pglist_data *pgdat, | ||
263 | enum node_stat_item item, int delta) | ||
264 | { | ||
265 | node_page_state_add(delta, pgdat, item); | ||
266 | } | ||
267 | |||
216 | static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item) | 268 | static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item) |
217 | { | 269 | { |
218 | atomic_long_inc(&zone->vm_stat[item]); | 270 | atomic_long_inc(&zone->vm_stat[item]); |
219 | atomic_long_inc(&vm_stat[item]); | 271 | atomic_long_inc(&vm_zone_stat[item]); |
272 | } | ||
273 | |||
274 | static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) | ||
275 | { | ||
276 | atomic_long_inc(&pgdat->vm_stat[item]); | ||
277 | atomic_long_inc(&vm_node_stat[item]); | ||
220 | } | 278 | } |
221 | 279 | ||
222 | static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item) | 280 | static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item) |
223 | { | 281 | { |
224 | atomic_long_dec(&zone->vm_stat[item]); | 282 | atomic_long_dec(&zone->vm_stat[item]); |
225 | atomic_long_dec(&vm_stat[item]); | 283 | atomic_long_dec(&vm_zone_stat[item]); |
284 | } | ||
285 | |||
286 | static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) | ||
287 | { | ||
288 | atomic_long_dec(&pgdat->vm_stat[item]); | ||
289 | atomic_long_dec(&vm_node_stat[item]); | ||
226 | } | 290 | } |
227 | 291 | ||
228 | static inline void __inc_zone_page_state(struct page *page, | 292 | static inline void __inc_zone_page_state(struct page *page, |
@@ -231,12 +295,26 @@ static inline void __inc_zone_page_state(struct page *page, | |||
231 | __inc_zone_state(page_zone(page), item); | 295 | __inc_zone_state(page_zone(page), item); |
232 | } | 296 | } |
233 | 297 | ||
298 | static inline void __inc_node_page_state(struct page *page, | ||
299 | enum node_stat_item item) | ||
300 | { | ||
301 | __inc_node_state(page_pgdat(page), item); | ||
302 | } | ||
303 | |||
304 | |||
234 | static inline void __dec_zone_page_state(struct page *page, | 305 | static inline void __dec_zone_page_state(struct page *page, |
235 | enum zone_stat_item item) | 306 | enum zone_stat_item item) |
236 | { | 307 | { |
237 | __dec_zone_state(page_zone(page), item); | 308 | __dec_zone_state(page_zone(page), item); |
238 | } | 309 | } |
239 | 310 | ||
311 | static inline void __dec_node_page_state(struct page *page, | ||
312 | enum node_stat_item item) | ||
313 | { | ||
314 | __dec_node_state(page_pgdat(page), item); | ||
315 | } | ||
316 | |||
317 | |||
240 | /* | 318 | /* |
241 | * We only use atomic operations to update counters. So there is no need to | 319 | * We only use atomic operations to update counters. So there is no need to |
242 | * disable interrupts. | 320 | * disable interrupts. |
@@ -245,7 +323,12 @@ static inline void __dec_zone_page_state(struct page *page, | |||
245 | #define dec_zone_page_state __dec_zone_page_state | 323 | #define dec_zone_page_state __dec_zone_page_state |
246 | #define mod_zone_page_state __mod_zone_page_state | 324 | #define mod_zone_page_state __mod_zone_page_state |
247 | 325 | ||
326 | #define inc_node_page_state __inc_node_page_state | ||
327 | #define dec_node_page_state __dec_node_page_state | ||
328 | #define mod_node_page_state __mod_node_page_state | ||
329 | |||
248 | #define inc_zone_state __inc_zone_state | 330 | #define inc_zone_state __inc_zone_state |
331 | #define inc_node_state __inc_node_state | ||
249 | #define dec_zone_state __dec_zone_state | 332 | #define dec_zone_state __dec_zone_state |
250 | 333 | ||
251 | #define set_pgdat_percpu_threshold(pgdat, callback) { } | 334 | #define set_pgdat_percpu_threshold(pgdat, callback) { } |
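With the node counterparts of the mod/inc/dec helpers in place, page-cache accounting that used to be zone-based moves to the node, leaving only a small zone-level residue (NR_ZONE_WRITE_PENDING) for the allocator. A hedged sketch of roughly what dirtying a page-cache page charges after this series; the real accounting also updates the writeback domain and per-bdi counters, which are omitted here.

    #include <linux/mm.h>
    #include <linux/vmstat.h>

    static void account_dirtied_sketch(struct page *page)
    {
            __inc_node_page_state(page, NR_FILE_DIRTY);
            __inc_node_page_state(page, NR_DIRTIED);
            __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
    }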
diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 717e6149e753..fc1e16c25a29 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h | |||
@@ -320,7 +320,7 @@ void laptop_mode_timer_fn(unsigned long data); | |||
320 | static inline void laptop_sync_completion(void) { } | 320 | static inline void laptop_sync_completion(void) { } |
321 | #endif | 321 | #endif |
322 | void throttle_vm_writeout(gfp_t gfp_mask); | 322 | void throttle_vm_writeout(gfp_t gfp_mask); |
323 | bool zone_dirty_ok(struct zone *zone); | 323 | bool node_dirty_ok(struct pglist_data *pgdat); |
324 | int wb_domain_init(struct wb_domain *dom, gfp_t gfp); | 324 | int wb_domain_init(struct wb_domain *dom, gfp_t gfp); |
325 | #ifdef CONFIG_CGROUP_WRITEBACK | 325 | #ifdef CONFIG_CGROUP_WRITEBACK |
326 | void wb_domain_exit(struct wb_domain *dom); | 326 | void wb_domain_exit(struct wb_domain *dom); |
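zone_dirty_ok() becomes node_dirty_ok(): dirty-page placement is now balanced against a node-sized dirty limit rather than a zone-sized one. A hedged sketch of how an allocator-side check might look when spreading dirty page-cache pages across nodes; the function is illustrative, not the actual allocator code.

    #include <linux/writeback.h>
    #include <linux/mmzone.h>

    /* Illustration only: skip zones whose backing node already holds too
     * many dirty pages when __GFP_WRITE placement is being spread. */
    static bool zone_usable_for_dirty_sketch(struct zone *zone)
    {
            return node_dirty_ok(zone->zone_pgdat);
    }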
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 36e2d6fb1360..c2ba402ab256 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h | |||
@@ -226,26 +226,26 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages, | |||
226 | TP_PROTO( | 226 | TP_PROTO( |
227 | int order, | 227 | int order, |
228 | gfp_t gfp_mask, | 228 | gfp_t gfp_mask, |
229 | enum migrate_mode mode), | 229 | int prio), |
230 | 230 | ||
231 | TP_ARGS(order, gfp_mask, mode), | 231 | TP_ARGS(order, gfp_mask, prio), |
232 | 232 | ||
233 | TP_STRUCT__entry( | 233 | TP_STRUCT__entry( |
234 | __field(int, order) | 234 | __field(int, order) |
235 | __field(gfp_t, gfp_mask) | 235 | __field(gfp_t, gfp_mask) |
236 | __field(enum migrate_mode, mode) | 236 | __field(int, prio) |
237 | ), | 237 | ), |
238 | 238 | ||
239 | TP_fast_assign( | 239 | TP_fast_assign( |
240 | __entry->order = order; | 240 | __entry->order = order; |
241 | __entry->gfp_mask = gfp_mask; | 241 | __entry->gfp_mask = gfp_mask; |
242 | __entry->mode = mode; | 242 | __entry->prio = prio; |
243 | ), | 243 | ), |
244 | 244 | ||
245 | TP_printk("order=%d gfp_mask=0x%x mode=%d", | 245 | TP_printk("order=%d gfp_mask=0x%x priority=%d", |
246 | __entry->order, | 246 | __entry->order, |
247 | __entry->gfp_mask, | 247 | __entry->gfp_mask, |
248 | (int)__entry->mode) | 248 | __entry->prio) |
249 | ); | 249 | ); |
250 | 250 | ||
251 | DECLARE_EVENT_CLASS(mm_compaction_suitable_template, | 251 | DECLARE_EVENT_CLASS(mm_compaction_suitable_template, |
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 43cedbf0c759..5a81ab48a2fb 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h | |||
@@ -11,6 +11,7 @@ | |||
11 | 11 | ||
12 | #define __def_gfpflag_names \ | 12 | #define __def_gfpflag_names \ |
13 | {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \ | 13 | {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \ |
14 | {(unsigned long)GFP_TRANSHUGE_LIGHT, "GFP_TRANSHUGE_LIGHT"}, \ | ||
14 | {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\ | 15 | {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\ |
15 | {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \ | 16 | {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \ |
16 | {(unsigned long)GFP_USER, "GFP_USER"}, \ | 17 | {(unsigned long)GFP_USER, "GFP_USER"}, \ |
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 0101ef37f1ee..c88fd0934e7e 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h | |||
@@ -55,21 +55,23 @@ TRACE_EVENT(mm_vmscan_kswapd_sleep, | |||
55 | 55 | ||
56 | TRACE_EVENT(mm_vmscan_kswapd_wake, | 56 | TRACE_EVENT(mm_vmscan_kswapd_wake, |
57 | 57 | ||
58 | TP_PROTO(int nid, int order), | 58 | TP_PROTO(int nid, int zid, int order), |
59 | 59 | ||
60 | TP_ARGS(nid, order), | 60 | TP_ARGS(nid, zid, order), |
61 | 61 | ||
62 | TP_STRUCT__entry( | 62 | TP_STRUCT__entry( |
63 | __field( int, nid ) | 63 | __field( int, nid ) |
64 | __field( int, zid ) | ||
64 | __field( int, order ) | 65 | __field( int, order ) |
65 | ), | 66 | ), |
66 | 67 | ||
67 | TP_fast_assign( | 68 | TP_fast_assign( |
68 | __entry->nid = nid; | 69 | __entry->nid = nid; |
70 | __entry->zid = zid; | ||
69 | __entry->order = order; | 71 | __entry->order = order; |
70 | ), | 72 | ), |
71 | 73 | ||
72 | TP_printk("nid=%d order=%d", __entry->nid, __entry->order) | 74 | TP_printk("nid=%d zid=%d order=%d", __entry->nid, __entry->zid, __entry->order) |
73 | ); | 75 | ); |
74 | 76 | ||
75 | TRACE_EVENT(mm_vmscan_wakeup_kswapd, | 77 | TRACE_EVENT(mm_vmscan_wakeup_kswapd, |
@@ -98,47 +100,50 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd, | |||
98 | 100 | ||
99 | DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, | 101 | DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, |
100 | 102 | ||
101 | TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), | 103 | TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), |
102 | 104 | ||
103 | TP_ARGS(order, may_writepage, gfp_flags), | 105 | TP_ARGS(order, may_writepage, gfp_flags, classzone_idx), |
104 | 106 | ||
105 | TP_STRUCT__entry( | 107 | TP_STRUCT__entry( |
106 | __field( int, order ) | 108 | __field( int, order ) |
107 | __field( int, may_writepage ) | 109 | __field( int, may_writepage ) |
108 | __field( gfp_t, gfp_flags ) | 110 | __field( gfp_t, gfp_flags ) |
111 | __field( int, classzone_idx ) | ||
109 | ), | 112 | ), |
110 | 113 | ||
111 | TP_fast_assign( | 114 | TP_fast_assign( |
112 | __entry->order = order; | 115 | __entry->order = order; |
113 | __entry->may_writepage = may_writepage; | 116 | __entry->may_writepage = may_writepage; |
114 | __entry->gfp_flags = gfp_flags; | 117 | __entry->gfp_flags = gfp_flags; |
118 | __entry->classzone_idx = classzone_idx; | ||
115 | ), | 119 | ), |
116 | 120 | ||
117 | TP_printk("order=%d may_writepage=%d gfp_flags=%s", | 121 | TP_printk("order=%d may_writepage=%d gfp_flags=%s classzone_idx=%d", |
118 | __entry->order, | 122 | __entry->order, |
119 | __entry->may_writepage, | 123 | __entry->may_writepage, |
120 | show_gfp_flags(__entry->gfp_flags)) | 124 | show_gfp_flags(__entry->gfp_flags), |
125 | __entry->classzone_idx) | ||
121 | ); | 126 | ); |
122 | 127 | ||
123 | DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, | 128 | DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, |
124 | 129 | ||
125 | TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), | 130 | TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), |
126 | 131 | ||
127 | TP_ARGS(order, may_writepage, gfp_flags) | 132 | TP_ARGS(order, may_writepage, gfp_flags, classzone_idx) |
128 | ); | 133 | ); |
129 | 134 | ||
130 | DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin, | 135 | DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin, |
131 | 136 | ||
132 | TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), | 137 | TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), |
133 | 138 | ||
134 | TP_ARGS(order, may_writepage, gfp_flags) | 139 | TP_ARGS(order, may_writepage, gfp_flags, classzone_idx) |
135 | ); | 140 | ); |
136 | 141 | ||
137 | DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin, | 142 | DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin, |
138 | 143 | ||
139 | TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), | 144 | TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), |
140 | 145 | ||
141 | TP_ARGS(order, may_writepage, gfp_flags) | 146 | TP_ARGS(order, may_writepage, gfp_flags, classzone_idx) |
142 | ); | 147 | ); |
143 | 148 | ||
144 | DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template, | 149 | DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template, |
@@ -266,16 +271,18 @@ TRACE_EVENT(mm_shrink_slab_end, | |||
266 | 271 | ||
267 | DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, | 272 | DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, |
268 | 273 | ||
269 | TP_PROTO(int order, | 274 | TP_PROTO(int classzone_idx, |
275 | int order, | ||
270 | unsigned long nr_requested, | 276 | unsigned long nr_requested, |
271 | unsigned long nr_scanned, | 277 | unsigned long nr_scanned, |
272 | unsigned long nr_taken, | 278 | unsigned long nr_taken, |
273 | isolate_mode_t isolate_mode, | 279 | isolate_mode_t isolate_mode, |
274 | int file), | 280 | int file), |
275 | 281 | ||
276 | TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file), | 282 | TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file), |
277 | 283 | ||
278 | TP_STRUCT__entry( | 284 | TP_STRUCT__entry( |
285 | __field(int, classzone_idx) | ||
279 | __field(int, order) | 286 | __field(int, order) |
280 | __field(unsigned long, nr_requested) | 287 | __field(unsigned long, nr_requested) |
281 | __field(unsigned long, nr_scanned) | 288 | __field(unsigned long, nr_scanned) |
@@ -285,6 +292,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, | |||
285 | ), | 292 | ), |
286 | 293 | ||
287 | TP_fast_assign( | 294 | TP_fast_assign( |
295 | __entry->classzone_idx = classzone_idx; | ||
288 | __entry->order = order; | 296 | __entry->order = order; |
289 | __entry->nr_requested = nr_requested; | 297 | __entry->nr_requested = nr_requested; |
290 | __entry->nr_scanned = nr_scanned; | 298 | __entry->nr_scanned = nr_scanned; |
@@ -293,8 +301,9 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, | |||
293 | __entry->file = file; | 301 | __entry->file = file; |
294 | ), | 302 | ), |
295 | 303 | ||
296 | TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d", | 304 | TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d", |
297 | __entry->isolate_mode, | 305 | __entry->isolate_mode, |
306 | __entry->classzone_idx, | ||
298 | __entry->order, | 307 | __entry->order, |
299 | __entry->nr_requested, | 308 | __entry->nr_requested, |
300 | __entry->nr_scanned, | 309 | __entry->nr_scanned, |
@@ -304,27 +313,29 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, | |||
304 | 313 | ||
305 | DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, | 314 | DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, |
306 | 315 | ||
307 | TP_PROTO(int order, | 316 | TP_PROTO(int classzone_idx, |
317 | int order, | ||
308 | unsigned long nr_requested, | 318 | unsigned long nr_requested, |
309 | unsigned long nr_scanned, | 319 | unsigned long nr_scanned, |
310 | unsigned long nr_taken, | 320 | unsigned long nr_taken, |
311 | isolate_mode_t isolate_mode, | 321 | isolate_mode_t isolate_mode, |
312 | int file), | 322 | int file), |
313 | 323 | ||
314 | TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) | 324 | TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) |
315 | 325 | ||
316 | ); | 326 | ); |
317 | 327 | ||
318 | DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate, | 328 | DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate, |
319 | 329 | ||
320 | TP_PROTO(int order, | 330 | TP_PROTO(int classzone_idx, |
331 | int order, | ||
321 | unsigned long nr_requested, | 332 | unsigned long nr_requested, |
322 | unsigned long nr_scanned, | 333 | unsigned long nr_scanned, |
323 | unsigned long nr_taken, | 334 | unsigned long nr_taken, |
324 | isolate_mode_t isolate_mode, | 335 | isolate_mode_t isolate_mode, |
325 | int file), | 336 | int file), |
326 | 337 | ||
327 | TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) | 338 | TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) |
328 | 339 | ||
329 | ); | 340 | ); |
330 | 341 | ||
@@ -352,15 +363,14 @@ TRACE_EVENT(mm_vmscan_writepage, | |||
352 | 363 | ||
353 | TRACE_EVENT(mm_vmscan_lru_shrink_inactive, | 364 | TRACE_EVENT(mm_vmscan_lru_shrink_inactive, |
354 | 365 | ||
355 | TP_PROTO(struct zone *zone, | 366 | TP_PROTO(int nid, |
356 | unsigned long nr_scanned, unsigned long nr_reclaimed, | 367 | unsigned long nr_scanned, unsigned long nr_reclaimed, |
357 | int priority, int file), | 368 | int priority, int file), |
358 | 369 | ||
359 | TP_ARGS(zone, nr_scanned, nr_reclaimed, priority, file), | 370 | TP_ARGS(nid, nr_scanned, nr_reclaimed, priority, file), |
360 | 371 | ||
361 | TP_STRUCT__entry( | 372 | TP_STRUCT__entry( |
362 | __field(int, nid) | 373 | __field(int, nid) |
363 | __field(int, zid) | ||
364 | __field(unsigned long, nr_scanned) | 374 | __field(unsigned long, nr_scanned) |
365 | __field(unsigned long, nr_reclaimed) | 375 | __field(unsigned long, nr_reclaimed) |
366 | __field(int, priority) | 376 | __field(int, priority) |
@@ -368,16 +378,15 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, | |||
368 | ), | 378 | ), |
369 | 379 | ||
370 | TP_fast_assign( | 380 | TP_fast_assign( |
371 | __entry->nid = zone_to_nid(zone); | 381 | __entry->nid = nid; |
372 | __entry->zid = zone_idx(zone); | ||
373 | __entry->nr_scanned = nr_scanned; | 382 | __entry->nr_scanned = nr_scanned; |
374 | __entry->nr_reclaimed = nr_reclaimed; | 383 | __entry->nr_reclaimed = nr_reclaimed; |
375 | __entry->priority = priority; | 384 | __entry->priority = priority; |
376 | __entry->reclaim_flags = trace_shrink_flags(file); | 385 | __entry->reclaim_flags = trace_shrink_flags(file); |
377 | ), | 386 | ), |
378 | 387 | ||
379 | TP_printk("nid=%d zid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s", | 388 | TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s", |
380 | __entry->nid, __entry->zid, | 389 | __entry->nid, |
381 | __entry->nr_scanned, __entry->nr_reclaimed, | 390 | __entry->nr_scanned, __entry->nr_reclaimed, |
382 | __entry->priority, | 391 | __entry->priority, |
383 | show_reclaim_flags(__entry->reclaim_flags)) | 392 | show_reclaim_flags(__entry->reclaim_flags)) |
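The isolate tracepoints above now carry the classzone index, and the shrink_inactive event reports only the node, so the emitted trace lines change shape accordingly. A minimal illustration of the new output, reusing the TP_printk format strings from the hunks above with invented values (plain userspace C, not kernel code):

    #include <stdio.h>

    /* Illustrative only: prints sample lines in the new tracepoint formats.
     * All numeric values and the flags string are invented for demonstration. */
    int main(void)
    {
        printf("isolate_mode=%d classzone=%d order=%d nr_requested=%lu "
               "nr_scanned=%lu nr_taken=%lu file=%d\n",
               0, 2, 0, 32UL, 32UL, 32UL, 1);
        printf("nid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s\n",
               0, 32L, 28L, 12, "RECLAIM_WB_FILE|RECLAIM_WB_ASYNC");
        return 0;
    }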
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 531f5811ff6b..2ccd9ccbf9ef 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h | |||
@@ -412,11 +412,11 @@ TRACE_EVENT(global_dirty_state, | |||
412 | ), | 412 | ), |
413 | 413 | ||
414 | TP_fast_assign( | 414 | TP_fast_assign( |
415 | __entry->nr_dirty = global_page_state(NR_FILE_DIRTY); | 415 | __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); |
416 | __entry->nr_writeback = global_page_state(NR_WRITEBACK); | 416 | __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); |
417 | __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS); | 417 | __entry->nr_unstable = global_node_page_state(NR_UNSTABLE_NFS); |
418 | __entry->nr_dirtied = global_page_state(NR_DIRTIED); | 418 | __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); |
419 | __entry->nr_written = global_page_state(NR_WRITTEN); | 419 | __entry->nr_written = global_node_page_state(NR_WRITTEN); |
420 | __entry->background_thresh = background_thresh; | 420 | __entry->background_thresh = background_thresh; |
421 | __entry->dirty_thresh = dirty_thresh; | 421 | __entry->dirty_thresh = dirty_thresh; |
422 | __entry->dirty_limit = global_wb_domain.dirty_limit; | 422 | __entry->dirty_limit = global_wb_domain.dirty_limit; |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 73e93e53884d..c7fd2778ed50 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
1034 | { | 1034 | { |
1035 | bool need_loop; | 1035 | bool need_loop; |
1036 | 1036 | ||
1037 | /* | ||
1038 | * Allow tasks that have access to memory reserves because they have | ||
1039 | * been OOM killed to get memory anywhere. | ||
1040 | */ | ||
1041 | if (unlikely(test_thread_flag(TIF_MEMDIE))) | ||
1042 | return; | ||
1043 | if (current->flags & PF_EXITING) /* Let dying task have memory */ | ||
1044 | return; | ||
1045 | |||
1046 | task_lock(tsk); | 1037 | task_lock(tsk); |
1047 | /* | 1038 | /* |
1048 | * Determine if a loop is necessary if another thread is doing | 1039 | * Determine if a loop is necessary if another thread is doing |
diff --git a/kernel/fork.c b/kernel/fork.c index de21f25e0d2c..52e725d4a866 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -165,20 +165,12 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, | |||
165 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, | 165 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, |
166 | THREAD_SIZE_ORDER); | 166 | THREAD_SIZE_ORDER); |
167 | 167 | ||
168 | if (page) | ||
169 | memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, | ||
170 | 1 << THREAD_SIZE_ORDER); | ||
171 | |||
172 | return page ? page_address(page) : NULL; | 168 | return page ? page_address(page) : NULL; |
173 | } | 169 | } |
174 | 170 | ||
175 | static inline void free_thread_stack(unsigned long *stack) | 171 | static inline void free_thread_stack(unsigned long *stack) |
176 | { | 172 | { |
177 | struct page *page = virt_to_page(stack); | 173 | __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); |
178 | |||
179 | memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, | ||
180 | -(1 << THREAD_SIZE_ORDER)); | ||
181 | __free_pages(page, THREAD_SIZE_ORDER); | ||
182 | } | 174 | } |
183 | # else | 175 | # else |
184 | static struct kmem_cache *thread_stack_cache; | 176 | static struct kmem_cache *thread_stack_cache; |
@@ -223,9 +215,15 @@ static struct kmem_cache *mm_cachep; | |||
223 | 215 | ||
224 | static void account_kernel_stack(unsigned long *stack, int account) | 216 | static void account_kernel_stack(unsigned long *stack, int account) |
225 | { | 217 | { |
226 | struct zone *zone = page_zone(virt_to_page(stack)); | 218 | /* All stack pages are in the same zone and belong to the same memcg. */ |
219 | struct page *first_page = virt_to_page(stack); | ||
220 | |||
221 | mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, | ||
222 | THREAD_SIZE / 1024 * account); | ||
227 | 223 | ||
228 | mod_zone_page_state(zone, NR_KERNEL_STACK, account); | 224 | memcg_kmem_update_page_stat( |
225 | first_page, MEMCG_KERNEL_STACK_KB, | ||
226 | account * (THREAD_SIZE / 1024)); | ||
229 | } | 227 | } |
230 | 228 | ||
231 | void free_task(struct task_struct *tsk) | 229 | void free_task(struct task_struct *tsk) |
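account_kernel_stack() now charges the stack in kilobytes (NR_KERNEL_STACK_KB per zone and MEMCG_KERNEL_STACK_KB per memcg) rather than counting whole stacks, which is why the memcg hooks could be dropped from the alloc/free paths above. A minimal sketch of the delta arithmetic, assuming a 16 KB THREAD_SIZE purely for illustration:

    #include <stdio.h>

    #define THREAD_SIZE (16 * 1024)    /* assumed stack size, for illustration */

    /* Sketch of the delta applied to the per-zone and per-memcg counters:
     * account is +1 when a stack is allocated and -1 when it is freed. */
    static long stack_kb_delta(int account)
    {
        return (long)(THREAD_SIZE / 1024) * account;
    }

    int main(void)
    {
        printf("alloc: %+ld KB\n", stack_kb_delta(1));    /* +16 KB */
        printf("free:  %+ld KB\n", stack_kb_delta(-1));   /* -16 KB */
        return 0;
    }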
diff --git a/kernel/freezer.c b/kernel/freezer.c index a8900a3bc27a..6f56a9e219fa 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
@@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p) | |||
42 | if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) | 42 | if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) |
43 | return false; | 43 | return false; |
44 | 44 | ||
45 | if (test_thread_flag(TIF_MEMDIE)) | 45 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) |
46 | return false; | 46 | return false; |
47 | 47 | ||
48 | if (pm_nosig_freezing || cgroup_freezing(p)) | 48 | if (pm_nosig_freezing || cgroup_freezing(p)) |
diff --git a/kernel/memremap.c b/kernel/memremap.c index 017532193fb1..ddb3247a872a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
@@ -308,12 +308,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
308 | if (is_ram == REGION_INTERSECTS) | 308 | if (is_ram == REGION_INTERSECTS) |
309 | return __va(res->start); | 309 | return __va(res->start); |
310 | 310 | ||
311 | if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { | ||
312 | dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", | ||
313 | __func__); | ||
314 | return ERR_PTR(-ENXIO); | ||
315 | } | ||
316 | |||
317 | if (!ref) | 311 | if (!ref) |
318 | return ERR_PTR(-EINVAL); | 312 | return ERR_PTR(-EINVAL); |
319 | 313 | ||
@@ -401,7 +395,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) | |||
401 | altmap->alloc -= nr_pfns; | 395 | altmap->alloc -= nr_pfns; |
402 | } | 396 | } |
403 | 397 | ||
404 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | ||
405 | struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) | 398 | struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) |
406 | { | 399 | { |
407 | /* | 400 | /* |
@@ -427,5 +420,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) | |||
427 | 420 | ||
428 | return pgmap ? pgmap->altmap : NULL; | 421 | return pgmap ? pgmap->altmap : NULL; |
429 | } | 422 | } |
430 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | ||
431 | #endif /* CONFIG_ZONE_DEVICE */ | 423 | #endif /* CONFIG_ZONE_DEVICE */ |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d90df926b59f..9a0178c2ac1d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -1627,11 +1627,11 @@ static unsigned long minimum_image_size(unsigned long saveable) | |||
1627 | unsigned long size; | 1627 | unsigned long size; |
1628 | 1628 | ||
1629 | size = global_page_state(NR_SLAB_RECLAIMABLE) | 1629 | size = global_page_state(NR_SLAB_RECLAIMABLE) |
1630 | + global_page_state(NR_ACTIVE_ANON) | 1630 | + global_node_page_state(NR_ACTIVE_ANON) |
1631 | + global_page_state(NR_INACTIVE_ANON) | 1631 | + global_node_page_state(NR_INACTIVE_ANON) |
1632 | + global_page_state(NR_ACTIVE_FILE) | 1632 | + global_node_page_state(NR_ACTIVE_FILE) |
1633 | + global_page_state(NR_INACTIVE_FILE) | 1633 | + global_node_page_state(NR_INACTIVE_FILE) |
1634 | - global_page_state(NR_FILE_MAPPED); | 1634 | - global_node_page_state(NR_FILE_MAPPED); |
1635 | 1635 | ||
1636 | return saveable <= size ? 0 : saveable - size; | 1636 | return saveable <= size ? 0 : saveable - size; |
1637 | } | 1637 | } |
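minimum_image_size() keeps the same formula; only the LRU counters it reads move to the node-level API. A worked example of that arithmetic with invented page counts (the helper and its inputs are hypothetical stand-ins, not the kernel function):

    #include <stdio.h>

    /* All counts are in pages and purely illustrative. */
    static unsigned long minimum_image_size(unsigned long saveable,
                                            unsigned long slab_reclaimable,
                                            unsigned long active_anon,
                                            unsigned long inactive_anon,
                                            unsigned long active_file,
                                            unsigned long inactive_file,
                                            unsigned long file_mapped)
    {
        unsigned long size = slab_reclaimable + active_anon + inactive_anon +
                             active_file + inactive_file - file_mapped;

        return saveable <= size ? 0 : saveable - size;
    }

    int main(void)
    {
        /* 300000 saveable pages against 250000 reclaimable-ish pages
         * leaves a 50000-page minimum image. */
        printf("%lu\n", minimum_image_size(300000, 50000, 80000, 60000,
                                           40000, 30000, 10000));
        return 0;
    }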
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 60cdf6386763..d4de33934dac 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl) | |||
3177 | { | 3177 | { |
3178 | dump_stack_print_info(log_lvl); | 3178 | dump_stack_print_info(log_lvl); |
3179 | 3179 | ||
3180 | printk("%stask: %p ti: %p task.ti: %p\n", | 3180 | printk("%stask: %p task.stack: %p\n", |
3181 | log_lvl, current, current_thread_info(), | 3181 | log_lvl, current, task_stack_page(current)); |
3182 | task_thread_info(current)); | ||
3183 | } | 3182 | } |
3184 | 3183 | ||
3185 | #endif | 3184 | #endif |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 35f0dcb1cb4f..53954631a4e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1508,8 +1508,8 @@ static struct ctl_table vm_table[] = { | |||
1508 | #ifdef CONFIG_NUMA | 1508 | #ifdef CONFIG_NUMA |
1509 | { | 1509 | { |
1510 | .procname = "zone_reclaim_mode", | 1510 | .procname = "zone_reclaim_mode", |
1511 | .data = &zone_reclaim_mode, | 1511 | .data = &node_reclaim_mode, |
1512 | .maxlen = sizeof(zone_reclaim_mode), | 1512 | .maxlen = sizeof(node_reclaim_mode), |
1513 | .mode = 0644, | 1513 | .mode = 0644, |
1514 | .proc_handler = proc_dointvec, | 1514 | .proc_handler = proc_dointvec, |
1515 | .extra1 = &zero, | 1515 | .extra1 = &zero, |
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 67d8c6838ba9..bd38aab05929 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan | |||
@@ -5,9 +5,9 @@ if HAVE_ARCH_KASAN | |||
5 | 5 | ||
6 | config KASAN | 6 | config KASAN |
7 | bool "KASan: runtime memory debugger" | 7 | bool "KASan: runtime memory debugger" |
8 | depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB) | 8 | depends on SLUB || (SLAB && !DEBUG_SLAB) |
9 | select CONSTRUCTORS | 9 | select CONSTRUCTORS |
10 | select STACKDEPOT if SLAB | 10 | select STACKDEPOT |
11 | help | 11 | help |
12 | Enables kernel address sanitizer - runtime memory debugger, | 12 | Enables kernel address sanitizer - runtime memory debugger, |
13 | designed to find out-of-bounds accesses and use-after-free bugs. | 13 | designed to find out-of-bounds accesses and use-after-free bugs. |
diff --git a/lib/iov_iter.c b/lib/iov_iter.c index d67c8288d95d..9e8c7386b3a0 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c | |||
@@ -144,7 +144,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b | |||
144 | buf = iov->iov_base + skip; | 144 | buf = iov->iov_base + skip; |
145 | copy = min(bytes, iov->iov_len - skip); | 145 | copy = min(bytes, iov->iov_len - skip); |
146 | 146 | ||
147 | if (!fault_in_pages_writeable(buf, copy)) { | 147 | if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) { |
148 | kaddr = kmap_atomic(page); | 148 | kaddr = kmap_atomic(page); |
149 | from = kaddr + offset; | 149 | from = kaddr + offset; |
150 | 150 | ||
@@ -175,6 +175,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b | |||
175 | copy = min(bytes, iov->iov_len - skip); | 175 | copy = min(bytes, iov->iov_len - skip); |
176 | } | 176 | } |
177 | /* Too bad - revert to non-atomic kmap */ | 177 | /* Too bad - revert to non-atomic kmap */ |
178 | |||
178 | kaddr = kmap(page); | 179 | kaddr = kmap(page); |
179 | from = kaddr + offset; | 180 | from = kaddr + offset; |
180 | left = __copy_to_user(buf, from, copy); | 181 | left = __copy_to_user(buf, from, copy); |
@@ -193,6 +194,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b | |||
193 | bytes -= copy; | 194 | bytes -= copy; |
194 | } | 195 | } |
195 | kunmap(page); | 196 | kunmap(page); |
197 | |||
196 | done: | 198 | done: |
197 | if (skip == iov->iov_len) { | 199 | if (skip == iov->iov_len) { |
198 | iov++; | 200 | iov++; |
@@ -225,7 +227,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t | |||
225 | buf = iov->iov_base + skip; | 227 | buf = iov->iov_base + skip; |
226 | copy = min(bytes, iov->iov_len - skip); | 228 | copy = min(bytes, iov->iov_len - skip); |
227 | 229 | ||
228 | if (!fault_in_pages_readable(buf, copy)) { | 230 | if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) { |
229 | kaddr = kmap_atomic(page); | 231 | kaddr = kmap_atomic(page); |
230 | to = kaddr + offset; | 232 | to = kaddr + offset; |
231 | 233 | ||
@@ -256,6 +258,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t | |||
256 | copy = min(bytes, iov->iov_len - skip); | 258 | copy = min(bytes, iov->iov_len - skip); |
257 | } | 259 | } |
258 | /* Too bad - revert to non-atomic kmap */ | 260 | /* Too bad - revert to non-atomic kmap */ |
261 | |||
259 | kaddr = kmap(page); | 262 | kaddr = kmap(page); |
260 | to = kaddr + offset; | 263 | to = kaddr + offset; |
261 | left = __copy_from_user(to, buf, copy); | 264 | left = __copy_from_user(to, buf, copy); |
@@ -274,6 +277,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t | |||
274 | bytes -= copy; | 277 | bytes -= copy; |
275 | } | 278 | } |
276 | kunmap(page); | 279 | kunmap(page); |
280 | |||
277 | done: | 281 | done: |
278 | if (skip == iov->iov_len) { | 282 | if (skip == iov->iov_len) { |
279 | iov++; | 283 | iov++; |
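Wrapping the prefault checks in IS_ENABLED(CONFIG_HIGHMEM) lets non-highmem builds skip the user-buffer prefault and the atomic-kmap path entirely: there kmap() is effectively page_address(), and the plain __copy_to_user()/__copy_from_user() path can take faults on its own. A simplified sketch of the compile-time-folding pattern, with CONFIG_HIGHMEM modelled as an ordinary macro rather than the real kconfig machinery:

    #include <stdio.h>
    #include <stdbool.h>

    #define CONFIG_HIGHMEM 0    /* flip to 1 to model a highmem build */

    static bool prefault_ok(void)
    {
        puts("expensive prefault performed");
        return true;
    }

    static void copy_one_segment(void)
    {
        /* With CONFIG_HIGHMEM == 0 the compiler can drop the whole
         * prefault branch, mirroring the IS_ENABLED() change above. */
        if (CONFIG_HIGHMEM && prefault_ok()) {
            puts("atomic-kmap fast path");
            return;
        }
        puts("plain kmap path, faults handled by the copy itself");
    }

    int main(void)
    {
        copy_one_segment();
        return 0;
    }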
diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 53ad6c0831ae..60f77f1d470a 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c | |||
@@ -242,6 +242,7 @@ depot_stack_handle_t depot_save_stack(struct stack_trace *trace, | |||
242 | */ | 242 | */ |
243 | alloc_flags &= ~GFP_ZONEMASK; | 243 | alloc_flags &= ~GFP_ZONEMASK; |
244 | alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); | 244 | alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); |
245 | alloc_flags |= __GFP_NOWARN; | ||
245 | page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); | 246 | page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); |
246 | if (page) | 247 | if (page) |
247 | prealloc = page_address(page); | 248 | prealloc = page_address(page); |
diff --git a/mm/Kconfig b/mm/Kconfig index 3c81803b00a3..c0837845c17c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -681,7 +681,7 @@ config IDLE_PAGE_TRACKING | |||
681 | See Documentation/vm/idle_page_tracking.txt for more details. | 681 | See Documentation/vm/idle_page_tracking.txt for more details. |
682 | 682 | ||
683 | config ZONE_DEVICE | 683 | config ZONE_DEVICE |
684 | bool "Device memory (pmem, etc...) hotplug support" if EXPERT | 684 | bool "Device memory (pmem, etc...) hotplug support" |
685 | depends on MEMORY_HOTPLUG | 685 | depends on MEMORY_HOTPLUG |
686 | depends on MEMORY_HOTREMOVE | 686 | depends on MEMORY_HOTREMOVE |
687 | depends on SPARSEMEM_VMEMMAP | 687 | depends on SPARSEMEM_VMEMMAP |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ed173b8ae8f2..efe237742074 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -947,24 +947,24 @@ long congestion_wait(int sync, long timeout) | |||
947 | EXPORT_SYMBOL(congestion_wait); | 947 | EXPORT_SYMBOL(congestion_wait); |
948 | 948 | ||
949 | /** | 949 | /** |
950 | * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes | 950 | * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes |
951 | * @zone: A zone to check if it is heavily congested | 951 | * @pgdat: A pgdat to check if it is heavily congested |
952 | * @sync: SYNC or ASYNC IO | 952 | * @sync: SYNC or ASYNC IO |
953 | * @timeout: timeout in jiffies | 953 | * @timeout: timeout in jiffies |
954 | * | 954 | * |
955 | * In the event of a congested backing_dev (any backing_dev) and the given | 955 | * In the event of a congested backing_dev (any backing_dev) and the given |
956 | * @zone has experienced recent congestion, this waits for up to @timeout | 956 | * @pgdat has experienced recent congestion, this waits for up to @timeout |
957 | * jiffies for either a BDI to exit congestion of the given @sync queue | 957 | * jiffies for either a BDI to exit congestion of the given @sync queue |
958 | * or a write to complete. | 958 | * or a write to complete. |
959 | * | 959 | * |
960 | * In the absence of zone congestion, cond_resched() is called to yield | 960 | * In the absence of pgdat congestion, cond_resched() is called to yield |
961 | * the processor if necessary but otherwise does not sleep. | 961 | * the processor if necessary but otherwise does not sleep. |
962 | * | 962 | * |
963 | * The return value is 0 if the sleep is for the full timeout. Otherwise, | 963 | * The return value is 0 if the sleep is for the full timeout. Otherwise, |
964 | * it is the number of jiffies that were still remaining when the function | 964 | * it is the number of jiffies that were still remaining when the function |
965 | * returned. return_value == timeout implies the function did not sleep. | 965 | * returned. return_value == timeout implies the function did not sleep. |
966 | */ | 966 | */ |
967 | long wait_iff_congested(struct zone *zone, int sync, long timeout) | 967 | long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout) |
968 | { | 968 | { |
969 | long ret; | 969 | long ret; |
970 | unsigned long start = jiffies; | 970 | unsigned long start = jiffies; |
@@ -973,12 +973,13 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) | |||
973 | 973 | ||
974 | /* | 974 | /* |
975 | * If there is no congestion, or heavy congestion is not being | 975 | * If there is no congestion, or heavy congestion is not being |
976 | * encountered in the current zone, yield if necessary instead | 976 | * encountered in the current pgdat, yield if necessary instead |
977 | * of sleeping on the congestion queue | 977 | * of sleeping on the congestion queue |
978 | */ | 978 | */ |
979 | if (atomic_read(&nr_wb_congested[sync]) == 0 || | 979 | if (atomic_read(&nr_wb_congested[sync]) == 0 || |
980 | !test_bit(ZONE_CONGESTED, &zone->flags)) { | 980 | !test_bit(PGDAT_CONGESTED, &pgdat->flags)) { |
981 | cond_resched(); | 981 | cond_resched(); |
982 | |||
982 | /* In case we scheduled, work out time remaining */ | 983 | /* In case we scheduled, work out time remaining */ |
983 | ret = timeout - (jiffies - start); | 984 | ret = timeout - (jiffies - start); |
984 | if (ret < 0) | 985 | if (ret < 0) |
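wait_iff_congested() keeps its contract and only switches the congestion test from the zone to the pgdat. The tail of the hunk works out how much of the timeout is left after a possible reschedule; a small sketch of that bookkeeping, with jiffies modelled as a plain counter and the negative result presumably clamped to zero just past the lines shown:

    #include <stdio.h>

    /* Sketch: given a start timestamp, a timeout and the current time
     * (all in "jiffies"), return the time still remaining, clamped to 0.
     * Returning the full timeout means no sleep happened at all. */
    static long time_remaining(unsigned long start, long timeout,
                               unsigned long now)
    {
        long ret = timeout - (long)(now - start);

        return ret < 0 ? 0 : ret;
    }

    int main(void)
    {
        printf("%ld\n", time_remaining(1000, 100, 1000));  /* 100: no sleep */
        printf("%ld\n", time_remaining(1000, 100, 1030));  /* 70 remaining */
        printf("%ld\n", time_remaining(1000, 100, 1200));  /* 0: slept past */
        return 0;
    }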
diff --git a/mm/compaction.c b/mm/compaction.c index 64df5fe052db..9affb2908304 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -331,7 +331,7 @@ static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags, | |||
331 | { | 331 | { |
332 | if (cc->mode == MIGRATE_ASYNC) { | 332 | if (cc->mode == MIGRATE_ASYNC) { |
333 | if (!spin_trylock_irqsave(lock, *flags)) { | 333 | if (!spin_trylock_irqsave(lock, *flags)) { |
334 | cc->contended = COMPACT_CONTENDED_LOCK; | 334 | cc->contended = true; |
335 | return false; | 335 | return false; |
336 | } | 336 | } |
337 | } else { | 337 | } else { |
@@ -365,13 +365,13 @@ static bool compact_unlock_should_abort(spinlock_t *lock, | |||
365 | } | 365 | } |
366 | 366 | ||
367 | if (fatal_signal_pending(current)) { | 367 | if (fatal_signal_pending(current)) { |
368 | cc->contended = COMPACT_CONTENDED_SCHED; | 368 | cc->contended = true; |
369 | return true; | 369 | return true; |
370 | } | 370 | } |
371 | 371 | ||
372 | if (need_resched()) { | 372 | if (need_resched()) { |
373 | if (cc->mode == MIGRATE_ASYNC) { | 373 | if (cc->mode == MIGRATE_ASYNC) { |
374 | cc->contended = COMPACT_CONTENDED_SCHED; | 374 | cc->contended = true; |
375 | return true; | 375 | return true; |
376 | } | 376 | } |
377 | cond_resched(); | 377 | cond_resched(); |
@@ -394,7 +394,7 @@ static inline bool compact_should_abort(struct compact_control *cc) | |||
394 | /* async compaction aborts if contended */ | 394 | /* async compaction aborts if contended */ |
395 | if (need_resched()) { | 395 | if (need_resched()) { |
396 | if (cc->mode == MIGRATE_ASYNC) { | 396 | if (cc->mode == MIGRATE_ASYNC) { |
397 | cc->contended = COMPACT_CONTENDED_SCHED; | 397 | cc->contended = true; |
398 | return true; | 398 | return true; |
399 | } | 399 | } |
400 | 400 | ||
@@ -646,8 +646,8 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc) | |||
646 | list_for_each_entry(page, &cc->migratepages, lru) | 646 | list_for_each_entry(page, &cc->migratepages, lru) |
647 | count[!!page_is_file_cache(page)]++; | 647 | count[!!page_is_file_cache(page)]++; |
648 | 648 | ||
649 | mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); | 649 | mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]); |
650 | mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); | 650 | mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]); |
651 | } | 651 | } |
652 | 652 | ||
653 | /* Similar to reclaim, but different enough that they don't share logic */ | 653 | /* Similar to reclaim, but different enough that they don't share logic */ |
@@ -655,12 +655,12 @@ static bool too_many_isolated(struct zone *zone) | |||
655 | { | 655 | { |
656 | unsigned long active, inactive, isolated; | 656 | unsigned long active, inactive, isolated; |
657 | 657 | ||
658 | inactive = zone_page_state(zone, NR_INACTIVE_FILE) + | 658 | inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) + |
659 | zone_page_state(zone, NR_INACTIVE_ANON); | 659 | node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON); |
660 | active = zone_page_state(zone, NR_ACTIVE_FILE) + | 660 | active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) + |
661 | zone_page_state(zone, NR_ACTIVE_ANON); | 661 | node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON); |
662 | isolated = zone_page_state(zone, NR_ISOLATED_FILE) + | 662 | isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) + |
663 | zone_page_state(zone, NR_ISOLATED_ANON); | 663 | node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON); |
664 | 664 | ||
665 | return isolated > (inactive + active) / 2; | 665 | return isolated > (inactive + active) / 2; |
666 | } | 666 | } |
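too_many_isolated() now sums the node-level LRU counters, but the back-off threshold is unchanged: isolation stops once isolated pages exceed half of the active plus inactive pages. A worked example of the check with invented counts:

    #include <stdio.h>
    #include <stdbool.h>

    /* Counts are in pages and invented for illustration. */
    static bool too_many_isolated(unsigned long inactive, unsigned long active,
                                  unsigned long isolated)
    {
        return isolated > (inactive + active) / 2;
    }

    int main(void)
    {
        /* 10000 LRU pages: 4000 isolated is fine, 6000 trips the check. */
        printf("%d\n", too_many_isolated(6000, 4000, 4000));    /* 0 */
        printf("%d\n", too_many_isolated(6000, 4000, 6000));    /* 1 */
        return 0;
    }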
@@ -752,7 +752,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
752 | * if contended. | 752 | * if contended. |
753 | */ | 753 | */ |
754 | if (!(low_pfn % SWAP_CLUSTER_MAX) | 754 | if (!(low_pfn % SWAP_CLUSTER_MAX) |
755 | && compact_unlock_should_abort(&zone->lru_lock, flags, | 755 | && compact_unlock_should_abort(zone_lru_lock(zone), flags, |
756 | &locked, cc)) | 756 | &locked, cc)) |
757 | break; | 757 | break; |
758 | 758 | ||
@@ -813,7 +813,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
813 | if (unlikely(__PageMovable(page)) && | 813 | if (unlikely(__PageMovable(page)) && |
814 | !PageIsolated(page)) { | 814 | !PageIsolated(page)) { |
815 | if (locked) { | 815 | if (locked) { |
816 | spin_unlock_irqrestore(&zone->lru_lock, | 816 | spin_unlock_irqrestore(zone_lru_lock(zone), |
817 | flags); | 817 | flags); |
818 | locked = false; | 818 | locked = false; |
819 | } | 819 | } |
@@ -836,7 +836,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
836 | 836 | ||
837 | /* If we already hold the lock, we can skip some rechecking */ | 837 | /* If we already hold the lock, we can skip some rechecking */ |
838 | if (!locked) { | 838 | if (!locked) { |
839 | locked = compact_trylock_irqsave(&zone->lru_lock, | 839 | locked = compact_trylock_irqsave(zone_lru_lock(zone), |
840 | &flags, cc); | 840 | &flags, cc); |
841 | if (!locked) | 841 | if (!locked) |
842 | break; | 842 | break; |
@@ -856,7 +856,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
856 | } | 856 | } |
857 | } | 857 | } |
858 | 858 | ||
859 | lruvec = mem_cgroup_page_lruvec(page, zone); | 859 | lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); |
860 | 860 | ||
861 | /* Try isolate the page */ | 861 | /* Try isolate the page */ |
862 | if (__isolate_lru_page(page, isolate_mode) != 0) | 862 | if (__isolate_lru_page(page, isolate_mode) != 0) |
@@ -899,7 +899,7 @@ isolate_fail: | |||
899 | */ | 899 | */ |
900 | if (nr_isolated) { | 900 | if (nr_isolated) { |
901 | if (locked) { | 901 | if (locked) { |
902 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 902 | spin_unlock_irqrestore(zone_lru_lock(zone), flags); |
903 | locked = false; | 903 | locked = false; |
904 | } | 904 | } |
905 | acct_isolated(zone, cc); | 905 | acct_isolated(zone, cc); |
@@ -927,7 +927,7 @@ isolate_fail: | |||
927 | low_pfn = end_pfn; | 927 | low_pfn = end_pfn; |
928 | 928 | ||
929 | if (locked) | 929 | if (locked) |
930 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 930 | spin_unlock_irqrestore(zone_lru_lock(zone), flags); |
931 | 931 | ||
932 | /* | 932 | /* |
933 | * Update the pageblock-skip information and cached scanner pfn, | 933 | * Update the pageblock-skip information and cached scanner pfn, |
@@ -1200,7 +1200,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
1200 | struct page *page; | 1200 | struct page *page; |
1201 | const isolate_mode_t isolate_mode = | 1201 | const isolate_mode_t isolate_mode = |
1202 | (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | | 1202 | (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | |
1203 | (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); | 1203 | (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0); |
1204 | 1204 | ||
1205 | /* | 1205 | /* |
1206 | * Start at where we last stopped, or beginning of the zone as | 1206 | * Start at where we last stopped, or beginning of the zone as |
@@ -1619,14 +1619,11 @@ out: | |||
1619 | trace_mm_compaction_end(start_pfn, cc->migrate_pfn, | 1619 | trace_mm_compaction_end(start_pfn, cc->migrate_pfn, |
1620 | cc->free_pfn, end_pfn, sync, ret); | 1620 | cc->free_pfn, end_pfn, sync, ret); |
1621 | 1621 | ||
1622 | if (ret == COMPACT_CONTENDED) | ||
1623 | ret = COMPACT_PARTIAL; | ||
1624 | |||
1625 | return ret; | 1622 | return ret; |
1626 | } | 1623 | } |
1627 | 1624 | ||
1628 | static enum compact_result compact_zone_order(struct zone *zone, int order, | 1625 | static enum compact_result compact_zone_order(struct zone *zone, int order, |
1629 | gfp_t gfp_mask, enum migrate_mode mode, int *contended, | 1626 | gfp_t gfp_mask, enum compact_priority prio, |
1630 | unsigned int alloc_flags, int classzone_idx) | 1627 | unsigned int alloc_flags, int classzone_idx) |
1631 | { | 1628 | { |
1632 | enum compact_result ret; | 1629 | enum compact_result ret; |
@@ -1636,7 +1633,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, | |||
1636 | .order = order, | 1633 | .order = order, |
1637 | .gfp_mask = gfp_mask, | 1634 | .gfp_mask = gfp_mask, |
1638 | .zone = zone, | 1635 | .zone = zone, |
1639 | .mode = mode, | 1636 | .mode = (prio == COMPACT_PRIO_ASYNC) ? |
1637 | MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT, | ||
1640 | .alloc_flags = alloc_flags, | 1638 | .alloc_flags = alloc_flags, |
1641 | .classzone_idx = classzone_idx, | 1639 | .classzone_idx = classzone_idx, |
1642 | .direct_compaction = true, | 1640 | .direct_compaction = true, |
@@ -1649,7 +1647,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, | |||
1649 | VM_BUG_ON(!list_empty(&cc.freepages)); | 1647 | VM_BUG_ON(!list_empty(&cc.freepages)); |
1650 | VM_BUG_ON(!list_empty(&cc.migratepages)); | 1648 | VM_BUG_ON(!list_empty(&cc.migratepages)); |
1651 | 1649 | ||
1652 | *contended = cc.contended; | ||
1653 | return ret; | 1650 | return ret; |
1654 | } | 1651 | } |
1655 | 1652 | ||
@@ -1662,50 +1659,38 @@ int sysctl_extfrag_threshold = 500; | |||
1662 | * @alloc_flags: The allocation flags of the current allocation | 1659 | * @alloc_flags: The allocation flags of the current allocation |
1663 | * @ac: The context of current allocation | 1660 | * @ac: The context of current allocation |
1664 | * @mode: The migration mode for async, sync light, or sync migration | 1661 | * @mode: The migration mode for async, sync light, or sync migration |
1665 | * @contended: Return value that determines if compaction was aborted due to | ||
1666 | * need_resched() or lock contention | ||
1667 | * | 1662 | * |
1668 | * This is the main entry point for direct page compaction. | 1663 | * This is the main entry point for direct page compaction. |
1669 | */ | 1664 | */ |
1670 | enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, | 1665 | enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, |
1671 | unsigned int alloc_flags, const struct alloc_context *ac, | 1666 | unsigned int alloc_flags, const struct alloc_context *ac, |
1672 | enum migrate_mode mode, int *contended) | 1667 | enum compact_priority prio) |
1673 | { | 1668 | { |
1674 | int may_enter_fs = gfp_mask & __GFP_FS; | 1669 | int may_enter_fs = gfp_mask & __GFP_FS; |
1675 | int may_perform_io = gfp_mask & __GFP_IO; | 1670 | int may_perform_io = gfp_mask & __GFP_IO; |
1676 | struct zoneref *z; | 1671 | struct zoneref *z; |
1677 | struct zone *zone; | 1672 | struct zone *zone; |
1678 | enum compact_result rc = COMPACT_SKIPPED; | 1673 | enum compact_result rc = COMPACT_SKIPPED; |
1679 | int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ | ||
1680 | |||
1681 | *contended = COMPACT_CONTENDED_NONE; | ||
1682 | 1674 | ||
1683 | /* Check if the GFP flags allow compaction */ | 1675 | /* Check if the GFP flags allow compaction */ |
1684 | if (!order || !may_enter_fs || !may_perform_io) | 1676 | if (!may_enter_fs || !may_perform_io) |
1685 | return COMPACT_SKIPPED; | 1677 | return COMPACT_SKIPPED; |
1686 | 1678 | ||
1687 | trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); | 1679 | trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); |
1688 | 1680 | ||
1689 | /* Compact each zone in the list */ | 1681 | /* Compact each zone in the list */ |
1690 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, | 1682 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, |
1691 | ac->nodemask) { | 1683 | ac->nodemask) { |
1692 | enum compact_result status; | 1684 | enum compact_result status; |
1693 | int zone_contended; | ||
1694 | 1685 | ||
1695 | if (compaction_deferred(zone, order)) { | 1686 | if (compaction_deferred(zone, order)) { |
1696 | rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); | 1687 | rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); |
1697 | continue; | 1688 | continue; |
1698 | } | 1689 | } |
1699 | 1690 | ||
1700 | status = compact_zone_order(zone, order, gfp_mask, mode, | 1691 | status = compact_zone_order(zone, order, gfp_mask, prio, |
1701 | &zone_contended, alloc_flags, | 1692 | alloc_flags, ac_classzone_idx(ac)); |
1702 | ac_classzone_idx(ac)); | ||
1703 | rc = max(status, rc); | 1693 | rc = max(status, rc); |
1704 | /* | ||
1705 | * It takes at least one zone that wasn't lock contended | ||
1706 | * to clear all_zones_contended. | ||
1707 | */ | ||
1708 | all_zones_contended &= zone_contended; | ||
1709 | 1694 | ||
1710 | /* If a normal allocation would succeed, stop compacting */ | 1695 | /* If a normal allocation would succeed, stop compacting */ |
1711 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), | 1696 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), |
@@ -1717,59 +1702,29 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, | |||
1717 | * succeeds in this zone. | 1702 | * succeeds in this zone. |
1718 | */ | 1703 | */ |
1719 | compaction_defer_reset(zone, order, false); | 1704 | compaction_defer_reset(zone, order, false); |
1720 | /* | ||
1721 | * It is possible that async compaction aborted due to | ||
1722 | * need_resched() and the watermarks were ok thanks to | ||
1723 | * somebody else freeing memory. The allocation can | ||
1724 | * however still fail so we better signal the | ||
1725 | * need_resched() contention anyway (this will not | ||
1726 | * prevent the allocation attempt). | ||
1727 | */ | ||
1728 | if (zone_contended == COMPACT_CONTENDED_SCHED) | ||
1729 | *contended = COMPACT_CONTENDED_SCHED; | ||
1730 | 1705 | ||
1731 | goto break_loop; | 1706 | break; |
1732 | } | 1707 | } |
1733 | 1708 | ||
1734 | if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE || | 1709 | if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE || |
1735 | status == COMPACT_PARTIAL_SKIPPED)) { | 1710 | status == COMPACT_PARTIAL_SKIPPED)) |
1736 | /* | 1711 | /* |
1737 | * We think that allocation won't succeed in this zone | 1712 | * We think that allocation won't succeed in this zone |
1738 | * so we defer compaction there. If it ends up | 1713 | * so we defer compaction there. If it ends up |
1739 | * succeeding after all, it will be reset. | 1714 | * succeeding after all, it will be reset. |
1740 | */ | 1715 | */ |
1741 | defer_compaction(zone, order); | 1716 | defer_compaction(zone, order); |
1742 | } | ||
1743 | 1717 | ||
1744 | /* | 1718 | /* |
1745 | * We might have stopped compacting due to need_resched() in | 1719 | * We might have stopped compacting due to need_resched() in |
1746 | * async compaction, or due to a fatal signal detected. In that | 1720 | * async compaction, or due to a fatal signal detected. In that |
1747 | * case do not try further zones and signal need_resched() | 1721 | * case do not try further zones |
1748 | * contention. | ||
1749 | */ | ||
1750 | if ((zone_contended == COMPACT_CONTENDED_SCHED) | ||
1751 | || fatal_signal_pending(current)) { | ||
1752 | *contended = COMPACT_CONTENDED_SCHED; | ||
1753 | goto break_loop; | ||
1754 | } | ||
1755 | |||
1756 | continue; | ||
1757 | break_loop: | ||
1758 | /* | ||
1759 | * We might not have tried all the zones, so be conservative | ||
1760 | * and assume they are not all lock contended. | ||
1761 | */ | 1722 | */ |
1762 | all_zones_contended = 0; | 1723 | if ((prio == COMPACT_PRIO_ASYNC && need_resched()) |
1763 | break; | 1724 | || fatal_signal_pending(current)) |
1725 | break; | ||
1764 | } | 1726 | } |
1765 | 1727 | ||
1766 | /* | ||
1767 | * If at least one zone wasn't deferred or skipped, we report if all | ||
1768 | * zones that were tried were lock contended. | ||
1769 | */ | ||
1770 | if (rc > COMPACT_INACTIVE && all_zones_contended) | ||
1771 | *contended = COMPACT_CONTENDED_LOCK; | ||
1772 | |||
1773 | return rc; | 1728 | return rc; |
1774 | } | 1729 | } |
1775 | 1730 | ||
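The compaction rework above replaces the tri-state contended tracking with a plain bool and has callers pass a compact_priority, from which compact_zone_order() derives the migrate mode. A sketch of that mapping with stand-in enums (the names mirror the kernel ones but are redefined here purely for illustration):

    #include <stdio.h>

    enum compact_priority { COMPACT_PRIO_SYNC_LIGHT, COMPACT_PRIO_ASYNC };
    enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

    /* The whole contended-tracking machinery collapses into: pick the
     * migration mode from the requested priority. */
    static enum migrate_mode mode_for_prio(enum compact_priority prio)
    {
        return prio == COMPACT_PRIO_ASYNC ? MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT;
    }

    int main(void)
    {
        printf("async prio -> mode %d\n", mode_for_prio(COMPACT_PRIO_ASYNC));
        printf("light prio -> mode %d\n", mode_for_prio(COMPACT_PRIO_SYNC_LIGHT));
        return 0;
    }

Per the hunk above, async priority maps to MIGRATE_ASYNC and any other priority to MIGRATE_SYNC_LIGHT for direct compaction.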
diff --git a/mm/filemap.c b/mm/filemap.c index e90c1543ec2d..c5f5e46c6f7f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -95,8 +95,8 @@ | |||
95 | * ->swap_lock (try_to_unmap_one) | 95 | * ->swap_lock (try_to_unmap_one) |
96 | * ->private_lock (try_to_unmap_one) | 96 | * ->private_lock (try_to_unmap_one) |
97 | * ->tree_lock (try_to_unmap_one) | 97 | * ->tree_lock (try_to_unmap_one) |
98 | * ->zone.lru_lock (follow_page->mark_page_accessed) | 98 | * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) |
99 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) | 99 | * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) |
100 | * ->private_lock (page_remove_rmap->set_page_dirty) | 100 | * ->private_lock (page_remove_rmap->set_page_dirty) |
101 | * ->tree_lock (page_remove_rmap->set_page_dirty) | 101 | * ->tree_lock (page_remove_rmap->set_page_dirty) |
102 | * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) | 102 | * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) |
@@ -218,11 +218,11 @@ void __delete_from_page_cache(struct page *page, void *shadow) | |||
218 | 218 | ||
219 | /* hugetlb pages do not participate in page cache accounting. */ | 219 | /* hugetlb pages do not participate in page cache accounting. */ |
220 | if (!PageHuge(page)) | 220 | if (!PageHuge(page)) |
221 | __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, -nr); | 221 | __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); |
222 | if (PageSwapBacked(page)) { | 222 | if (PageSwapBacked(page)) { |
223 | __mod_zone_page_state(page_zone(page), NR_SHMEM, -nr); | 223 | __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); |
224 | if (PageTransHuge(page)) | 224 | if (PageTransHuge(page)) |
225 | __dec_zone_page_state(page, NR_SHMEM_THPS); | 225 | __dec_node_page_state(page, NR_SHMEM_THPS); |
226 | } else { | 226 | } else { |
227 | VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page); | 227 | VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page); |
228 | } | 228 | } |
@@ -568,9 +568,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | |||
568 | * hugetlb pages do not participate in page cache accounting. | 568 | * hugetlb pages do not participate in page cache accounting. |
569 | */ | 569 | */ |
570 | if (!PageHuge(new)) | 570 | if (!PageHuge(new)) |
571 | __inc_zone_page_state(new, NR_FILE_PAGES); | 571 | __inc_node_page_state(new, NR_FILE_PAGES); |
572 | if (PageSwapBacked(new)) | 572 | if (PageSwapBacked(new)) |
573 | __inc_zone_page_state(new, NR_SHMEM); | 573 | __inc_node_page_state(new, NR_SHMEM); |
574 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | 574 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
575 | mem_cgroup_migrate(old, new); | 575 | mem_cgroup_migrate(old, new); |
576 | radix_tree_preload_end(); | 576 | radix_tree_preload_end(); |
@@ -677,7 +677,7 @@ static int __add_to_page_cache_locked(struct page *page, | |||
677 | 677 | ||
678 | /* hugetlb pages do not participate in page cache accounting. */ | 678 | /* hugetlb pages do not participate in page cache accounting. */ |
679 | if (!huge) | 679 | if (!huge) |
680 | __inc_zone_page_state(page, NR_FILE_PAGES); | 680 | __inc_node_page_state(page, NR_FILE_PAGES); |
681 | spin_unlock_irq(&mapping->tree_lock); | 681 | spin_unlock_irq(&mapping->tree_lock); |
682 | if (!huge) | 682 | if (!huge) |
683 | mem_cgroup_commit_charge(page, memcg, false, false); | 683 | mem_cgroup_commit_charge(page, memcg, false, false); |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3647334c2ef9..2373f0a7d340 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -539,23 +539,26 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, | |||
539 | } | 539 | } |
540 | 540 | ||
541 | /* | 541 | /* |
542 | * If THP is set to always then directly reclaim/compact as necessary | 542 | * If THP defrag is set to always then directly reclaim/compact as necessary |
543 | * If set to defer then do no reclaim and defer to khugepaged | 543 | * If set to defer then do only background reclaim/compact and defer to khugepaged |
544 | * If set to madvise and the VMA is flagged then directly reclaim/compact | 544 | * If set to madvise and the VMA is flagged then directly reclaim/compact |
545 | * When direct reclaim/compact is allowed, don't retry except for flagged VMA's | ||
545 | */ | 546 | */ |
546 | static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) | 547 | static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) |
547 | { | 548 | { |
548 | gfp_t reclaim_flags = 0; | 549 | bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); |
549 | 550 | ||
550 | if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) && | 551 | if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, |
551 | (vma->vm_flags & VM_HUGEPAGE)) | 552 | &transparent_hugepage_flags) && vma_madvised) |
552 | reclaim_flags = __GFP_DIRECT_RECLAIM; | 553 | return GFP_TRANSHUGE; |
553 | else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) | 554 | else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, |
554 | reclaim_flags = __GFP_KSWAPD_RECLAIM; | 555 | &transparent_hugepage_flags)) |
555 | else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) | 556 | return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; |
556 | reclaim_flags = __GFP_DIRECT_RECLAIM; | 557 | else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, |
557 | 558 | &transparent_hugepage_flags)) | |
558 | return GFP_TRANSHUGE | reclaim_flags; | 559 | return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); |
560 | |||
561 | return GFP_TRANSHUGE_LIGHT; | ||
559 | } | 562 | } |
560 | 563 | ||
561 | /* Caller must hold page table lock. */ | 564 | /* Caller must hold page table lock. */ |
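The rewritten alloc_hugepage_direct_gfpmask() reads as a small decision table over the defrag sysfs setting and whether the VMA was madvised. A sketch of that table with the sysfs state reduced to three booleans and stand-in GFP bit values (the real macros differ):

    #include <stdio.h>
    #include <stdbool.h>

    /* Stand-in flag bits, for illustration only. */
    #define GFP_TRANSHUGE_LIGHT     0x1
    #define __GFP_KSWAPD_RECLAIM    0x2
    #define __GFP_DIRECT_RECLAIM    0x4
    #define __GFP_NORETRY           0x8
    #define GFP_TRANSHUGE           (GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | \
                                     __GFP_DIRECT_RECLAIM)

    /* defrag = madvise / defer / always, mapped onto the three flag tests
     * in the same order as the hunk above. */
    static unsigned int thp_gfp(bool defrag_madvise, bool defrag_defer,
                                bool defrag_always, bool vma_madvised)
    {
        if (defrag_madvise && vma_madvised)
            return GFP_TRANSHUGE;
        if (defrag_defer)
            return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
        if (defrag_always)
            return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
        return GFP_TRANSHUGE_LIGHT;
    }

    int main(void)
    {
        printf("always, not madvised: %#x\n", thp_gfp(false, false, true, false));
        printf("madvise + madvised:   %#x\n", thp_gfp(true, false, false, true));
        return 0;
    }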
@@ -1249,25 +1252,26 @@ out: | |||
1249 | return 0; | 1252 | return 0; |
1250 | } | 1253 | } |
1251 | 1254 | ||
1252 | int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1255 | /* |
1256 | * Return true if we do MADV_FREE successfully on entire pmd page. | ||
1257 | * Otherwise, return false. | ||
1258 | */ | ||
1259 | bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | ||
1253 | pmd_t *pmd, unsigned long addr, unsigned long next) | 1260 | pmd_t *pmd, unsigned long addr, unsigned long next) |
1254 | |||
1255 | { | 1261 | { |
1256 | spinlock_t *ptl; | 1262 | spinlock_t *ptl; |
1257 | pmd_t orig_pmd; | 1263 | pmd_t orig_pmd; |
1258 | struct page *page; | 1264 | struct page *page; |
1259 | struct mm_struct *mm = tlb->mm; | 1265 | struct mm_struct *mm = tlb->mm; |
1260 | int ret = 0; | 1266 | bool ret = false; |
1261 | 1267 | ||
1262 | ptl = pmd_trans_huge_lock(pmd, vma); | 1268 | ptl = pmd_trans_huge_lock(pmd, vma); |
1263 | if (!ptl) | 1269 | if (!ptl) |
1264 | goto out_unlocked; | 1270 | goto out_unlocked; |
1265 | 1271 | ||
1266 | orig_pmd = *pmd; | 1272 | orig_pmd = *pmd; |
1267 | if (is_huge_zero_pmd(orig_pmd)) { | 1273 | if (is_huge_zero_pmd(orig_pmd)) |
1268 | ret = 1; | ||
1269 | goto out; | 1274 | goto out; |
1270 | } | ||
1271 | 1275 | ||
1272 | page = pmd_page(orig_pmd); | 1276 | page = pmd_page(orig_pmd); |
1273 | /* | 1277 | /* |
@@ -1309,7 +1313,7 @@ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1309 | set_pmd_at(mm, addr, pmd, orig_pmd); | 1313 | set_pmd_at(mm, addr, pmd, orig_pmd); |
1310 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1314 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1311 | } | 1315 | } |
1312 | ret = 1; | 1316 | ret = true; |
1313 | out: | 1317 | out: |
1314 | spin_unlock(ptl); | 1318 | spin_unlock(ptl); |
1315 | out_unlocked: | 1319 | out_unlocked: |
@@ -1586,7 +1590,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, | |||
1586 | 1590 | ||
1587 | if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { | 1591 | if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { |
1588 | /* Last compound_mapcount is gone. */ | 1592 | /* Last compound_mapcount is gone. */ |
1589 | __dec_zone_page_state(page, NR_ANON_THPS); | 1593 | __dec_node_page_state(page, NR_ANON_THPS); |
1590 | if (TestClearPageDoubleMap(page)) { | 1594 | if (TestClearPageDoubleMap(page)) { |
1591 | /* No need in mapcount reference anymore */ | 1595 | /* No need in mapcount reference anymore */ |
1592 | for (i = 0; i < HPAGE_PMD_NR; i++) | 1596 | for (i = 0; i < HPAGE_PMD_NR; i++) |
@@ -1818,7 +1822,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, | |||
1818 | pgoff_t end = -1; | 1822 | pgoff_t end = -1; |
1819 | int i; | 1823 | int i; |
1820 | 1824 | ||
1821 | lruvec = mem_cgroup_page_lruvec(head, zone); | 1825 | lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); |
1822 | 1826 | ||
1823 | /* complete memcg works before add pages to LRU */ | 1827 | /* complete memcg works before add pages to LRU */ |
1824 | mem_cgroup_split_huge_fixup(head); | 1828 | mem_cgroup_split_huge_fixup(head); |
@@ -1848,7 +1852,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, | |||
1848 | spin_unlock(&head->mapping->tree_lock); | 1852 | spin_unlock(&head->mapping->tree_lock); |
1849 | } | 1853 | } |
1850 | 1854 | ||
1851 | spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags); | 1855 | spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); |
1852 | 1856 | ||
1853 | unfreeze_page(head); | 1857 | unfreeze_page(head); |
1854 | 1858 | ||
@@ -2034,7 +2038,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |||
2034 | lru_add_drain(); | 2038 | lru_add_drain(); |
2035 | 2039 | ||
2036 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | 2040 | /* prevent PageLRU to go away from under us, and freeze lru stats */ |
2037 | spin_lock_irqsave(&page_zone(head)->lru_lock, flags); | 2041 | spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); |
2038 | 2042 | ||
2039 | if (mapping) { | 2043 | if (mapping) { |
2040 | void **pslot; | 2044 | void **pslot; |
@@ -2061,7 +2065,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |||
2061 | list_del(page_deferred_list(head)); | 2065 | list_del(page_deferred_list(head)); |
2062 | } | 2066 | } |
2063 | if (mapping) | 2067 | if (mapping) |
2064 | __dec_zone_page_state(page, NR_SHMEM_THPS); | 2068 | __dec_node_page_state(page, NR_SHMEM_THPS); |
2065 | spin_unlock(&pgdata->split_queue_lock); | 2069 | spin_unlock(&pgdata->split_queue_lock); |
2066 | __split_huge_page(page, list, flags); | 2070 | __split_huge_page(page, list, flags); |
2067 | ret = 0; | 2071 | ret = 0; |
@@ -2077,7 +2081,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |||
2077 | spin_unlock(&pgdata->split_queue_lock); | 2081 | spin_unlock(&pgdata->split_queue_lock); |
2078 | fail: if (mapping) | 2082 | fail: if (mapping) |
2079 | spin_unlock(&mapping->tree_lock); | 2083 | spin_unlock(&mapping->tree_lock); |
2080 | spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags); | 2084 | spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); |
2081 | unfreeze_page(head); | 2085 | unfreeze_page(head); |
2082 | ret = -EBUSY; | 2086 | ret = -EBUSY; |
2083 | } | 2087 | } |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 51a04e5e9373..f904246a8fd5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -4391,7 +4391,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, | |||
4391 | 4391 | ||
4392 | /* | 4392 | /* |
4393 | * This function is called from memory failure code. | 4393 | * This function is called from memory failure code. |
4394 | * Assume the caller holds page lock of the head page. | ||
4395 | */ | 4394 | */ |
4396 | int dequeue_hwpoisoned_huge_page(struct page *hpage) | 4395 | int dequeue_hwpoisoned_huge_page(struct page *hpage) |
4397 | { | 4396 | { |
diff --git a/mm/internal.h b/mm/internal.h index 9b6a6c43ac39..1501304f87a4 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -78,7 +78,7 @@ extern unsigned long highest_memmap_pfn; | |||
78 | */ | 78 | */ |
79 | extern int isolate_lru_page(struct page *page); | 79 | extern int isolate_lru_page(struct page *page); |
80 | extern void putback_lru_page(struct page *page); | 80 | extern void putback_lru_page(struct page *page); |
81 | extern bool zone_reclaimable(struct zone *zone); | 81 | extern bool pgdat_reclaimable(struct pglist_data *pgdat); |
82 | 82 | ||
83 | /* | 83 | /* |
84 | * in mm/rmap.c: | 84 | * in mm/rmap.c: |
@@ -185,10 +185,7 @@ struct compact_control { | |||
185 | const unsigned int alloc_flags; /* alloc flags of a direct compactor */ | 185 | const unsigned int alloc_flags; /* alloc flags of a direct compactor */ |
186 | const int classzone_idx; /* zone index of a direct compactor */ | 186 | const int classzone_idx; /* zone index of a direct compactor */ |
187 | struct zone *zone; | 187 | struct zone *zone; |
188 | int contended; /* Signal need_sched() or lock | 188 | bool contended; /* Signal lock or sched contention */ |
189 | * contention detected during | ||
190 | * compaction | ||
191 | */ | ||
192 | }; | 189 | }; |
193 | 190 | ||
194 | unsigned long | 191 | unsigned long |
@@ -433,10 +430,10 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | |||
433 | } | 430 | } |
434 | #endif /* CONFIG_SPARSEMEM */ | 431 | #endif /* CONFIG_SPARSEMEM */ |
435 | 432 | ||
436 | #define ZONE_RECLAIM_NOSCAN -2 | 433 | #define NODE_RECLAIM_NOSCAN -2 |
437 | #define ZONE_RECLAIM_FULL -1 | 434 | #define NODE_RECLAIM_FULL -1 |
438 | #define ZONE_RECLAIM_SOME 0 | 435 | #define NODE_RECLAIM_SOME 0 |
439 | #define ZONE_RECLAIM_SUCCESS 1 | 436 | #define NODE_RECLAIM_SUCCESS 1 |
440 | 437 | ||
441 | extern int hwpoison_filter(struct page *p); | 438 | extern int hwpoison_filter(struct page *p); |
442 | 439 | ||
@@ -467,7 +464,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
467 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 464 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
468 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 465 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
469 | #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ | 466 | #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ |
470 | #define ALLOC_FAIR 0x100 /* fair zone allocation */ | ||
471 | 467 | ||
472 | enum ttu_flags; | 468 | enum ttu_flags; |
473 | struct tlbflush_unmap_batch; | 469 | struct tlbflush_unmap_batch; |
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 1548749a3d45..2976a9ee104f 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile | |||
@@ -7,5 +7,4 @@ CFLAGS_REMOVE_kasan.o = -pg | |||
7 | # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 | 7 | # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 |
8 | CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) | 8 | CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) |
9 | 9 | ||
10 | obj-y := kasan.o report.o kasan_init.o | 10 | obj-y := kasan.o report.o kasan_init.o quarantine.o |
11 | obj-$(CONFIG_SLAB) += quarantine.o | ||
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 6845f9294696..b6f99e81bfeb 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c | |||
@@ -351,7 +351,6 @@ void kasan_free_pages(struct page *page, unsigned int order) | |||
351 | KASAN_FREE_PAGE); | 351 | KASAN_FREE_PAGE); |
352 | } | 352 | } |
353 | 353 | ||
354 | #ifdef CONFIG_SLAB | ||
355 | /* | 354 | /* |
356 | * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. | 355 | * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. |
357 | * For larger allocations larger redzones are used. | 356 | * For larger allocations larger redzones are used. |
@@ -373,16 +372,8 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, | |||
373 | unsigned long *flags) | 372 | unsigned long *flags) |
374 | { | 373 | { |
375 | int redzone_adjust; | 374 | int redzone_adjust; |
376 | /* Make sure the adjusted size is still less than | 375 | int orig_size = *size; |
377 | * KMALLOC_MAX_CACHE_SIZE. | 376 | |
378 | * TODO: this check is only useful for SLAB, but not SLUB. We'll need | ||
379 | * to skip it for SLUB when it starts using kasan_cache_create(). | ||
380 | */ | ||
381 | if (*size > KMALLOC_MAX_CACHE_SIZE - | ||
382 | sizeof(struct kasan_alloc_meta) - | ||
383 | sizeof(struct kasan_free_meta)) | ||
384 | return; | ||
385 | *flags |= SLAB_KASAN; | ||
386 | /* Add alloc meta. */ | 377 | /* Add alloc meta. */ |
387 | cache->kasan_info.alloc_meta_offset = *size; | 378 | cache->kasan_info.alloc_meta_offset = *size; |
388 | *size += sizeof(struct kasan_alloc_meta); | 379 | *size += sizeof(struct kasan_alloc_meta); |
@@ -395,14 +386,26 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, | |||
395 | } | 386 | } |
396 | redzone_adjust = optimal_redzone(cache->object_size) - | 387 | redzone_adjust = optimal_redzone(cache->object_size) - |
397 | (*size - cache->object_size); | 388 | (*size - cache->object_size); |
389 | |||
398 | if (redzone_adjust > 0) | 390 | if (redzone_adjust > 0) |
399 | *size += redzone_adjust; | 391 | *size += redzone_adjust; |
400 | *size = min(KMALLOC_MAX_CACHE_SIZE, | 392 | |
401 | max(*size, | 393 | *size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size + |
402 | cache->object_size + | 394 | optimal_redzone(cache->object_size))); |
403 | optimal_redzone(cache->object_size))); | 395 | |
396 | /* | ||
397 | * If the metadata doesn't fit, don't enable KASAN at all. | ||
398 | */ | ||
399 | if (*size <= cache->kasan_info.alloc_meta_offset || | ||
400 | *size <= cache->kasan_info.free_meta_offset) { | ||
401 | cache->kasan_info.alloc_meta_offset = 0; | ||
402 | cache->kasan_info.free_meta_offset = 0; | ||
403 | *size = orig_size; | ||
404 | return; | ||
405 | } | ||
406 | |||
407 | *flags |= SLAB_KASAN; | ||
404 | } | 408 | } |
405 | #endif | ||
406 | 409 | ||
407 | void kasan_cache_shrink(struct kmem_cache *cache) | 410 | void kasan_cache_shrink(struct kmem_cache *cache) |
408 | { | 411 | { |
@@ -414,6 +417,14 @@ void kasan_cache_destroy(struct kmem_cache *cache) | |||
414 | quarantine_remove_cache(cache); | 417 | quarantine_remove_cache(cache); |
415 | } | 418 | } |
416 | 419 | ||
420 | size_t kasan_metadata_size(struct kmem_cache *cache) | ||
421 | { | ||
422 | return (cache->kasan_info.alloc_meta_offset ? | ||
423 | sizeof(struct kasan_alloc_meta) : 0) + | ||
424 | (cache->kasan_info.free_meta_offset ? | ||
425 | sizeof(struct kasan_free_meta) : 0); | ||
426 | } | ||
427 | |||
417 | void kasan_poison_slab(struct page *page) | 428 | void kasan_poison_slab(struct page *page) |
418 | { | 429 | { |
419 | kasan_poison_shadow(page_address(page), | 430 | kasan_poison_shadow(page_address(page), |
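The new kasan_metadata_size() reports how many bytes of a slab object's layout are taken by KASAN's alloc/free metadata, returning 0 for either part when the corresponding offset was cleared because the metadata did not fit. A sketch of the same computation with assumed structure sizes:

    #include <stdio.h>
    #include <stddef.h>

    /* Assumed sizes, for illustration; the real structs live in mm/kasan. */
    #define ALLOC_META_SIZE 32
    #define FREE_META_SIZE  32

    struct cache_info {
        size_t alloc_meta_offset;   /* 0 means "no alloc metadata" */
        size_t free_meta_offset;    /* 0 means "no free metadata" */
    };

    static size_t kasan_metadata_size(const struct cache_info *c)
    {
        return (c->alloc_meta_offset ? ALLOC_META_SIZE : 0) +
               (c->free_meta_offset ? FREE_META_SIZE : 0);
    }

    int main(void)
    {
        struct cache_info with = { .alloc_meta_offset = 64, .free_meta_offset = 96 };
        struct cache_info without = { 0, 0 };

        printf("%zu %zu\n", kasan_metadata_size(&with),
               kasan_metadata_size(&without));    /* 64 0 */
        return 0;
    }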
@@ -431,16 +442,13 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) | |||
431 | kasan_poison_shadow(object, | 442 | kasan_poison_shadow(object, |
432 | round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), | 443 | round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), |
433 | KASAN_KMALLOC_REDZONE); | 444 | KASAN_KMALLOC_REDZONE); |
434 | #ifdef CONFIG_SLAB | ||
435 | if (cache->flags & SLAB_KASAN) { | 445 | if (cache->flags & SLAB_KASAN) { |
436 | struct kasan_alloc_meta *alloc_info = | 446 | struct kasan_alloc_meta *alloc_info = |
437 | get_alloc_info(cache, object); | 447 | get_alloc_info(cache, object); |
438 | alloc_info->state = KASAN_STATE_INIT; | 448 | alloc_info->state = KASAN_STATE_INIT; |
439 | } | 449 | } |
440 | #endif | ||
441 | } | 450 | } |
442 | 451 | ||
443 | #ifdef CONFIG_SLAB | ||
444 | static inline int in_irqentry_text(unsigned long ptr) | 452 | static inline int in_irqentry_text(unsigned long ptr) |
445 | { | 453 | { |
446 | return (ptr >= (unsigned long)&__irqentry_text_start && | 454 | return (ptr >= (unsigned long)&__irqentry_text_start && |
@@ -501,7 +509,6 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache, | |||
501 | BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); | 509 | BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); |
502 | return (void *)object + cache->kasan_info.free_meta_offset; | 510 | return (void *)object + cache->kasan_info.free_meta_offset; |
503 | } | 511 | } |
504 | #endif | ||
505 | 512 | ||
506 | void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) | 513 | void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) |
507 | { | 514 | { |
@@ -522,16 +529,16 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) | |||
522 | 529 | ||
523 | bool kasan_slab_free(struct kmem_cache *cache, void *object) | 530 | bool kasan_slab_free(struct kmem_cache *cache, void *object) |
524 | { | 531 | { |
525 | #ifdef CONFIG_SLAB | ||
526 | /* RCU slabs could be legally used after free within the RCU period */ | 532 | /* RCU slabs could be legally used after free within the RCU period */ |
527 | if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) | 533 | if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) |
528 | return false; | 534 | return false; |
529 | 535 | ||
530 | if (likely(cache->flags & SLAB_KASAN)) { | 536 | if (likely(cache->flags & SLAB_KASAN)) { |
531 | struct kasan_alloc_meta *alloc_info = | 537 | struct kasan_alloc_meta *alloc_info; |
532 | get_alloc_info(cache, object); | 538 | struct kasan_free_meta *free_info; |
533 | struct kasan_free_meta *free_info = | 539 | |
534 | get_free_info(cache, object); | 540 | alloc_info = get_alloc_info(cache, object); |
541 | free_info = get_free_info(cache, object); | ||
535 | 542 | ||
536 | switch (alloc_info->state) { | 543 | switch (alloc_info->state) { |
537 | case KASAN_STATE_ALLOC: | 544 | case KASAN_STATE_ALLOC: |
@@ -550,10 +557,6 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) | |||
550 | } | 557 | } |
551 | } | 558 | } |
552 | return false; | 559 | return false; |
553 | #else | ||
554 | kasan_poison_slab_free(cache, object); | ||
555 | return false; | ||
556 | #endif | ||
557 | } | 560 | } |
558 | 561 | ||
559 | void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, | 562 | void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, |
@@ -576,7 +579,6 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, | |||
576 | kasan_unpoison_shadow(object, size); | 579 | kasan_unpoison_shadow(object, size); |
577 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, | 580 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, |
578 | KASAN_KMALLOC_REDZONE); | 581 | KASAN_KMALLOC_REDZONE); |
579 | #ifdef CONFIG_SLAB | ||
580 | if (cache->flags & SLAB_KASAN) { | 582 | if (cache->flags & SLAB_KASAN) { |
581 | struct kasan_alloc_meta *alloc_info = | 583 | struct kasan_alloc_meta *alloc_info = |
582 | get_alloc_info(cache, object); | 584 | get_alloc_info(cache, object); |
@@ -585,7 +587,6 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, | |||
585 | alloc_info->alloc_size = size; | 587 | alloc_info->alloc_size = size; |
586 | set_track(&alloc_info->track, flags); | 588 | set_track(&alloc_info->track, flags); |
587 | } | 589 | } |
588 | #endif | ||
589 | } | 590 | } |
590 | EXPORT_SYMBOL(kasan_kmalloc); | 591 | EXPORT_SYMBOL(kasan_kmalloc); |
591 | 592 | ||
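The kasan.c hunks above make the alloc/free metadata layout common to SLAB and SLUB and add kasan_metadata_size() so the allocator can report how much of each object is KASAN bookkeeping. Below is a minimal userspace model of that bookkeeping, not the kernel code: the struct, the metadata sizes (16 and 32 bytes) and the helper names are invented for illustration.

#include <stdio.h>
#include <stddef.h>

struct kasan_cache_info {
    size_t alloc_meta_offset;
    size_t free_meta_offset;
};

/* Returns the padded object size, or the original size with both
 * offsets cleared when the metadata would not fit (mirroring the
 * early return in kasan_cache_create above). */
static size_t cache_create_model(size_t orig_size, size_t alloc_meta,
                                 size_t free_meta,
                                 struct kasan_cache_info *info)
{
    size_t size = orig_size;

    info->alloc_meta_offset = size;
    size += alloc_meta;
    info->free_meta_offset = size;
    size += free_meta;

    if (size <= info->alloc_meta_offset ||
        size <= info->free_meta_offset) {   /* wrapped: metadata doesn't fit */
        info->alloc_meta_offset = 0;
        info->free_meta_offset = 0;
        return orig_size;
    }
    return size;
}

/* Counterpart of kasan_metadata_size(): only metadata that was actually
 * placed (non-zero offset) is reported. */
static size_t metadata_size(const struct kasan_cache_info *info,
                            size_t alloc_meta, size_t free_meta)
{
    return (info->alloc_meta_offset ? alloc_meta : 0) +
           (info->free_meta_offset ? free_meta : 0);
}

int main(void)
{
    struct kasan_cache_info info;
    size_t padded = cache_create_model(64, 16, 32, &info);

    printf("padded object size: %zu, metadata: %zu\n",
           padded, metadata_size(&info, 16, 32));
    return 0;
}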
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index fb87923552ef..31972cdba433 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h | |||
@@ -95,7 +95,6 @@ struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, | |||
95 | struct kasan_free_meta *get_free_info(struct kmem_cache *cache, | 95 | struct kasan_free_meta *get_free_info(struct kmem_cache *cache, |
96 | const void *object); | 96 | const void *object); |
97 | 97 | ||
98 | |||
99 | static inline const void *kasan_shadow_to_mem(const void *shadow_addr) | 98 | static inline const void *kasan_shadow_to_mem(const void *shadow_addr) |
100 | { | 99 | { |
101 | return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) | 100 | return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) |
@@ -110,7 +109,7 @@ static inline bool kasan_report_enabled(void) | |||
110 | void kasan_report(unsigned long addr, size_t size, | 109 | void kasan_report(unsigned long addr, size_t size, |
111 | bool is_write, unsigned long ip); | 110 | bool is_write, unsigned long ip); |
112 | 111 | ||
113 | #ifdef CONFIG_SLAB | 112 | #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) |
114 | void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); | 113 | void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); |
115 | void quarantine_reduce(void); | 114 | void quarantine_reduce(void); |
116 | void quarantine_remove_cache(struct kmem_cache *cache); | 115 | void quarantine_remove_cache(struct kmem_cache *cache); |
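The kasan.h hunk above widens the quarantine declarations from CONFIG_SLAB to either slab allocator. The sketch below shows the generic compile-time gating pattern this relies on; the config macro and the function are stand-ins rather than the kernel's, and the no-op fallback branch is included only to illustrate how callers stay free of #ifdefs.

#include <stdio.h>

#define CONFIG_SLUB 1                       /* pretend SLUB is enabled */

#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
/* real implementation is compiled in */
static void quarantine_reduce_model(void)
{
    printf("draining part of the quarantine\n");
}
#else
/* neither allocator supports quarantine: compile to a no-op */
static inline void quarantine_reduce_model(void) { }
#endif

int main(void)
{
    quarantine_reduce_model();              /* callers need no #ifdef */
    return 0;
}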
diff --git a/mm/kasan/report.c b/mm/kasan/report.c index b3c122ddd454..861b9776841a 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c | |||
@@ -116,7 +116,6 @@ static inline bool init_task_stack_addr(const void *addr) | |||
116 | sizeof(init_thread_union.stack)); | 116 | sizeof(init_thread_union.stack)); |
117 | } | 117 | } |
118 | 118 | ||
119 | #ifdef CONFIG_SLAB | ||
120 | static void print_track(struct kasan_track *track) | 119 | static void print_track(struct kasan_track *track) |
121 | { | 120 | { |
122 | pr_err("PID = %u\n", track->pid); | 121 | pr_err("PID = %u\n", track->pid); |
@@ -130,8 +129,8 @@ static void print_track(struct kasan_track *track) | |||
130 | } | 129 | } |
131 | } | 130 | } |
132 | 131 | ||
133 | static void object_err(struct kmem_cache *cache, struct page *page, | 132 | static void kasan_object_err(struct kmem_cache *cache, struct page *page, |
134 | void *object, char *unused_reason) | 133 | void *object, char *unused_reason) |
135 | { | 134 | { |
136 | struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); | 135 | struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); |
137 | struct kasan_free_meta *free_info; | 136 | struct kasan_free_meta *free_info; |
@@ -162,7 +161,6 @@ static void object_err(struct kmem_cache *cache, struct page *page, | |||
162 | break; | 161 | break; |
163 | } | 162 | } |
164 | } | 163 | } |
165 | #endif | ||
166 | 164 | ||
167 | static void print_address_description(struct kasan_access_info *info) | 165 | static void print_address_description(struct kasan_access_info *info) |
168 | { | 166 | { |
@@ -177,7 +175,7 @@ static void print_address_description(struct kasan_access_info *info) | |||
177 | struct kmem_cache *cache = page->slab_cache; | 175 | struct kmem_cache *cache = page->slab_cache; |
178 | object = nearest_obj(cache, page, | 176 | object = nearest_obj(cache, page, |
179 | (void *)info->access_addr); | 177 | (void *)info->access_addr); |
180 | object_err(cache, page, object, | 178 | kasan_object_err(cache, page, object, |
181 | "kasan: bad access detected"); | 179 | "kasan: bad access detected"); |
182 | return; | 180 | return; |
183 | } | 181 | } |
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7dbee698d6aa..79c52d0061af 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c | |||
@@ -480,7 +480,7 @@ void __khugepaged_exit(struct mm_struct *mm) | |||
480 | static void release_pte_page(struct page *page) | 480 | static void release_pte_page(struct page *page) |
481 | { | 481 | { |
482 | /* 0 stands for page_is_file_cache(page) == false */ | 482 | /* 0 stands for page_is_file_cache(page) == false */ |
483 | dec_zone_page_state(page, NR_ISOLATED_ANON + 0); | 483 | dec_node_page_state(page, NR_ISOLATED_ANON + 0); |
484 | unlock_page(page); | 484 | unlock_page(page); |
485 | putback_lru_page(page); | 485 | putback_lru_page(page); |
486 | } | 486 | } |
@@ -576,7 +576,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
576 | goto out; | 576 | goto out; |
577 | } | 577 | } |
578 | /* 0 stands for page_is_file_cache(page) == false */ | 578 | /* 0 stands for page_is_file_cache(page) == false */ |
579 | inc_zone_page_state(page, NR_ISOLATED_ANON + 0); | 579 | inc_node_page_state(page, NR_ISOLATED_ANON + 0); |
580 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 580 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
581 | VM_BUG_ON_PAGE(PageLRU(page), page); | 581 | VM_BUG_ON_PAGE(PageLRU(page), page); |
582 | 582 | ||
@@ -672,10 +672,10 @@ static bool khugepaged_scan_abort(int nid) | |||
672 | int i; | 672 | int i; |
673 | 673 | ||
674 | /* | 674 | /* |
675 | * If zone_reclaim_mode is disabled, then no extra effort is made to | 675 | * If node_reclaim_mode is disabled, then no extra effort is made to |
676 | * allocate memory locally. | 676 | * allocate memory locally. |
677 | */ | 677 | */ |
678 | if (!zone_reclaim_mode) | 678 | if (!node_reclaim_mode) |
679 | return false; | 679 | return false; |
680 | 680 | ||
681 | /* If there is a count for this node already, it must be acceptable */ | 681 | /* If there is a count for this node already, it must be acceptable */ |
@@ -694,7 +694,7 @@ static bool khugepaged_scan_abort(int nid) | |||
694 | /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ | 694 | /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ |
695 | static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) | 695 | static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) |
696 | { | 696 | { |
697 | return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0); | 697 | return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT; |
698 | } | 698 | } |
699 | 699 | ||
700 | #ifdef CONFIG_NUMA | 700 | #ifdef CONFIG_NUMA |
@@ -1483,10 +1483,10 @@ tree_unlocked: | |||
1483 | } | 1483 | } |
1484 | 1484 | ||
1485 | local_irq_save(flags); | 1485 | local_irq_save(flags); |
1486 | __inc_zone_page_state(new_page, NR_SHMEM_THPS); | 1486 | __inc_node_page_state(new_page, NR_SHMEM_THPS); |
1487 | if (nr_none) { | 1487 | if (nr_none) { |
1488 | __mod_zone_page_state(zone, NR_FILE_PAGES, nr_none); | 1488 | __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); |
1489 | __mod_zone_page_state(zone, NR_SHMEM, nr_none); | 1489 | __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); |
1490 | } | 1490 | } |
1491 | local_irq_restore(flags); | 1491 | local_irq_restore(flags); |
1492 | 1492 | ||
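The khugepaged changes above move the isolated/SHMEM accounting from zones to nodes and choose the allocation mask by defrag mode: with defrag enabled the collapse path may enter direct reclaim and compaction, otherwise it uses the light variant. A standalone sketch of that mask selection follows, using illustrative flag bits rather than the real gfp values.

#include <stdio.h>
#include <stdbool.h>

#define GFP_BASE            0x01u
#define GFP_DIRECT_RECLAIM  0x02u
#define GFP_KSWAPD_RECLAIM  0x04u

#define GFP_TRANSHUGE_LIGHT (GFP_BASE | GFP_KSWAPD_RECLAIM)
#define GFP_TRANSHUGE       (GFP_TRANSHUGE_LIGHT | GFP_DIRECT_RECLAIM)

/* Mirrors alloc_hugepage_khugepaged_gfpmask() above: defrag decides
 * whether direct reclaim/compaction is allowed. */
static unsigned int khugepaged_gfpmask(bool defrag)
{
    return defrag ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
}

int main(void)
{
    printf("defrag on:  %#x\n", khugepaged_gfpmask(true));
    printf("defrag off: %#x\n", khugepaged_gfpmask(false));
    return 0;
}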
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 04320d3adbef..086292f7c59d 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -1485,8 +1485,10 @@ static int kmemleak_scan_thread(void *arg) | |||
1485 | * Wait before the first scan to allow the system to fully initialize. | 1485 | * Wait before the first scan to allow the system to fully initialize. |
1486 | */ | 1486 | */ |
1487 | if (first_run) { | 1487 | if (first_run) { |
1488 | signed long timeout = msecs_to_jiffies(SECS_FIRST_SCAN * 1000); | ||
1488 | first_run = 0; | 1489 | first_run = 0; |
1489 | ssleep(SECS_FIRST_SCAN); | 1490 | while (timeout && !kthread_should_stop()) |
1491 | timeout = schedule_timeout_interruptible(timeout); | ||
1490 | } | 1492 | } |
1491 | 1493 | ||
1492 | while (!kthread_should_stop()) { | 1494 | while (!kthread_should_stop()) { |
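The kmemleak hunk above replaces the single long ssleep() before the first scan with an interruptible wait loop, so the scan thread can be stopped promptly during that initial delay. Below is a userspace analogue of the pattern, using pthreads and an atomic stop flag in place of kthreads and kthread_should_stop(); all names and the delay are illustrative.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool should_stop;

static void *scan_thread(void *arg)
{
    int timeout = 60;                        /* seconds before first scan */

    (void)arg;
    /* wait in short slices instead of one uninterruptible sleep */
    while (timeout && !atomic_load(&should_stop)) {
        sleep(1);
        timeout--;
    }
    if (atomic_load(&should_stop)) {
        puts("stopped during initial delay");
        return NULL;
    }
    puts("first scan would start here");
    return NULL;
}

int main(void)
{
    pthread_t tid;

    pthread_create(&tid, NULL, scan_thread, NULL);
    sleep(2);                                /* stop long before the delay ends */
    atomic_store(&should_stop, true);
    pthread_join(tid, NULL);
    return 0;
}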
diff --git a/mm/memblock.c b/mm/memblock.c index ca099159b45a..ff5ff3b5f1ea 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -20,7 +20,7 @@ | |||
20 | #include <linux/seq_file.h> | 20 | #include <linux/seq_file.h> |
21 | #include <linux/memblock.h> | 21 | #include <linux/memblock.h> |
22 | 22 | ||
23 | #include <asm-generic/sections.h> | 23 | #include <asm/sections.h> |
24 | #include <linux/io.h> | 24 | #include <linux/io.h> |
25 | 25 | ||
26 | #include "internal.h" | 26 | #include "internal.h" |
@@ -1027,7 +1027,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, | |||
1027 | *out_end = m_end; | 1027 | *out_end = m_end; |
1028 | if (out_nid) | 1028 | if (out_nid) |
1029 | *out_nid = m_nid; | 1029 | *out_nid = m_nid; |
1030 | idx_a++; | 1030 | idx_a--; |
1031 | *idx = (u32)idx_a | (u64)idx_b << 32; | 1031 | *idx = (u32)idx_a | (u64)idx_b << 32; |
1032 | return; | 1032 | return; |
1033 | } | 1033 | } |
@@ -1465,15 +1465,16 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) | |||
1465 | return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); | 1465 | return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); |
1466 | } | 1466 | } |
1467 | 1467 | ||
1468 | void __init memblock_enforce_memory_limit(phys_addr_t limit) | 1468 | static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) |
1469 | { | 1469 | { |
1470 | phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; | 1470 | phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; |
1471 | struct memblock_region *r; | 1471 | struct memblock_region *r; |
1472 | 1472 | ||
1473 | if (!limit) | 1473 | /* |
1474 | return; | 1474 | * translate the memory @limit size into the max address within one of |
1475 | 1475 | * the memory memblock regions, if the @limit exceeds the total size | |
1476 | /* find out max address */ | 1476 | * of those regions, max_addr will keep original value ULLONG_MAX |
1477 | */ | ||
1477 | for_each_memblock(memory, r) { | 1478 | for_each_memblock(memory, r) { |
1478 | if (limit <= r->size) { | 1479 | if (limit <= r->size) { |
1479 | max_addr = r->base + limit; | 1480 | max_addr = r->base + limit; |
@@ -1482,6 +1483,22 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) | |||
1482 | limit -= r->size; | 1483 | limit -= r->size; |
1483 | } | 1484 | } |
1484 | 1485 | ||
1486 | return max_addr; | ||
1487 | } | ||
1488 | |||
1489 | void __init memblock_enforce_memory_limit(phys_addr_t limit) | ||
1490 | { | ||
1491 | phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; | ||
1492 | |||
1493 | if (!limit) | ||
1494 | return; | ||
1495 | |||
1496 | max_addr = __find_max_addr(limit); | ||
1497 | |||
1498 | /* @limit exceeds the total size of the memory, do nothing */ | ||
1499 | if (max_addr == (phys_addr_t)ULLONG_MAX) | ||
1500 | return; | ||
1501 | |||
1485 | /* truncate both memory and reserved regions */ | 1502 | /* truncate both memory and reserved regions */ |
1486 | memblock_remove_range(&memblock.memory, max_addr, | 1503 | memblock_remove_range(&memblock.memory, max_addr, |
1487 | (phys_addr_t)ULLONG_MAX); | 1504 | (phys_addr_t)ULLONG_MAX); |
@@ -1489,6 +1506,36 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) | |||
1489 | (phys_addr_t)ULLONG_MAX); | 1506 | (phys_addr_t)ULLONG_MAX); |
1490 | } | 1507 | } |
1491 | 1508 | ||
1509 | void __init memblock_mem_limit_remove_map(phys_addr_t limit) | ||
1510 | { | ||
1511 | struct memblock_type *type = &memblock.memory; | ||
1512 | phys_addr_t max_addr; | ||
1513 | int i, ret, start_rgn, end_rgn; | ||
1514 | |||
1515 | if (!limit) | ||
1516 | return; | ||
1517 | |||
1518 | max_addr = __find_max_addr(limit); | ||
1519 | |||
1520 | /* @limit exceeds the total size of the memory, do nothing */ | ||
1521 | if (max_addr == (phys_addr_t)ULLONG_MAX) | ||
1522 | return; | ||
1523 | |||
1524 | ret = memblock_isolate_range(type, max_addr, (phys_addr_t)ULLONG_MAX, | ||
1525 | &start_rgn, &end_rgn); | ||
1526 | if (ret) | ||
1527 | return; | ||
1528 | |||
1529 | /* remove all the MAP regions above the limit */ | ||
1530 | for (i = end_rgn - 1; i >= start_rgn; i--) { | ||
1531 | if (!memblock_is_nomap(&type->regions[i])) | ||
1532 | memblock_remove_region(type, i); | ||
1533 | } | ||
1534 | /* truncate the reserved regions */ | ||
1535 | memblock_remove_range(&memblock.reserved, max_addr, | ||
1536 | (phys_addr_t)ULLONG_MAX); | ||
1537 | } | ||
1538 | |||
1492 | static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) | 1539 | static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) |
1493 | { | 1540 | { |
1494 | unsigned int left = 0, right = type->cnt; | 1541 | unsigned int left = 0, right = type->cnt; |
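The memblock changes above factor the limit-to-address translation into __find_max_addr() and reuse it in the new memblock_mem_limit_remove_map(), which removes only the mapped regions above the limit. The standalone model below walks a made-up region table the same way the helper does; the ULLONG_MAX-style sentinel means the limit exceeds all memory.

#include <stdio.h>
#include <stdint.h>

struct region { uint64_t base, size; };

/* Consume @limit region by region until it fits inside one of them;
 * if it never fits, the sentinel survives and the caller does nothing. */
static uint64_t find_max_addr(const struct region *r, int nr, uint64_t limit)
{
    uint64_t max_addr = UINT64_MAX;
    int i;

    for (i = 0; i < nr; i++) {
        if (limit <= r[i].size) {
            max_addr = r[i].base + limit;
            break;
        }
        limit -= r[i].size;
    }
    return max_addr;
}

int main(void)
{
    const struct region mem[] = {
        { 0x00000000, 0x20000000 },   /* 512 MiB */
        { 0x40000000, 0x40000000 },   /* 1 GiB   */
    };

    printf("limit 768M -> max_addr %#llx\n",
           (unsigned long long)find_max_addr(mem, 2, 0x30000000));
    printf("limit 4G   -> max_addr %#llx\n",
           (unsigned long long)find_max_addr(mem, 2, 0x100000000ULL));
    return 0;
}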
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3a84c64f35c..c265212bec8c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -132,15 +132,11 @@ static const char * const mem_cgroup_lru_names[] = { | |||
132 | * their hierarchy representation | 132 | * their hierarchy representation |
133 | */ | 133 | */ |
134 | 134 | ||
135 | struct mem_cgroup_tree_per_zone { | 135 | struct mem_cgroup_tree_per_node { |
136 | struct rb_root rb_root; | 136 | struct rb_root rb_root; |
137 | spinlock_t lock; | 137 | spinlock_t lock; |
138 | }; | 138 | }; |
139 | 139 | ||
140 | struct mem_cgroup_tree_per_node { | ||
141 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; | ||
142 | }; | ||
143 | |||
144 | struct mem_cgroup_tree { | 140 | struct mem_cgroup_tree { |
145 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; | 141 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; |
146 | }; | 142 | }; |
@@ -323,15 +319,6 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); | |||
323 | 319 | ||
324 | #endif /* !CONFIG_SLOB */ | 320 | #endif /* !CONFIG_SLOB */ |
325 | 321 | ||
326 | static struct mem_cgroup_per_zone * | ||
327 | mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) | ||
328 | { | ||
329 | int nid = zone_to_nid(zone); | ||
330 | int zid = zone_idx(zone); | ||
331 | |||
332 | return &memcg->nodeinfo[nid]->zoneinfo[zid]; | ||
333 | } | ||
334 | |||
335 | /** | 322 | /** |
336 | * mem_cgroup_css_from_page - css of the memcg associated with a page | 323 | * mem_cgroup_css_from_page - css of the memcg associated with a page |
337 | * @page: page of interest | 324 | * @page: page of interest |
@@ -383,37 +370,35 @@ ino_t page_cgroup_ino(struct page *page) | |||
383 | return ino; | 370 | return ino; |
384 | } | 371 | } |
385 | 372 | ||
386 | static struct mem_cgroup_per_zone * | 373 | static struct mem_cgroup_per_node * |
387 | mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) | 374 | mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page) |
388 | { | 375 | { |
389 | int nid = page_to_nid(page); | 376 | int nid = page_to_nid(page); |
390 | int zid = page_zonenum(page); | ||
391 | 377 | ||
392 | return &memcg->nodeinfo[nid]->zoneinfo[zid]; | 378 | return memcg->nodeinfo[nid]; |
393 | } | 379 | } |
394 | 380 | ||
395 | static struct mem_cgroup_tree_per_zone * | 381 | static struct mem_cgroup_tree_per_node * |
396 | soft_limit_tree_node_zone(int nid, int zid) | 382 | soft_limit_tree_node(int nid) |
397 | { | 383 | { |
398 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | 384 | return soft_limit_tree.rb_tree_per_node[nid]; |
399 | } | 385 | } |
400 | 386 | ||
401 | static struct mem_cgroup_tree_per_zone * | 387 | static struct mem_cgroup_tree_per_node * |
402 | soft_limit_tree_from_page(struct page *page) | 388 | soft_limit_tree_from_page(struct page *page) |
403 | { | 389 | { |
404 | int nid = page_to_nid(page); | 390 | int nid = page_to_nid(page); |
405 | int zid = page_zonenum(page); | ||
406 | 391 | ||
407 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | 392 | return soft_limit_tree.rb_tree_per_node[nid]; |
408 | } | 393 | } |
409 | 394 | ||
410 | static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, | 395 | static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, |
411 | struct mem_cgroup_tree_per_zone *mctz, | 396 | struct mem_cgroup_tree_per_node *mctz, |
412 | unsigned long new_usage_in_excess) | 397 | unsigned long new_usage_in_excess) |
413 | { | 398 | { |
414 | struct rb_node **p = &mctz->rb_root.rb_node; | 399 | struct rb_node **p = &mctz->rb_root.rb_node; |
415 | struct rb_node *parent = NULL; | 400 | struct rb_node *parent = NULL; |
416 | struct mem_cgroup_per_zone *mz_node; | 401 | struct mem_cgroup_per_node *mz_node; |
417 | 402 | ||
418 | if (mz->on_tree) | 403 | if (mz->on_tree) |
419 | return; | 404 | return; |
@@ -423,7 +408,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, | |||
423 | return; | 408 | return; |
424 | while (*p) { | 409 | while (*p) { |
425 | parent = *p; | 410 | parent = *p; |
426 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, | 411 | mz_node = rb_entry(parent, struct mem_cgroup_per_node, |
427 | tree_node); | 412 | tree_node); |
428 | if (mz->usage_in_excess < mz_node->usage_in_excess) | 413 | if (mz->usage_in_excess < mz_node->usage_in_excess) |
429 | p = &(*p)->rb_left; | 414 | p = &(*p)->rb_left; |
@@ -439,8 +424,8 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, | |||
439 | mz->on_tree = true; | 424 | mz->on_tree = true; |
440 | } | 425 | } |
441 | 426 | ||
442 | static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, | 427 | static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, |
443 | struct mem_cgroup_tree_per_zone *mctz) | 428 | struct mem_cgroup_tree_per_node *mctz) |
444 | { | 429 | { |
445 | if (!mz->on_tree) | 430 | if (!mz->on_tree) |
446 | return; | 431 | return; |
@@ -448,8 +433,8 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, | |||
448 | mz->on_tree = false; | 433 | mz->on_tree = false; |
449 | } | 434 | } |
450 | 435 | ||
451 | static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, | 436 | static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, |
452 | struct mem_cgroup_tree_per_zone *mctz) | 437 | struct mem_cgroup_tree_per_node *mctz) |
453 | { | 438 | { |
454 | unsigned long flags; | 439 | unsigned long flags; |
455 | 440 | ||
@@ -473,8 +458,8 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg) | |||
473 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) | 458 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) |
474 | { | 459 | { |
475 | unsigned long excess; | 460 | unsigned long excess; |
476 | struct mem_cgroup_per_zone *mz; | 461 | struct mem_cgroup_per_node *mz; |
477 | struct mem_cgroup_tree_per_zone *mctz; | 462 | struct mem_cgroup_tree_per_node *mctz; |
478 | 463 | ||
479 | mctz = soft_limit_tree_from_page(page); | 464 | mctz = soft_limit_tree_from_page(page); |
480 | /* | 465 | /* |
@@ -482,7 +467,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) | |||
482 | * because their event counter is not touched. | 467 | * because their event counter is not touched. |
483 | */ | 468 | */ |
484 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { | 469 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { |
485 | mz = mem_cgroup_page_zoneinfo(memcg, page); | 470 | mz = mem_cgroup_page_nodeinfo(memcg, page); |
486 | excess = soft_limit_excess(memcg); | 471 | excess = soft_limit_excess(memcg); |
487 | /* | 472 | /* |
488 | * We have to update the tree if mz is on RB-tree or | 473 | * We have to update the tree if mz is on RB-tree or |
@@ -507,24 +492,22 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) | |||
507 | 492 | ||
508 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) | 493 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) |
509 | { | 494 | { |
510 | struct mem_cgroup_tree_per_zone *mctz; | 495 | struct mem_cgroup_tree_per_node *mctz; |
511 | struct mem_cgroup_per_zone *mz; | 496 | struct mem_cgroup_per_node *mz; |
512 | int nid, zid; | 497 | int nid; |
513 | 498 | ||
514 | for_each_node(nid) { | 499 | for_each_node(nid) { |
515 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 500 | mz = mem_cgroup_nodeinfo(memcg, nid); |
516 | mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; | 501 | mctz = soft_limit_tree_node(nid); |
517 | mctz = soft_limit_tree_node_zone(nid, zid); | 502 | mem_cgroup_remove_exceeded(mz, mctz); |
518 | mem_cgroup_remove_exceeded(mz, mctz); | ||
519 | } | ||
520 | } | 503 | } |
521 | } | 504 | } |
522 | 505 | ||
523 | static struct mem_cgroup_per_zone * | 506 | static struct mem_cgroup_per_node * |
524 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | 507 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) |
525 | { | 508 | { |
526 | struct rb_node *rightmost = NULL; | 509 | struct rb_node *rightmost = NULL; |
527 | struct mem_cgroup_per_zone *mz; | 510 | struct mem_cgroup_per_node *mz; |
528 | 511 | ||
529 | retry: | 512 | retry: |
530 | mz = NULL; | 513 | mz = NULL; |
@@ -532,7 +515,7 @@ retry: | |||
532 | if (!rightmost) | 515 | if (!rightmost) |
533 | goto done; /* Nothing to reclaim from */ | 516 | goto done; /* Nothing to reclaim from */ |
534 | 517 | ||
535 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); | 518 | mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node); |
536 | /* | 519 | /* |
537 | * Remove the node now but someone else can add it back, | 520 | * Remove the node now but someone else can add it back, |
538 | * we will to add it back at the end of reclaim to its correct | 521 | * we will to add it back at the end of reclaim to its correct |
@@ -546,10 +529,10 @@ done: | |||
546 | return mz; | 529 | return mz; |
547 | } | 530 | } |
548 | 531 | ||
549 | static struct mem_cgroup_per_zone * | 532 | static struct mem_cgroup_per_node * |
550 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | 533 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) |
551 | { | 534 | { |
552 | struct mem_cgroup_per_zone *mz; | 535 | struct mem_cgroup_per_node *mz; |
553 | 536 | ||
554 | spin_lock_irq(&mctz->lock); | 537 | spin_lock_irq(&mctz->lock); |
555 | mz = __mem_cgroup_largest_soft_limit_node(mctz); | 538 | mz = __mem_cgroup_largest_soft_limit_node(mctz); |
@@ -643,20 +626,16 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, | |||
643 | int nid, unsigned int lru_mask) | 626 | int nid, unsigned int lru_mask) |
644 | { | 627 | { |
645 | unsigned long nr = 0; | 628 | unsigned long nr = 0; |
646 | int zid; | 629 | struct mem_cgroup_per_node *mz; |
630 | enum lru_list lru; | ||
647 | 631 | ||
648 | VM_BUG_ON((unsigned)nid >= nr_node_ids); | 632 | VM_BUG_ON((unsigned)nid >= nr_node_ids); |
649 | 633 | ||
650 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 634 | for_each_lru(lru) { |
651 | struct mem_cgroup_per_zone *mz; | 635 | if (!(BIT(lru) & lru_mask)) |
652 | enum lru_list lru; | 636 | continue; |
653 | 637 | mz = mem_cgroup_nodeinfo(memcg, nid); | |
654 | for_each_lru(lru) { | 638 | nr += mz->lru_size[lru]; |
655 | if (!(BIT(lru) & lru_mask)) | ||
656 | continue; | ||
657 | mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; | ||
658 | nr += mz->lru_size[lru]; | ||
659 | } | ||
660 | } | 639 | } |
661 | return nr; | 640 | return nr; |
662 | } | 641 | } |
@@ -809,9 +788,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
809 | rcu_read_lock(); | 788 | rcu_read_lock(); |
810 | 789 | ||
811 | if (reclaim) { | 790 | if (reclaim) { |
812 | struct mem_cgroup_per_zone *mz; | 791 | struct mem_cgroup_per_node *mz; |
813 | 792 | ||
814 | mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); | 793 | mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id); |
815 | iter = &mz->iter[reclaim->priority]; | 794 | iter = &mz->iter[reclaim->priority]; |
816 | 795 | ||
817 | if (prev && reclaim->generation != iter->generation) | 796 | if (prev && reclaim->generation != iter->generation) |
@@ -910,19 +889,17 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) | |||
910 | { | 889 | { |
911 | struct mem_cgroup *memcg = dead_memcg; | 890 | struct mem_cgroup *memcg = dead_memcg; |
912 | struct mem_cgroup_reclaim_iter *iter; | 891 | struct mem_cgroup_reclaim_iter *iter; |
913 | struct mem_cgroup_per_zone *mz; | 892 | struct mem_cgroup_per_node *mz; |
914 | int nid, zid; | 893 | int nid; |
915 | int i; | 894 | int i; |
916 | 895 | ||
917 | while ((memcg = parent_mem_cgroup(memcg))) { | 896 | while ((memcg = parent_mem_cgroup(memcg))) { |
918 | for_each_node(nid) { | 897 | for_each_node(nid) { |
919 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 898 | mz = mem_cgroup_nodeinfo(memcg, nid); |
920 | mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; | 899 | for (i = 0; i <= DEF_PRIORITY; i++) { |
921 | for (i = 0; i <= DEF_PRIORITY; i++) { | 900 | iter = &mz->iter[i]; |
922 | iter = &mz->iter[i]; | 901 | cmpxchg(&iter->position, |
923 | cmpxchg(&iter->position, | 902 | dead_memcg, NULL); |
924 | dead_memcg, NULL); | ||
925 | } | ||
926 | } | 903 | } |
927 | } | 904 | } |
928 | } | 905 | } |
@@ -944,39 +921,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) | |||
944 | iter = mem_cgroup_iter(NULL, iter, NULL)) | 921 | iter = mem_cgroup_iter(NULL, iter, NULL)) |
945 | 922 | ||
946 | /** | 923 | /** |
947 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg | ||
948 | * @zone: zone of the wanted lruvec | ||
949 | * @memcg: memcg of the wanted lruvec | ||
950 | * | ||
951 | * Returns the lru list vector holding pages for the given @zone and | ||
952 | * @mem. This can be the global zone lruvec, if the memory controller | ||
953 | * is disabled. | ||
954 | */ | ||
955 | struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, | ||
956 | struct mem_cgroup *memcg) | ||
957 | { | ||
958 | struct mem_cgroup_per_zone *mz; | ||
959 | struct lruvec *lruvec; | ||
960 | |||
961 | if (mem_cgroup_disabled()) { | ||
962 | lruvec = &zone->lruvec; | ||
963 | goto out; | ||
964 | } | ||
965 | |||
966 | mz = mem_cgroup_zone_zoneinfo(memcg, zone); | ||
967 | lruvec = &mz->lruvec; | ||
968 | out: | ||
969 | /* | ||
970 | * Since a node can be onlined after the mem_cgroup was created, | ||
971 | * we have to be prepared to initialize lruvec->zone here; | ||
972 | * and if offlined then reonlined, we need to reinitialize it. | ||
973 | */ | ||
974 | if (unlikely(lruvec->zone != zone)) | ||
975 | lruvec->zone = zone; | ||
976 | return lruvec; | ||
977 | } | ||
978 | |||
979 | /** | ||
980 | * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page | 924 | * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page |
981 | * @page: the page | 925 | * @page: the page |
982 | * @zone: zone of the page | 926 | * @zone: zone of the page |
@@ -985,14 +929,14 @@ out: | |||
985 | * and putback protocol: the LRU lock must be held, and the page must | 929 | * and putback protocol: the LRU lock must be held, and the page must |
986 | * either be PageLRU() or the caller must have isolated/allocated it. | 930 | * either be PageLRU() or the caller must have isolated/allocated it. |
987 | */ | 931 | */ |
988 | struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) | 932 | struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) |
989 | { | 933 | { |
990 | struct mem_cgroup_per_zone *mz; | 934 | struct mem_cgroup_per_node *mz; |
991 | struct mem_cgroup *memcg; | 935 | struct mem_cgroup *memcg; |
992 | struct lruvec *lruvec; | 936 | struct lruvec *lruvec; |
993 | 937 | ||
994 | if (mem_cgroup_disabled()) { | 938 | if (mem_cgroup_disabled()) { |
995 | lruvec = &zone->lruvec; | 939 | lruvec = &pgdat->lruvec; |
996 | goto out; | 940 | goto out; |
997 | } | 941 | } |
998 | 942 | ||
@@ -1004,7 +948,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) | |||
1004 | if (!memcg) | 948 | if (!memcg) |
1005 | memcg = root_mem_cgroup; | 949 | memcg = root_mem_cgroup; |
1006 | 950 | ||
1007 | mz = mem_cgroup_page_zoneinfo(memcg, page); | 951 | mz = mem_cgroup_page_nodeinfo(memcg, page); |
1008 | lruvec = &mz->lruvec; | 952 | lruvec = &mz->lruvec; |
1009 | out: | 953 | out: |
1010 | /* | 954 | /* |
@@ -1012,8 +956,8 @@ out: | |||
1012 | * we have to be prepared to initialize lruvec->zone here; | 956 | * we have to be prepared to initialize lruvec->zone here; |
1013 | * and if offlined then reonlined, we need to reinitialize it. | 957 | * and if offlined then reonlined, we need to reinitialize it. |
1014 | */ | 958 | */ |
1015 | if (unlikely(lruvec->zone != zone)) | 959 | if (unlikely(lruvec->pgdat != pgdat)) |
1016 | lruvec->zone = zone; | 960 | lruvec->pgdat = pgdat; |
1017 | return lruvec; | 961 | return lruvec; |
1018 | } | 962 | } |
1019 | 963 | ||
@@ -1030,17 +974,15 @@ out: | |||
1030 | void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, | 974 | void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, |
1031 | int nr_pages) | 975 | int nr_pages) |
1032 | { | 976 | { |
1033 | struct mem_cgroup_per_zone *mz; | 977 | struct mem_cgroup_per_node *mz; |
1034 | unsigned long *lru_size; | 978 | unsigned long *lru_size; |
1035 | long size; | 979 | long size; |
1036 | bool empty; | 980 | bool empty; |
1037 | 981 | ||
1038 | __update_lru_size(lruvec, lru, nr_pages); | ||
1039 | |||
1040 | if (mem_cgroup_disabled()) | 982 | if (mem_cgroup_disabled()) |
1041 | return; | 983 | return; |
1042 | 984 | ||
1043 | mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); | 985 | mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); |
1044 | lru_size = mz->lru_size + lru; | 986 | lru_size = mz->lru_size + lru; |
1045 | empty = list_empty(lruvec->lists + lru); | 987 | empty = list_empty(lruvec->lists + lru); |
1046 | 988 | ||
@@ -1276,9 +1218,9 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1276 | * select it. The goal is to allow it to allocate so that it may | 1218 | * select it. The goal is to allow it to allocate so that it may |
1277 | * quickly exit and free its memory. | 1219 | * quickly exit and free its memory. |
1278 | */ | 1220 | */ |
1279 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { | 1221 | if (task_will_free_mem(current)) { |
1280 | mark_oom_victim(current); | 1222 | mark_oom_victim(current); |
1281 | try_oom_reaper(current); | 1223 | wake_oom_reaper(current); |
1282 | goto unlock; | 1224 | goto unlock; |
1283 | } | 1225 | } |
1284 | 1226 | ||
@@ -1433,7 +1375,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | |||
1433 | #endif | 1375 | #endif |
1434 | 1376 | ||
1435 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, | 1377 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, |
1436 | struct zone *zone, | 1378 | pg_data_t *pgdat, |
1437 | gfp_t gfp_mask, | 1379 | gfp_t gfp_mask, |
1438 | unsigned long *total_scanned) | 1380 | unsigned long *total_scanned) |
1439 | { | 1381 | { |
@@ -1443,7 +1385,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, | |||
1443 | unsigned long excess; | 1385 | unsigned long excess; |
1444 | unsigned long nr_scanned; | 1386 | unsigned long nr_scanned; |
1445 | struct mem_cgroup_reclaim_cookie reclaim = { | 1387 | struct mem_cgroup_reclaim_cookie reclaim = { |
1446 | .zone = zone, | 1388 | .pgdat = pgdat, |
1447 | .priority = 0, | 1389 | .priority = 0, |
1448 | }; | 1390 | }; |
1449 | 1391 | ||
@@ -1473,8 +1415,8 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, | |||
1473 | } | 1415 | } |
1474 | continue; | 1416 | continue; |
1475 | } | 1417 | } |
1476 | total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, | 1418 | total += mem_cgroup_shrink_node(victim, gfp_mask, false, |
1477 | zone, &nr_scanned); | 1419 | pgdat, &nr_scanned); |
1478 | *total_scanned += nr_scanned; | 1420 | *total_scanned += nr_scanned; |
1479 | if (!soft_limit_excess(root_memcg)) | 1421 | if (!soft_limit_excess(root_memcg)) |
1480 | break; | 1422 | break; |
@@ -2107,11 +2049,11 @@ static void lock_page_lru(struct page *page, int *isolated) | |||
2107 | { | 2049 | { |
2108 | struct zone *zone = page_zone(page); | 2050 | struct zone *zone = page_zone(page); |
2109 | 2051 | ||
2110 | spin_lock_irq(&zone->lru_lock); | 2052 | spin_lock_irq(zone_lru_lock(zone)); |
2111 | if (PageLRU(page)) { | 2053 | if (PageLRU(page)) { |
2112 | struct lruvec *lruvec; | 2054 | struct lruvec *lruvec; |
2113 | 2055 | ||
2114 | lruvec = mem_cgroup_page_lruvec(page, zone); | 2056 | lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); |
2115 | ClearPageLRU(page); | 2057 | ClearPageLRU(page); |
2116 | del_page_from_lru_list(page, lruvec, page_lru(page)); | 2058 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
2117 | *isolated = 1; | 2059 | *isolated = 1; |
@@ -2126,12 +2068,12 @@ static void unlock_page_lru(struct page *page, int isolated) | |||
2126 | if (isolated) { | 2068 | if (isolated) { |
2127 | struct lruvec *lruvec; | 2069 | struct lruvec *lruvec; |
2128 | 2070 | ||
2129 | lruvec = mem_cgroup_page_lruvec(page, zone); | 2071 | lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); |
2130 | VM_BUG_ON_PAGE(PageLRU(page), page); | 2072 | VM_BUG_ON_PAGE(PageLRU(page), page); |
2131 | SetPageLRU(page); | 2073 | SetPageLRU(page); |
2132 | add_page_to_lru_list(page, lruvec, page_lru(page)); | 2074 | add_page_to_lru_list(page, lruvec, page_lru(page)); |
2133 | } | 2075 | } |
2134 | spin_unlock_irq(&zone->lru_lock); | 2076 | spin_unlock_irq(zone_lru_lock(zone)); |
2135 | } | 2077 | } |
2136 | 2078 | ||
2137 | static void commit_charge(struct page *page, struct mem_cgroup *memcg, | 2079 | static void commit_charge(struct page *page, struct mem_cgroup *memcg, |
@@ -2431,7 +2373,7 @@ void memcg_kmem_uncharge(struct page *page, int order) | |||
2431 | 2373 | ||
2432 | /* | 2374 | /* |
2433 | * Because tail pages are not marked as "used", set it. We're under | 2375 | * Because tail pages are not marked as "used", set it. We're under |
2434 | * zone->lru_lock and migration entries setup in all page mappings. | 2376 | * zone_lru_lock and migration entries setup in all page mappings. |
2435 | */ | 2377 | */ |
2436 | void mem_cgroup_split_huge_fixup(struct page *head) | 2378 | void mem_cgroup_split_huge_fixup(struct page *head) |
2437 | { | 2379 | { |
@@ -2601,22 +2543,22 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
2601 | return ret; | 2543 | return ret; |
2602 | } | 2544 | } |
2603 | 2545 | ||
2604 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | 2546 | unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, |
2605 | gfp_t gfp_mask, | 2547 | gfp_t gfp_mask, |
2606 | unsigned long *total_scanned) | 2548 | unsigned long *total_scanned) |
2607 | { | 2549 | { |
2608 | unsigned long nr_reclaimed = 0; | 2550 | unsigned long nr_reclaimed = 0; |
2609 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | 2551 | struct mem_cgroup_per_node *mz, *next_mz = NULL; |
2610 | unsigned long reclaimed; | 2552 | unsigned long reclaimed; |
2611 | int loop = 0; | 2553 | int loop = 0; |
2612 | struct mem_cgroup_tree_per_zone *mctz; | 2554 | struct mem_cgroup_tree_per_node *mctz; |
2613 | unsigned long excess; | 2555 | unsigned long excess; |
2614 | unsigned long nr_scanned; | 2556 | unsigned long nr_scanned; |
2615 | 2557 | ||
2616 | if (order > 0) | 2558 | if (order > 0) |
2617 | return 0; | 2559 | return 0; |
2618 | 2560 | ||
2619 | mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); | 2561 | mctz = soft_limit_tree_node(pgdat->node_id); |
2620 | /* | 2562 | /* |
2621 | * This loop can run a while, specially if mem_cgroup's continuously | 2563 | * This loop can run a while, specially if mem_cgroup's continuously |
2622 | * keep exceeding their soft limit and putting the system under | 2564 | * keep exceeding their soft limit and putting the system under |
@@ -2631,7 +2573,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
2631 | break; | 2573 | break; |
2632 | 2574 | ||
2633 | nr_scanned = 0; | 2575 | nr_scanned = 0; |
2634 | reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, | 2576 | reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, |
2635 | gfp_mask, &nr_scanned); | 2577 | gfp_mask, &nr_scanned); |
2636 | nr_reclaimed += reclaimed; | 2578 | nr_reclaimed += reclaimed; |
2637 | *total_scanned += nr_scanned; | 2579 | *total_scanned += nr_scanned; |
@@ -3252,22 +3194,21 @@ static int memcg_stat_show(struct seq_file *m, void *v) | |||
3252 | 3194 | ||
3253 | #ifdef CONFIG_DEBUG_VM | 3195 | #ifdef CONFIG_DEBUG_VM |
3254 | { | 3196 | { |
3255 | int nid, zid; | 3197 | pg_data_t *pgdat; |
3256 | struct mem_cgroup_per_zone *mz; | 3198 | struct mem_cgroup_per_node *mz; |
3257 | struct zone_reclaim_stat *rstat; | 3199 | struct zone_reclaim_stat *rstat; |
3258 | unsigned long recent_rotated[2] = {0, 0}; | 3200 | unsigned long recent_rotated[2] = {0, 0}; |
3259 | unsigned long recent_scanned[2] = {0, 0}; | 3201 | unsigned long recent_scanned[2] = {0, 0}; |
3260 | 3202 | ||
3261 | for_each_online_node(nid) | 3203 | for_each_online_pgdat(pgdat) { |
3262 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 3204 | mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); |
3263 | mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; | 3205 | rstat = &mz->lruvec.reclaim_stat; |
3264 | rstat = &mz->lruvec.reclaim_stat; | ||
3265 | 3206 | ||
3266 | recent_rotated[0] += rstat->recent_rotated[0]; | 3207 | recent_rotated[0] += rstat->recent_rotated[0]; |
3267 | recent_rotated[1] += rstat->recent_rotated[1]; | 3208 | recent_rotated[1] += rstat->recent_rotated[1]; |
3268 | recent_scanned[0] += rstat->recent_scanned[0]; | 3209 | recent_scanned[0] += rstat->recent_scanned[0]; |
3269 | recent_scanned[1] += rstat->recent_scanned[1]; | 3210 | recent_scanned[1] += rstat->recent_scanned[1]; |
3270 | } | 3211 | } |
3271 | seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); | 3212 | seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); |
3272 | seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); | 3213 | seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); |
3273 | seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); | 3214 | seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); |
@@ -4147,11 +4088,10 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) | |||
4147 | return idr_find(&mem_cgroup_idr, id); | 4088 | return idr_find(&mem_cgroup_idr, id); |
4148 | } | 4089 | } |
4149 | 4090 | ||
4150 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | 4091 | static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) |
4151 | { | 4092 | { |
4152 | struct mem_cgroup_per_node *pn; | 4093 | struct mem_cgroup_per_node *pn; |
4153 | struct mem_cgroup_per_zone *mz; | 4094 | int tmp = node; |
4154 | int zone, tmp = node; | ||
4155 | /* | 4095 | /* |
4156 | * This routine is called against possible nodes. | 4096 | * This routine is called against possible nodes. |
4157 | * But it's BUG to call kmalloc() against offline node. | 4097 | * But it's BUG to call kmalloc() against offline node. |
@@ -4166,18 +4106,16 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
4166 | if (!pn) | 4106 | if (!pn) |
4167 | return 1; | 4107 | return 1; |
4168 | 4108 | ||
4169 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 4109 | lruvec_init(&pn->lruvec); |
4170 | mz = &pn->zoneinfo[zone]; | 4110 | pn->usage_in_excess = 0; |
4171 | lruvec_init(&mz->lruvec); | 4111 | pn->on_tree = false; |
4172 | mz->usage_in_excess = 0; | 4112 | pn->memcg = memcg; |
4173 | mz->on_tree = false; | 4113 | |
4174 | mz->memcg = memcg; | ||
4175 | } | ||
4176 | memcg->nodeinfo[node] = pn; | 4114 | memcg->nodeinfo[node] = pn; |
4177 | return 0; | 4115 | return 0; |
4178 | } | 4116 | } |
4179 | 4117 | ||
4180 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | 4118 | static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) |
4181 | { | 4119 | { |
4182 | kfree(memcg->nodeinfo[node]); | 4120 | kfree(memcg->nodeinfo[node]); |
4183 | } | 4121 | } |
@@ -4188,7 +4126,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) | |||
4188 | 4126 | ||
4189 | memcg_wb_domain_exit(memcg); | 4127 | memcg_wb_domain_exit(memcg); |
4190 | for_each_node(node) | 4128 | for_each_node(node) |
4191 | free_mem_cgroup_per_zone_info(memcg, node); | 4129 | free_mem_cgroup_per_node_info(memcg, node); |
4192 | free_percpu(memcg->stat); | 4130 | free_percpu(memcg->stat); |
4193 | kfree(memcg); | 4131 | kfree(memcg); |
4194 | } | 4132 | } |
@@ -4217,7 +4155,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) | |||
4217 | goto fail; | 4155 | goto fail; |
4218 | 4156 | ||
4219 | for_each_node(node) | 4157 | for_each_node(node) |
4220 | if (alloc_mem_cgroup_per_zone_info(memcg, node)) | 4158 | if (alloc_mem_cgroup_per_node_info(memcg, node)) |
4221 | goto fail; | 4159 | goto fail; |
4222 | 4160 | ||
4223 | if (memcg_wb_domain_init(memcg, GFP_KERNEL)) | 4161 | if (memcg_wb_domain_init(memcg, GFP_KERNEL)) |
@@ -5233,7 +5171,7 @@ static int memory_stat_show(struct seq_file *m, void *v) | |||
5233 | seq_printf(m, "file %llu\n", | 5171 | seq_printf(m, "file %llu\n", |
5234 | (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); | 5172 | (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); |
5235 | seq_printf(m, "kernel_stack %llu\n", | 5173 | seq_printf(m, "kernel_stack %llu\n", |
5236 | (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); | 5174 | (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); |
5237 | seq_printf(m, "slab %llu\n", | 5175 | seq_printf(m, "slab %llu\n", |
5238 | (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + | 5176 | (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + |
5239 | stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); | 5177 | stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); |
@@ -5820,18 +5758,12 @@ static int __init mem_cgroup_init(void) | |||
5820 | 5758 | ||
5821 | for_each_node(node) { | 5759 | for_each_node(node) { |
5822 | struct mem_cgroup_tree_per_node *rtpn; | 5760 | struct mem_cgroup_tree_per_node *rtpn; |
5823 | int zone; | ||
5824 | 5761 | ||
5825 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, | 5762 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, |
5826 | node_online(node) ? node : NUMA_NO_NODE); | 5763 | node_online(node) ? node : NUMA_NO_NODE); |
5827 | 5764 | ||
5828 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 5765 | rtpn->rb_root = RB_ROOT; |
5829 | struct mem_cgroup_tree_per_zone *rtpz; | 5766 | spin_lock_init(&rtpn->lock); |
5830 | |||
5831 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
5832 | rtpz->rb_root = RB_ROOT; | ||
5833 | spin_lock_init(&rtpz->lock); | ||
5834 | } | ||
5835 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | 5767 | soft_limit_tree.rb_tree_per_node[node] = rtpn; |
5836 | } | 5768 | } |
5837 | 5769 | ||
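The memcontrol.c changes above collapse the per-zone memcg structures into per-node ones, so the soft-limit tree, the reclaim iterators and the LRU size counters are all looked up by node id alone. The sketch below models the simplified per-node LRU summing (compare mem_cgroup_node_nr_lru_pages() above); the enum values and sample counts are invented for the example.

#include <stdio.h>

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON,
                LRU_INACTIVE_FILE, LRU_ACTIVE_FILE,
                LRU_UNEVICTABLE, NR_LRU_LISTS };

#define BIT(n) (1u << (n))
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))

/* One counter array per node: no inner per-zone loop any more. */
struct memcg_per_node { unsigned long lru_size[NR_LRU_LISTS]; };

static unsigned long node_nr_lru_pages(const struct memcg_per_node *mz,
                                       unsigned int lru_mask)
{
    unsigned long nr = 0;
    int lru;

    for (lru = 0; lru < NR_LRU_LISTS; lru++)
        if (lru_mask & BIT(lru))
            nr += mz->lru_size[lru];
    return nr;
}

int main(void)
{
    struct memcg_per_node node0 = { .lru_size = { 10, 20, 30, 40, 5 } };

    printf("file pages on node 0: %lu\n",
           node_nr_lru_pages(&node0, LRU_ALL_FILE));
    return 0;
}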
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2fcca6b0e005..de88f33519c0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -741,8 +741,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
741 | * page->lru because it can be used in other hugepage operations, | 741 | * page->lru because it can be used in other hugepage operations, |
742 | * such as __unmap_hugepage_range() and gather_surplus_pages(). | 742 | * such as __unmap_hugepage_range() and gather_surplus_pages(). |
743 | * So instead we use page_mapping() and PageAnon(). | 743 | * So instead we use page_mapping() and PageAnon(). |
744 | * We assume that this function is called with page lock held, | ||
745 | * so there is no race between isolation and mapping/unmapping. | ||
746 | */ | 744 | */ |
747 | if (!(page_mapping(hpage) || PageAnon(hpage))) { | 745 | if (!(page_mapping(hpage) || PageAnon(hpage))) { |
748 | res = dequeue_hwpoisoned_huge_page(hpage); | 746 | res = dequeue_hwpoisoned_huge_page(hpage); |
@@ -1663,7 +1661,7 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1663 | put_hwpoison_page(page); | 1661 | put_hwpoison_page(page); |
1664 | if (!ret) { | 1662 | if (!ret) { |
1665 | LIST_HEAD(pagelist); | 1663 | LIST_HEAD(pagelist); |
1666 | inc_zone_page_state(page, NR_ISOLATED_ANON + | 1664 | inc_node_page_state(page, NR_ISOLATED_ANON + |
1667 | page_is_file_cache(page)); | 1665 | page_is_file_cache(page)); |
1668 | list_add(&page->lru, &pagelist); | 1666 | list_add(&page->lru, &pagelist); |
1669 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, | 1667 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
@@ -1671,7 +1669,7 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1671 | if (ret) { | 1669 | if (ret) { |
1672 | if (!list_empty(&pagelist)) { | 1670 | if (!list_empty(&pagelist)) { |
1673 | list_del(&page->lru); | 1671 | list_del(&page->lru); |
1674 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 1672 | dec_node_page_state(page, NR_ISOLATED_ANON + |
1675 | page_is_file_cache(page)); | 1673 | page_is_file_cache(page)); |
1676 | putback_lru_page(page); | 1674 | putback_lru_page(page); |
1677 | } | 1675 | } |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 82d0b98d27f8..3894b65b1555 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -1209,9 +1209,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
1209 | 1209 | ||
1210 | arch_refresh_nodedata(nid, pgdat); | 1210 | arch_refresh_nodedata(nid, pgdat); |
1211 | } else { | 1211 | } else { |
1212 | /* Reset the nr_zones and classzone_idx to 0 before reuse */ | 1212 | /* Reset the nr_zones, order and classzone_idx before reuse */ |
1213 | pgdat->nr_zones = 0; | 1213 | pgdat->nr_zones = 0; |
1214 | pgdat->classzone_idx = 0; | 1214 | pgdat->kswapd_order = 0; |
1215 | pgdat->kswapd_classzone_idx = 0; | ||
1215 | } | 1216 | } |
1216 | 1217 | ||
1217 | /* we can use NODE_DATA(nid) from here */ | 1218 | /* we can use NODE_DATA(nid) from here */ |
@@ -1547,6 +1548,37 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) | |||
1547 | return 0; | 1548 | return 0; |
1548 | } | 1549 | } |
1549 | 1550 | ||
1551 | static struct page *new_node_page(struct page *page, unsigned long private, | ||
1552 | int **result) | ||
1553 | { | ||
1554 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; | ||
1555 | int nid = page_to_nid(page); | ||
1556 | nodemask_t nmask = node_online_map; | ||
1557 | struct page *new_page; | ||
1558 | |||
1559 | /* | ||
1560 | * TODO: allocate a destination hugepage from a nearest neighbor node, | ||
1561 | * accordance with memory policy of the user process if possible. For | ||
1562 | * now as a simple work-around, we use the next node for destination. | ||
1563 | */ | ||
1564 | if (PageHuge(page)) | ||
1565 | return alloc_huge_page_node(page_hstate(compound_head(page)), | ||
1566 | next_node_in(nid, nmask)); | ||
1567 | |||
1568 | node_clear(nid, nmask); | ||
1569 | if (PageHighMem(page) | ||
1570 | || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) | ||
1571 | gfp_mask |= __GFP_HIGHMEM; | ||
1572 | |||
1573 | new_page = __alloc_pages_nodemask(gfp_mask, 0, | ||
1574 | node_zonelist(nid, gfp_mask), &nmask); | ||
1575 | if (!new_page) | ||
1576 | new_page = __alloc_pages(gfp_mask, 0, | ||
1577 | node_zonelist(nid, gfp_mask)); | ||
1578 | |||
1579 | return new_page; | ||
1580 | } | ||
1581 | |||
1550 | #define NR_OFFLINE_AT_ONCE_PAGES (256) | 1582 | #define NR_OFFLINE_AT_ONCE_PAGES (256) |
1551 | static int | 1583 | static int |
1552 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | 1584 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) |
@@ -1586,7 +1618,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1586 | put_page(page); | 1618 | put_page(page); |
1587 | list_add_tail(&page->lru, &source); | 1619 | list_add_tail(&page->lru, &source); |
1588 | move_pages--; | 1620 | move_pages--; |
1589 | inc_zone_page_state(page, NR_ISOLATED_ANON + | 1621 | inc_node_page_state(page, NR_ISOLATED_ANON + |
1590 | page_is_file_cache(page)); | 1622 | page_is_file_cache(page)); |
1591 | 1623 | ||
1592 | } else { | 1624 | } else { |
@@ -1610,11 +1642,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1610 | goto out; | 1642 | goto out; |
1611 | } | 1643 | } |
1612 | 1644 | ||
1613 | /* | 1645 | /* Allocate a new page from the nearest neighbor node */ |
1614 | * alloc_migrate_target should be improooooved!! | 1646 | ret = migrate_pages(&source, new_node_page, NULL, 0, |
1615 | * migrate_pages returns # of failed pages. | ||
1616 | */ | ||
1617 | ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, | ||
1618 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); | 1647 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); |
1619 | if (ret) | 1648 | if (ret) |
1620 | putback_movable_pages(&source); | 1649 | putback_movable_pages(&source); |
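new_node_page() above replaces alloc_migrate_target() for memory offlining: the node being offlined is dropped from the candidate mask and the replacement page comes from a remaining node, with "the next node" standing in for a true nearest neighbor for now. A small userspace model of that node choice follows, with an invented bitmask in place of nodemask_t.

#include <stdio.h>

#define MAX_NODES 8

/* Pick the next node after @node that is set in @mask, wrapping around;
 * -1 means no other node is available. */
static int next_node_in(int node, unsigned int mask)
{
    int i;

    for (i = 1; i <= MAX_NODES; i++) {
        int candidate = (node + i) % MAX_NODES;

        if (mask & (1u << candidate))
            return candidate;
    }
    return -1;
}

int main(void)
{
    unsigned int online = 0x0b;     /* nodes 0, 1 and 3 online */
    int offlining = 1;

    online &= ~(1u << offlining);   /* mirror node_clear(nid, nmask) */
    printf("migrate pages from node %d to node %d\n",
           offlining, next_node_in(offlining, online));
    return 0;
}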
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 53e40d3f3933..d8c4e38fb5f4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -962,7 +962,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
962 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { | 962 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { |
963 | if (!isolate_lru_page(page)) { | 963 | if (!isolate_lru_page(page)) { |
964 | list_add_tail(&page->lru, pagelist); | 964 | list_add_tail(&page->lru, pagelist); |
965 | inc_zone_page_state(page, NR_ISOLATED_ANON + | 965 | inc_node_page_state(page, NR_ISOLATED_ANON + |
966 | page_is_file_cache(page)); | 966 | page_is_file_cache(page)); |
967 | } | 967 | } |
968 | } | 968 | } |
diff --git a/mm/mempool.c b/mm/mempool.c index 8f65464da5de..47a659dedd44 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -306,7 +306,7 @@ EXPORT_SYMBOL(mempool_resize); | |||
306 | * returns NULL. Note that due to preallocation, this function | 306 | * returns NULL. Note that due to preallocation, this function |
307 | * *never* fails when called from process contexts. (it might | 307 | * *never* fails when called from process contexts. (it might |
308 | * fail if called from an IRQ context.) | 308 | * fail if called from an IRQ context.) |
309 | * Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported. | 309 | * Note: using __GFP_ZERO is not supported. |
310 | */ | 310 | */ |
311 | void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) | 311 | void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) |
312 | { | 312 | { |
@@ -315,27 +315,16 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) | |||
315 | wait_queue_t wait; | 315 | wait_queue_t wait; |
316 | gfp_t gfp_temp; | 316 | gfp_t gfp_temp; |
317 | 317 | ||
318 | /* If oom killed, memory reserves are essential to prevent livelock */ | ||
319 | VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC); | ||
320 | /* No element size to zero on allocation */ | ||
321 | VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); | 318 | VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); |
322 | |||
323 | might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); | 319 | might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); |
324 | 320 | ||
321 | gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ | ||
325 | gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ | 322 | gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ |
326 | gfp_mask |= __GFP_NOWARN; /* failures are OK */ | 323 | gfp_mask |= __GFP_NOWARN; /* failures are OK */ |
327 | 324 | ||
328 | gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); | 325 | gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); |
329 | 326 | ||
330 | repeat_alloc: | 327 | repeat_alloc: |
331 | if (likely(pool->curr_nr)) { | ||
332 | /* | ||
333 | * Don't allocate from emergency reserves if there are | ||
334 | * elements available. This check is racy, but it will | ||
335 | * be rechecked each loop. | ||
336 | */ | ||
337 | gfp_temp |= __GFP_NOMEMALLOC; | ||
338 | } | ||
339 | 328 | ||
340 | element = pool->alloc(gfp_temp, pool->pool_data); | 329 | element = pool->alloc(gfp_temp, pool->pool_data); |
341 | if (likely(element != NULL)) | 330 | if (likely(element != NULL)) |
@@ -359,12 +348,11 @@ repeat_alloc: | |||
359 | * We use gfp mask w/o direct reclaim or IO for the first round. If | 348 | * We use gfp mask w/o direct reclaim or IO for the first round. If |
360 | * alloc failed with that and @pool was empty, retry immediately. | 349 | * alloc failed with that and @pool was empty, retry immediately. |
361 | */ | 350 | */ |
362 | if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) { | 351 | if (gfp_temp != gfp_mask) { |
363 | spin_unlock_irqrestore(&pool->lock, flags); | 352 | spin_unlock_irqrestore(&pool->lock, flags); |
364 | gfp_temp = gfp_mask; | 353 | gfp_temp = gfp_mask; |
365 | goto repeat_alloc; | 354 | goto repeat_alloc; |
366 | } | 355 | } |
367 | gfp_temp = gfp_mask; | ||
368 | 356 | ||
369 | /* We must not sleep if !__GFP_DIRECT_RECLAIM */ | 357 | /* We must not sleep if !__GFP_DIRECT_RECLAIM */ |
370 | if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { | 358 | if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { |
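The mempool hunk above reverts the conditional use of emergency reserves: mempool_alloc() now always ORs in __GFP_NOMEMALLOC alongside __GFP_NORETRY and __GFP_NOWARN, and the first pass additionally masks out direct reclaim and IO so it can fall back to the preallocated elements quickly. A standalone model of that mask derivation is below; the flag values are illustrative, not the kernel's.

#include <stdio.h>

#define GFP_DIRECT_RECLAIM 0x01u
#define GFP_IO             0x02u
#define GFP_NOMEMALLOC     0x04u
#define GFP_NORETRY        0x08u
#define GFP_NOWARN         0x10u

/* Returns the relaxed mask used for the first attempt and stores the
 * full mask (used on retry after the pool is found empty) in *full_mask. */
static unsigned int first_attempt_mask(unsigned int gfp_mask,
                                       unsigned int *full_mask)
{
    gfp_mask |= GFP_NOMEMALLOC | GFP_NORETRY | GFP_NOWARN;
    *full_mask = gfp_mask;
    return gfp_mask & ~(GFP_DIRECT_RECLAIM | GFP_IO);
}

int main(void)
{
    unsigned int full, first;

    first = first_attempt_mask(GFP_DIRECT_RECLAIM | GFP_IO, &full);
    printf("first attempt: %#x, retry with: %#x\n", first, full);
    return 0;
}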
diff --git a/mm/migrate.c b/mm/migrate.c index 2232f6923cc7..f7ee04a5ae27 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -168,7 +168,7 @@ void putback_movable_pages(struct list_head *l) | |||
168 | continue; | 168 | continue; |
169 | } | 169 | } |
170 | list_del(&page->lru); | 170 | list_del(&page->lru); |
171 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 171 | dec_node_page_state(page, NR_ISOLATED_ANON + |
172 | page_is_file_cache(page)); | 172 | page_is_file_cache(page)); |
173 | /* | 173 | /* |
174 | * We isolated non-lru movable page so here we can use | 174 | * We isolated non-lru movable page so here we can use |
@@ -501,19 +501,21 @@ int migrate_page_move_mapping(struct address_space *mapping, | |||
501 | * new page and drop references to the old page. | 501 | * new page and drop references to the old page. |
502 | * | 502 | * |
503 | * Note that anonymous pages are accounted for | 503 | * Note that anonymous pages are accounted for |
504 | * via NR_FILE_PAGES and NR_ANON_PAGES if they | 504 | * via NR_FILE_PAGES and NR_ANON_MAPPED if they |
505 | * are mapped to swap space. | 505 | * are mapped to swap space. |
506 | */ | 506 | */ |
507 | if (newzone != oldzone) { | 507 | if (newzone != oldzone) { |
508 | __dec_zone_state(oldzone, NR_FILE_PAGES); | 508 | __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES); |
509 | __inc_zone_state(newzone, NR_FILE_PAGES); | 509 | __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES); |
510 | if (PageSwapBacked(page) && !PageSwapCache(page)) { | 510 | if (PageSwapBacked(page) && !PageSwapCache(page)) { |
511 | __dec_zone_state(oldzone, NR_SHMEM); | 511 | __dec_node_state(oldzone->zone_pgdat, NR_SHMEM); |
512 | __inc_zone_state(newzone, NR_SHMEM); | 512 | __inc_node_state(newzone->zone_pgdat, NR_SHMEM); |
513 | } | 513 | } |
514 | if (dirty && mapping_cap_account_dirty(mapping)) { | 514 | if (dirty && mapping_cap_account_dirty(mapping)) { |
515 | __dec_zone_state(oldzone, NR_FILE_DIRTY); | 515 | __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); |
516 | __inc_zone_state(newzone, NR_FILE_DIRTY); | 516 | __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING); |
517 | __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY); | ||
518 | __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING); | ||
517 | } | 519 | } |
518 | } | 520 | } |
519 | local_irq_enable(); | 521 | local_irq_enable(); |
@@ -1119,7 +1121,7 @@ out: | |||
1119 | * restored. | 1121 | * restored. |
1120 | */ | 1122 | */ |
1121 | list_del(&page->lru); | 1123 | list_del(&page->lru); |
1122 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 1124 | dec_node_page_state(page, NR_ISOLATED_ANON + |
1123 | page_is_file_cache(page)); | 1125 | page_is_file_cache(page)); |
1124 | } | 1126 | } |
1125 | 1127 | ||
@@ -1460,7 +1462,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
1460 | err = isolate_lru_page(page); | 1462 | err = isolate_lru_page(page); |
1461 | if (!err) { | 1463 | if (!err) { |
1462 | list_add_tail(&page->lru, &pagelist); | 1464 | list_add_tail(&page->lru, &pagelist); |
1463 | inc_zone_page_state(page, NR_ISOLATED_ANON + | 1465 | inc_node_page_state(page, NR_ISOLATED_ANON + |
1464 | page_is_file_cache(page)); | 1466 | page_is_file_cache(page)); |
1465 | } | 1467 | } |
1466 | put_and_set: | 1468 | put_and_set: |
@@ -1726,15 +1728,16 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, | |||
1726 | unsigned long nr_migrate_pages) | 1728 | unsigned long nr_migrate_pages) |
1727 | { | 1729 | { |
1728 | int z; | 1730 | int z; |
1731 | |||
1732 | if (!pgdat_reclaimable(pgdat)) | ||
1733 | return false; | ||
1734 | |||
1729 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { | 1735 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { |
1730 | struct zone *zone = pgdat->node_zones + z; | 1736 | struct zone *zone = pgdat->node_zones + z; |
1731 | 1737 | ||
1732 | if (!populated_zone(zone)) | 1738 | if (!populated_zone(zone)) |
1733 | continue; | 1739 | continue; |
1734 | 1740 | ||
1735 | if (!zone_reclaimable(zone)) | ||
1736 | continue; | ||
1737 | |||
1738 | /* Avoid waking kswapd by allocating pages_to_migrate pages. */ | 1741 | /* Avoid waking kswapd by allocating pages_to_migrate pages. */ |
1739 | if (!zone_watermark_ok(zone, 0, | 1742 | if (!zone_watermark_ok(zone, 0, |
1740 | high_wmark_pages(zone) + | 1743 | high_wmark_pages(zone) + |
@@ -1828,7 +1831,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | |||
1828 | } | 1831 | } |
1829 | 1832 | ||
1830 | page_lru = page_is_file_cache(page); | 1833 | page_lru = page_is_file_cache(page); |
1831 | mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, | 1834 | mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, |
1832 | hpage_nr_pages(page)); | 1835 | hpage_nr_pages(page)); |
1833 | 1836 | ||
1834 | /* | 1837 | /* |
@@ -1886,7 +1889,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, | |||
1886 | if (nr_remaining) { | 1889 | if (nr_remaining) { |
1887 | if (!list_empty(&migratepages)) { | 1890 | if (!list_empty(&migratepages)) { |
1888 | list_del(&page->lru); | 1891 | list_del(&page->lru); |
1889 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 1892 | dec_node_page_state(page, NR_ISOLATED_ANON + |
1890 | page_is_file_cache(page)); | 1893 | page_is_file_cache(page)); |
1891 | putback_lru_page(page); | 1894 | putback_lru_page(page); |
1892 | } | 1895 | } |
@@ -1931,7 +1934,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1931 | goto out_dropref; | 1934 | goto out_dropref; |
1932 | 1935 | ||
1933 | new_page = alloc_pages_node(node, | 1936 | new_page = alloc_pages_node(node, |
1934 | (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, | 1937 | (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), |
1935 | HPAGE_PMD_ORDER); | 1938 | HPAGE_PMD_ORDER); |
1936 | if (!new_page) | 1939 | if (!new_page) |
1937 | goto out_fail; | 1940 | goto out_fail; |
@@ -1979,7 +1982,7 @@ fail_putback: | |||
1979 | /* Retake the callers reference and putback on LRU */ | 1982 | /* Retake the callers reference and putback on LRU */ |
1980 | get_page(page); | 1983 | get_page(page); |
1981 | putback_lru_page(page); | 1984 | putback_lru_page(page); |
1982 | mod_zone_page_state(page_zone(page), | 1985 | mod_node_page_state(page_pgdat(page), |
1983 | NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); | 1986 | NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); |
1984 | 1987 | ||
1985 | goto out_unlock; | 1988 | goto out_unlock; |
@@ -2030,7 +2033,7 @@ fail_putback: | |||
2030 | count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); | 2033 | count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); |
2031 | count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); | 2034 | count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); |
2032 | 2035 | ||
2033 | mod_zone_page_state(page_zone(page), | 2036 | mod_node_page_state(page_pgdat(page), |
2034 | NR_ISOLATED_ANON + page_lru, | 2037 | NR_ISOLATED_ANON + page_lru, |
2035 | -HPAGE_PMD_NR); | 2038 | -HPAGE_PMD_NR); |
2036 | return isolated; | 2039 | return isolated; |
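
Every migrate.c hunk above makes the same substitution: the NR_ISOLATED_* bookkeeping moves from per-zone to per-node counters, still indexed by NR_ISOLATED_ANON + page_is_file_cache(page) so anon and file pages land in adjacent slots. A toy model of that indexing trick, with purely illustrative types:

#include <stdio.h>

/* Stand-ins for the node stat items touched above; FILE follows ANON. */
enum node_stat { NR_ISOLATED_ANON, NR_ISOLATED_FILE, NR_NODE_STATS };

struct node { long stat[NR_NODE_STATS]; };

static void mod_node_stat(struct node *n, int item, long delta)
{
        n->stat[item] += delta;
}

int main(void)
{
        struct node node = { { 0 } };
        int is_file = 1;        /* what page_is_file_cache(page) would report */

        /* NR_ISOLATED_ANON + 1 selects the file counter, as in the hunks. */
        mod_node_stat(&node, NR_ISOLATED_ANON + is_file, 1);    /* isolate */
        printf("isolated file pages: %ld\n", node.stat[NR_ISOLATED_FILE]);
        mod_node_stat(&node, NR_ISOLATED_ANON + is_file, -1);   /* put back */
        printf("after putback: %ld\n", node.stat[NR_ISOLATED_FILE]);
        return 0;
}
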
diff --git a/mm/mlock.c b/mm/mlock.c index ef8dc9f395c4..14645be06e30 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -103,7 +103,7 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage) | |||
103 | if (PageLRU(page)) { | 103 | if (PageLRU(page)) { |
104 | struct lruvec *lruvec; | 104 | struct lruvec *lruvec; |
105 | 105 | ||
106 | lruvec = mem_cgroup_page_lruvec(page, page_zone(page)); | 106 | lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); |
107 | if (getpage) | 107 | if (getpage) |
108 | get_page(page); | 108 | get_page(page); |
109 | ClearPageLRU(page); | 109 | ClearPageLRU(page); |
@@ -188,7 +188,7 @@ unsigned int munlock_vma_page(struct page *page) | |||
188 | * might otherwise copy PageMlocked to part of the tail pages before | 188 | * might otherwise copy PageMlocked to part of the tail pages before |
189 | * we clear it in the head page. It also stabilizes hpage_nr_pages(). | 189 | * we clear it in the head page. It also stabilizes hpage_nr_pages(). |
190 | */ | 190 | */ |
191 | spin_lock_irq(&zone->lru_lock); | 191 | spin_lock_irq(zone_lru_lock(zone)); |
192 | 192 | ||
193 | nr_pages = hpage_nr_pages(page); | 193 | nr_pages = hpage_nr_pages(page); |
194 | if (!TestClearPageMlocked(page)) | 194 | if (!TestClearPageMlocked(page)) |
@@ -197,14 +197,14 @@ unsigned int munlock_vma_page(struct page *page) | |||
197 | __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); | 197 | __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); |
198 | 198 | ||
199 | if (__munlock_isolate_lru_page(page, true)) { | 199 | if (__munlock_isolate_lru_page(page, true)) { |
200 | spin_unlock_irq(&zone->lru_lock); | 200 | spin_unlock_irq(zone_lru_lock(zone)); |
201 | __munlock_isolated_page(page); | 201 | __munlock_isolated_page(page); |
202 | goto out; | 202 | goto out; |
203 | } | 203 | } |
204 | __munlock_isolation_failed(page); | 204 | __munlock_isolation_failed(page); |
205 | 205 | ||
206 | unlock_out: | 206 | unlock_out: |
207 | spin_unlock_irq(&zone->lru_lock); | 207 | spin_unlock_irq(zone_lru_lock(zone)); |
208 | 208 | ||
209 | out: | 209 | out: |
210 | return nr_pages - 1; | 210 | return nr_pages - 1; |
@@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) | |||
289 | pagevec_init(&pvec_putback, 0); | 289 | pagevec_init(&pvec_putback, 0); |
290 | 290 | ||
291 | /* Phase 1: page isolation */ | 291 | /* Phase 1: page isolation */ |
292 | spin_lock_irq(&zone->lru_lock); | 292 | spin_lock_irq(zone_lru_lock(zone)); |
293 | for (i = 0; i < nr; i++) { | 293 | for (i = 0; i < nr; i++) { |
294 | struct page *page = pvec->pages[i]; | 294 | struct page *page = pvec->pages[i]; |
295 | 295 | ||
@@ -315,7 +315,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) | |||
315 | } | 315 | } |
316 | delta_munlocked = -nr + pagevec_count(&pvec_putback); | 316 | delta_munlocked = -nr + pagevec_count(&pvec_putback); |
317 | __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); | 317 | __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); |
318 | spin_unlock_irq(&zone->lru_lock); | 318 | spin_unlock_irq(zone_lru_lock(zone)); |
319 | 319 | ||
320 | /* Now we can release pins of pages that we are not munlocking */ | 320 | /* Now we can release pins of pages that we are not munlocking */ |
321 | pagevec_release(&pvec_putback); | 321 | pagevec_release(&pvec_putback); |
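
The mlock.c changes are mechanical: every &zone->lru_lock becomes zone_lru_lock(zone), which in this series resolves to a single lru_lock kept on the zone's node. A simplified, compilable sketch of that indirection (the struct layouts are placeholders for the real pglist_data and zone):

#include <stdio.h>

typedef int spinlock_t;         /* placeholder for the kernel type */

/* Simplified stand-ins; the real structures carry far more state. */
struct pglist_data { spinlock_t lru_lock; };
struct zone { struct pglist_data *zone_pgdat; };

/* The wrapper the hunks above switch to: the LRU lock lives on the node. */
static spinlock_t *zone_lru_lock(struct zone *zone)
{
        return &zone->zone_pgdat->lru_lock;
}

int main(void)
{
        struct pglist_data node = { 0 };
        struct zone dma = { &node }, normal = { &node };

        /* Every zone of a node now resolves to one and the same lock. */
        printf("same lock: %d\n", zone_lru_lock(&dma) == zone_lru_lock(&normal));
        return 0;
}
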
@@ -621,7 +621,6 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
621 | { | 621 | { |
622 | struct mm_struct *mm = vma->vm_mm; | 622 | struct mm_struct *mm = vma->vm_mm; |
623 | struct vm_area_struct *next = vma->vm_next; | 623 | struct vm_area_struct *next = vma->vm_next; |
624 | struct vm_area_struct *importer = NULL; | ||
625 | struct address_space *mapping = NULL; | 624 | struct address_space *mapping = NULL; |
626 | struct rb_root *root = NULL; | 625 | struct rb_root *root = NULL; |
627 | struct anon_vma *anon_vma = NULL; | 626 | struct anon_vma *anon_vma = NULL; |
@@ -631,17 +630,25 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
631 | int remove_next = 0; | 630 | int remove_next = 0; |
632 | 631 | ||
633 | if (next && !insert) { | 632 | if (next && !insert) { |
634 | struct vm_area_struct *exporter = NULL; | 633 | struct vm_area_struct *exporter = NULL, *importer = NULL; |
635 | 634 | ||
636 | if (end >= next->vm_end) { | 635 | if (end >= next->vm_end) { |
637 | /* | 636 | /* |
638 | * vma expands, overlapping all the next, and | 637 | * vma expands, overlapping all the next, and |
639 | * perhaps the one after too (mprotect case 6). | 638 | * perhaps the one after too (mprotect case 6). |
640 | */ | 639 | */ |
641 | again: remove_next = 1 + (end > next->vm_end); | 640 | remove_next = 1 + (end > next->vm_end); |
642 | end = next->vm_end; | 641 | end = next->vm_end; |
643 | exporter = next; | 642 | exporter = next; |
644 | importer = vma; | 643 | importer = vma; |
644 | |||
645 | /* | ||
646 | * If next doesn't have anon_vma, import from vma after | ||
647 | * next, if the vma overlaps with it. | ||
648 | */ | ||
649 | if (remove_next == 2 && next && !next->anon_vma) | ||
650 | exporter = next->vm_next; | ||
651 | |||
645 | } else if (end > next->vm_start) { | 652 | } else if (end > next->vm_start) { |
646 | /* | 653 | /* |
647 | * vma expands, overlapping part of the next: | 654 | * vma expands, overlapping part of the next: |
@@ -675,7 +682,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
675 | return error; | 682 | return error; |
676 | } | 683 | } |
677 | } | 684 | } |
678 | 685 | again: | |
679 | vma_adjust_trans_huge(vma, start, end, adjust_next); | 686 | vma_adjust_trans_huge(vma, start, end, adjust_next); |
680 | 687 | ||
681 | if (file) { | 688 | if (file) { |
@@ -796,8 +803,11 @@ again: remove_next = 1 + (end > next->vm_end); | |||
796 | * up the code too much to do both in one go. | 803 | * up the code too much to do both in one go. |
797 | */ | 804 | */ |
798 | next = vma->vm_next; | 805 | next = vma->vm_next; |
799 | if (remove_next == 2) | 806 | if (remove_next == 2) { |
807 | remove_next = 1; | ||
808 | end = next->vm_end; | ||
800 | goto again; | 809 | goto again; |
810 | } | ||
801 | else if (next) | 811 | else if (next) |
802 | vma_gap_update(next); | 812 | vma_gap_update(next); |
803 | else | 813 | else |
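
The vma_adjust() hunk above (the function lives in mm/mmap.c) moves the again: label past the importer/exporter setup and refreshes end from the new next before looping, so the second pass of the remove_next == 2 case no longer re-runs work against freed state. A loose userspace analogue of that refresh-and-retry shape, using a toy range list instead of real VMAs:

#include <stdio.h>
#include <stdlib.h>

/* Toy "vma": a [start, end) range on a singly linked list. */
struct area { unsigned long start, end; struct area *next; };

/*
 * Grow 'vma' up to 'end', unlinking fully covered successors.  When a
 * second successor has to go as well (remove_next == 2), 'end' is
 * refreshed from it before the extra pass, as in the hunk above.
 */
static void expand(struct area *vma, unsigned long end)
{
        int remove_next = (vma->next && end > vma->next->end) ? 2 : 1;

again:
        vma->end = end;
        if (vma->next && vma->next->end <= end) {
                struct area *victim = vma->next;

                vma->next = victim->next;
                free(victim);
                if (remove_next == 2 && vma->next) {
                        remove_next = 1;
                        end = vma->next->end;
                        goto again;
                }
        }
}

int main(void)
{
        struct area *c = malloc(sizeof(*c)), *b = malloc(sizeof(*b));
        struct area a = { 0, 10, b };

        *b = (struct area){ 10, 20, c };
        *c = (struct area){ 20, 30, NULL };
        expand(&a, 25);         /* swallows b entirely and overlaps c */
        printf("a = [%lu, %lu), next = %p\n", a.start, a.end, (void *)a.next);
        free(a.next);
        return 0;
}
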
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d4a929d79470..7d0a275df822 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -176,11 +176,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
176 | 176 | ||
177 | /* | 177 | /* |
178 | * Do not even consider tasks which are explicitly marked oom | 178 | * Do not even consider tasks which are explicitly marked oom |
179 | * unkillable or have been already oom reaped. | 179 | * unkillable or have been already oom reaped or they are in |
180 | * the middle of vfork | ||
180 | */ | 181 | */ |
181 | adj = (long)p->signal->oom_score_adj; | 182 | adj = (long)p->signal->oom_score_adj; |
182 | if (adj == OOM_SCORE_ADJ_MIN || | 183 | if (adj == OOM_SCORE_ADJ_MIN || |
183 | test_bit(MMF_OOM_REAPED, &p->mm->flags)) { | 184 | test_bit(MMF_OOM_REAPED, &p->mm->flags) || |
185 | in_vfork(p)) { | ||
184 | task_unlock(p); | 186 | task_unlock(p); |
185 | return 0; | 187 | return 0; |
186 | } | 188 | } |
@@ -281,10 +283,22 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, | |||
281 | 283 | ||
282 | /* | 284 | /* |
283 | * This task already has access to memory reserves and is being killed. | 285 | * This task already has access to memory reserves and is being killed. |
284 | * Don't allow any other task to have access to the reserves. | 286 | * Don't allow any other task to have access to the reserves unless |
287 | * the task has MMF_OOM_REAPED because chances that it would release | ||
288 | * any memory are quite low. | ||
285 | */ | 289 | */ |
286 | if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) | 290 | if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) { |
287 | return OOM_SCAN_ABORT; | 291 | struct task_struct *p = find_lock_task_mm(task); |
292 | enum oom_scan_t ret = OOM_SCAN_ABORT; | ||
293 | |||
294 | if (p) { | ||
295 | if (test_bit(MMF_OOM_REAPED, &p->mm->flags)) | ||
296 | ret = OOM_SCAN_CONTINUE; | ||
297 | task_unlock(p); | ||
298 | } | ||
299 | |||
300 | return ret; | ||
301 | } | ||
288 | 302 | ||
289 | /* | 303 | /* |
290 | * If task is allocating a lot of memory and has been marked to be | 304 | * If task is allocating a lot of memory and has been marked to be |
@@ -415,7 +429,7 @@ bool oom_killer_disabled __read_mostly; | |||
415 | * task's threads: if one of those is using this mm then this task was also | 429 | * task's threads: if one of those is using this mm then this task was also |
416 | * using it. | 430 | * using it. |
417 | */ | 431 | */ |
418 | static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) | 432 | bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) |
419 | { | 433 | { |
420 | struct task_struct *t; | 434 | struct task_struct *t; |
421 | 435 | ||
@@ -554,8 +568,27 @@ static void oom_reap_task(struct task_struct *tsk) | |||
554 | schedule_timeout_idle(HZ/10); | 568 | schedule_timeout_idle(HZ/10); |
555 | 569 | ||
556 | if (attempts > MAX_OOM_REAP_RETRIES) { | 570 | if (attempts > MAX_OOM_REAP_RETRIES) { |
571 | struct task_struct *p; | ||
572 | |||
557 | pr_info("oom_reaper: unable to reap pid:%d (%s)\n", | 573 | pr_info("oom_reaper: unable to reap pid:%d (%s)\n", |
558 | task_pid_nr(tsk), tsk->comm); | 574 | task_pid_nr(tsk), tsk->comm); |
575 | |||
576 | /* | ||
577 | * If we've already tried to reap this task in the past and | ||
578 | * failed it probably doesn't make much sense to try yet again | ||
579 | * so hide the mm from the oom killer so that it can move on | ||
580 | * to another task with a different mm struct. | ||
581 | */ | ||
582 | p = find_lock_task_mm(tsk); | ||
583 | if (p) { | ||
584 | if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) { | ||
585 | pr_info("oom_reaper: giving up pid:%d (%s)\n", | ||
586 | task_pid_nr(tsk), tsk->comm); | ||
587 | set_bit(MMF_OOM_REAPED, &p->mm->flags); | ||
588 | } | ||
589 | task_unlock(p); | ||
590 | } | ||
591 | |||
559 | debug_show_all_locks(); | 592 | debug_show_all_locks(); |
560 | } | 593 | } |
561 | 594 | ||
@@ -594,7 +627,7 @@ static int oom_reaper(void *unused) | |||
594 | return 0; | 627 | return 0; |
595 | } | 628 | } |
596 | 629 | ||
597 | static void wake_oom_reaper(struct task_struct *tsk) | 630 | void wake_oom_reaper(struct task_struct *tsk) |
598 | { | 631 | { |
599 | if (!oom_reaper_th) | 632 | if (!oom_reaper_th) |
600 | return; | 633 | return; |
@@ -612,46 +645,6 @@ static void wake_oom_reaper(struct task_struct *tsk) | |||
612 | wake_up(&oom_reaper_wait); | 645 | wake_up(&oom_reaper_wait); |
613 | } | 646 | } |
614 | 647 | ||
615 | /* Check if we can reap the given task. This has to be called with stable | ||
616 | * tsk->mm | ||
617 | */ | ||
618 | void try_oom_reaper(struct task_struct *tsk) | ||
619 | { | ||
620 | struct mm_struct *mm = tsk->mm; | ||
621 | struct task_struct *p; | ||
622 | |||
623 | if (!mm) | ||
624 | return; | ||
625 | |||
626 | /* | ||
627 | * There might be other threads/processes which are either not | ||
628 | * dying or even not killable. | ||
629 | */ | ||
630 | if (atomic_read(&mm->mm_users) > 1) { | ||
631 | rcu_read_lock(); | ||
632 | for_each_process(p) { | ||
633 | if (!process_shares_mm(p, mm)) | ||
634 | continue; | ||
635 | if (fatal_signal_pending(p)) | ||
636 | continue; | ||
637 | |||
638 | /* | ||
639 | * If the task is exiting make sure the whole thread group | ||
640 | * is exiting and cannot acces mm anymore. | ||
641 | */ | ||
642 | if (signal_group_exit(p->signal)) | ||
643 | continue; | ||
644 | |||
645 | /* Give up */ | ||
646 | rcu_read_unlock(); | ||
647 | return; | ||
648 | } | ||
649 | rcu_read_unlock(); | ||
650 | } | ||
651 | |||
652 | wake_oom_reaper(tsk); | ||
653 | } | ||
654 | |||
655 | static int __init oom_init(void) | 648 | static int __init oom_init(void) |
656 | { | 649 | { |
657 | oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); | 650 | oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); |
@@ -663,10 +656,6 @@ static int __init oom_init(void) | |||
663 | return 0; | 656 | return 0; |
664 | } | 657 | } |
665 | subsys_initcall(oom_init) | 658 | subsys_initcall(oom_init) |
666 | #else | ||
667 | static void wake_oom_reaper(struct task_struct *tsk) | ||
668 | { | ||
669 | } | ||
670 | #endif | 659 | #endif |
671 | 660 | ||
672 | /** | 661 | /** |
@@ -743,6 +732,80 @@ void oom_killer_enable(void) | |||
743 | oom_killer_disabled = false; | 732 | oom_killer_disabled = false; |
744 | } | 733 | } |
745 | 734 | ||
735 | static inline bool __task_will_free_mem(struct task_struct *task) | ||
736 | { | ||
737 | struct signal_struct *sig = task->signal; | ||
738 | |||
739 | /* | ||
740 | * A coredumping process may sleep for an extended period in exit_mm(), | ||
741 | * so the oom killer cannot assume that the process will promptly exit | ||
742 | * and release memory. | ||
743 | */ | ||
744 | if (sig->flags & SIGNAL_GROUP_COREDUMP) | ||
745 | return false; | ||
746 | |||
747 | if (sig->flags & SIGNAL_GROUP_EXIT) | ||
748 | return true; | ||
749 | |||
750 | if (thread_group_empty(task) && (task->flags & PF_EXITING)) | ||
751 | return true; | ||
752 | |||
753 | return false; | ||
754 | } | ||
755 | |||
756 | /* | ||
757 | * Checks whether the given task is dying or exiting and likely to | ||
758 | * release its address space. This means that all threads and processes | ||
759 | * sharing the same mm have to be killed or exiting. | ||
760 | * Caller has to make sure that task->mm is stable (hold task_lock or | ||
761 | * it operates on the current). | ||
762 | */ | ||
763 | bool task_will_free_mem(struct task_struct *task) | ||
764 | { | ||
765 | struct mm_struct *mm = task->mm; | ||
766 | struct task_struct *p; | ||
767 | bool ret; | ||
768 | |||
769 | /* | ||
770 | * Skip tasks without mm because it might have passed its exit_mm and | ||
771 | * exit_oom_victim. oom_reaper could have rescued that but do not rely | ||
772 | * on that for now. We can consider find_lock_task_mm in future. | ||
773 | */ | ||
774 | if (!mm) | ||
775 | return false; | ||
776 | |||
777 | if (!__task_will_free_mem(task)) | ||
778 | return false; | ||
779 | |||
780 | /* | ||
781 | * This task has already been drained by the oom reaper so there are | ||
782 | * only small chances it will free some more | ||
783 | */ | ||
784 | if (test_bit(MMF_OOM_REAPED, &mm->flags)) | ||
785 | return false; | ||
786 | |||
787 | if (atomic_read(&mm->mm_users) <= 1) | ||
788 | return true; | ||
789 | |||
790 | /* | ||
791 | * This is really pessimistic but we do not have any reliable way | ||
792 | * to check that external processes share with our mm | ||
793 | */ | ||
794 | rcu_read_lock(); | ||
795 | for_each_process(p) { | ||
796 | if (!process_shares_mm(p, mm)) | ||
797 | continue; | ||
798 | if (same_thread_group(task, p)) | ||
799 | continue; | ||
800 | ret = __task_will_free_mem(p); | ||
801 | if (!ret) | ||
802 | break; | ||
803 | } | ||
804 | rcu_read_unlock(); | ||
805 | |||
806 | return ret; | ||
807 | } | ||
808 | |||
746 | /* | 809 | /* |
747 | * Must be called while holding a reference to p, which will be released upon | 810 | * Must be called while holding a reference to p, which will be released upon |
748 | * returning. | 811 | * returning. |
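
The new task_will_free_mem() boils down to one rule: an mm can only be expected to go away soon if every process sharing it is itself exiting and none of them is stuck in a coredump. A small standalone model of that sweep, with invented fields standing in for the signal flags and the mm pointer:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative task model; fields stand in for signal flags and the mm. */
struct task {
        bool coredumping;
        bool group_exiting;
        int mm_id;              /* 0 would mean "no mm" */
};

static bool task_is_dying(const struct task *t)
{
        if (t->coredumping)     /* may stall in exit for a long time */
                return false;
        return t->group_exiting;
}

/*
 * The whole-mm variant: the mm is only "about to be freed" if every task
 * sharing it is itself dying.  Same pessimistic sweep as the new helper.
 */
static bool mm_will_be_freed(const struct task *tasks, int n, int mm_id)
{
        for (int i = 0; i < n; i++) {
                if (tasks[i].mm_id != mm_id)
                        continue;
                if (!task_is_dying(&tasks[i]))
                        return false;
        }
        return true;
}

int main(void)
{
        struct task tasks[] = {
                { false, true,  1 },    /* exiting, shares mm 1 */
                { false, false, 1 },    /* still running, shares mm 1 */
                { false, true,  2 },    /* exiting, sole user of mm 2 */
        };

        printf("mm 1 freed soon? %d\n", mm_will_be_freed(tasks, 3, 1));
        printf("mm 2 freed soon? %d\n", mm_will_be_freed(tasks, 3, 2));
        return 0;
}
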
@@ -765,9 +828,9 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
765 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 828 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
766 | */ | 829 | */ |
767 | task_lock(p); | 830 | task_lock(p); |
768 | if (p->mm && task_will_free_mem(p)) { | 831 | if (task_will_free_mem(p)) { |
769 | mark_oom_victim(p); | 832 | mark_oom_victim(p); |
770 | try_oom_reaper(p); | 833 | wake_oom_reaper(p); |
771 | task_unlock(p); | 834 | task_unlock(p); |
772 | put_task_struct(p); | 835 | put_task_struct(p); |
773 | return; | 836 | return; |
@@ -850,14 +913,18 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
850 | continue; | 913 | continue; |
851 | if (same_thread_group(p, victim)) | 914 | if (same_thread_group(p, victim)) |
852 | continue; | 915 | continue; |
853 | if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) || | 916 | if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) { |
854 | p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { | ||
855 | /* | 917 | /* |
856 | * We cannot use oom_reaper for the mm shared by this | 918 | * We cannot use oom_reaper for the mm shared by this |
857 | * process because it wouldn't get killed and so the | 919 | * process because it wouldn't get killed and so the |
858 | * memory might be still used. | 920 | * memory might be still used. Hide the mm from the oom |
921 | * killer to guarantee OOM forward progress. | ||
859 | */ | 922 | */ |
860 | can_oom_reap = false; | 923 | can_oom_reap = false; |
924 | set_bit(MMF_OOM_REAPED, &mm->flags); | ||
925 | pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", | ||
926 | task_pid_nr(victim), victim->comm, | ||
927 | task_pid_nr(p), p->comm); | ||
861 | continue; | 928 | continue; |
862 | } | 929 | } |
863 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); | 930 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); |
@@ -939,14 +1006,10 @@ bool out_of_memory(struct oom_control *oc) | |||
939 | * If current has a pending SIGKILL or is exiting, then automatically | 1006 | * If current has a pending SIGKILL or is exiting, then automatically |
940 | * select it. The goal is to allow it to allocate so that it may | 1007 | * select it. The goal is to allow it to allocate so that it may |
941 | * quickly exit and free its memory. | 1008 | * quickly exit and free its memory. |
942 | * | ||
943 | * But don't select if current has already released its mm and cleared | ||
944 | * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur. | ||
945 | */ | 1009 | */ |
946 | if (current->mm && | 1010 | if (task_will_free_mem(current)) { |
947 | (fatal_signal_pending(current) || task_will_free_mem(current))) { | ||
948 | mark_oom_victim(current); | 1011 | mark_oom_victim(current); |
949 | try_oom_reaper(current); | 1012 | wake_oom_reaper(current); |
950 | return true; | 1013 | return true; |
951 | } | 1014 | } |
952 | 1015 | ||
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d578d2a56b19..f4cd7d8005c9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -267,26 +267,35 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, | |||
267 | */ | 267 | */ |
268 | 268 | ||
269 | /** | 269 | /** |
270 | * zone_dirtyable_memory - number of dirtyable pages in a zone | 270 | * node_dirtyable_memory - number of dirtyable pages in a node |
271 | * @zone: the zone | 271 | * @pgdat: the node |
272 | * | 272 | * |
273 | * Returns the zone's number of pages potentially available for dirty | 273 | * Returns the node's number of pages potentially available for dirty |
274 | * page cache. This is the base value for the per-zone dirty limits. | 274 | * page cache. This is the base value for the per-node dirty limits. |
275 | */ | 275 | */ |
276 | static unsigned long zone_dirtyable_memory(struct zone *zone) | 276 | static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) |
277 | { | 277 | { |
278 | unsigned long nr_pages; | 278 | unsigned long nr_pages = 0; |
279 | int z; | ||
280 | |||
281 | for (z = 0; z < MAX_NR_ZONES; z++) { | ||
282 | struct zone *zone = pgdat->node_zones + z; | ||
283 | |||
284 | if (!populated_zone(zone)) | ||
285 | continue; | ||
286 | |||
287 | nr_pages += zone_page_state(zone, NR_FREE_PAGES); | ||
288 | } | ||
279 | 289 | ||
280 | nr_pages = zone_page_state(zone, NR_FREE_PAGES); | ||
281 | /* | 290 | /* |
282 | * Pages reserved for the kernel should not be considered | 291 | * Pages reserved for the kernel should not be considered |
283 | * dirtyable, to prevent a situation where reclaim has to | 292 | * dirtyable, to prevent a situation where reclaim has to |
284 | * clean pages in order to balance the zones. | 293 | * clean pages in order to balance the zones. |
285 | */ | 294 | */ |
286 | nr_pages -= min(nr_pages, zone->totalreserve_pages); | 295 | nr_pages -= min(nr_pages, pgdat->totalreserve_pages); |
287 | 296 | ||
288 | nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); | 297 | nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE); |
289 | nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); | 298 | nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE); |
290 | 299 | ||
291 | return nr_pages; | 300 | return nr_pages; |
292 | } | 301 | } |
@@ -299,13 +308,26 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
299 | int i; | 308 | int i; |
300 | 309 | ||
301 | for_each_node_state(node, N_HIGH_MEMORY) { | 310 | for_each_node_state(node, N_HIGH_MEMORY) { |
302 | for (i = 0; i < MAX_NR_ZONES; i++) { | 311 | for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) { |
303 | struct zone *z = &NODE_DATA(node)->node_zones[i]; | 312 | struct zone *z; |
313 | unsigned long nr_pages; | ||
314 | |||
315 | if (!is_highmem_idx(i)) | ||
316 | continue; | ||
317 | |||
318 | z = &NODE_DATA(node)->node_zones[i]; | ||
319 | if (!populated_zone(z)) | ||
320 | continue; | ||
304 | 321 | ||
305 | if (is_highmem(z)) | 322 | nr_pages = zone_page_state(z, NR_FREE_PAGES); |
306 | x += zone_dirtyable_memory(z); | 323 | /* watch for underflows */ |
324 | nr_pages -= min(nr_pages, high_wmark_pages(z)); | ||
325 | nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE); | ||
326 | nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE); | ||
327 | x += nr_pages; | ||
307 | } | 328 | } |
308 | } | 329 | } |
330 | |||
309 | /* | 331 | /* |
310 | * Unreclaimable memory (kernel memory or anonymous memory | 332 | * Unreclaimable memory (kernel memory or anonymous memory |
311 | * without swap) can bring down the dirtyable pages below | 333 | * without swap) can bring down the dirtyable pages below |
@@ -348,8 +370,8 @@ static unsigned long global_dirtyable_memory(void) | |||
348 | */ | 370 | */ |
349 | x -= min(x, totalreserve_pages); | 371 | x -= min(x, totalreserve_pages); |
350 | 372 | ||
351 | x += global_page_state(NR_INACTIVE_FILE); | 373 | x += global_node_page_state(NR_INACTIVE_FILE); |
352 | x += global_page_state(NR_ACTIVE_FILE); | 374 | x += global_node_page_state(NR_ACTIVE_FILE); |
353 | 375 | ||
354 | if (!vm_highmem_is_dirtyable) | 376 | if (!vm_highmem_is_dirtyable) |
355 | x -= highmem_dirtyable_memory(x); | 377 | x -= highmem_dirtyable_memory(x); |
@@ -445,23 +467,23 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | |||
445 | } | 467 | } |
446 | 468 | ||
447 | /** | 469 | /** |
448 | * zone_dirty_limit - maximum number of dirty pages allowed in a zone | 470 | * node_dirty_limit - maximum number of dirty pages allowed in a node |
449 | * @zone: the zone | 471 | * @pgdat: the node |
450 | * | 472 | * |
451 | * Returns the maximum number of dirty pages allowed in a zone, based | 473 | * Returns the maximum number of dirty pages allowed in a node, based |
452 | * on the zone's dirtyable memory. | 474 | * on the node's dirtyable memory. |
453 | */ | 475 | */ |
454 | static unsigned long zone_dirty_limit(struct zone *zone) | 476 | static unsigned long node_dirty_limit(struct pglist_data *pgdat) |
455 | { | 477 | { |
456 | unsigned long zone_memory = zone_dirtyable_memory(zone); | 478 | unsigned long node_memory = node_dirtyable_memory(pgdat); |
457 | struct task_struct *tsk = current; | 479 | struct task_struct *tsk = current; |
458 | unsigned long dirty; | 480 | unsigned long dirty; |
459 | 481 | ||
460 | if (vm_dirty_bytes) | 482 | if (vm_dirty_bytes) |
461 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * | 483 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * |
462 | zone_memory / global_dirtyable_memory(); | 484 | node_memory / global_dirtyable_memory(); |
463 | else | 485 | else |
464 | dirty = vm_dirty_ratio * zone_memory / 100; | 486 | dirty = vm_dirty_ratio * node_memory / 100; |
465 | 487 | ||
466 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) | 488 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) |
467 | dirty += dirty / 4; | 489 | dirty += dirty / 4; |
@@ -470,19 +492,22 @@ static unsigned long zone_dirty_limit(struct zone *zone) | |||
470 | } | 492 | } |
471 | 493 | ||
472 | /** | 494 | /** |
473 | * zone_dirty_ok - tells whether a zone is within its dirty limits | 495 | * node_dirty_ok - tells whether a node is within its dirty limits |
474 | * @zone: the zone to check | 496 | * @pgdat: the node to check |
475 | * | 497 | * |
476 | * Returns %true when the dirty pages in @zone are within the zone's | 498 | * Returns %true when the dirty pages in @pgdat are within the node's |
477 | * dirty limit, %false if the limit is exceeded. | 499 | * dirty limit, %false if the limit is exceeded. |
478 | */ | 500 | */ |
479 | bool zone_dirty_ok(struct zone *zone) | 501 | bool node_dirty_ok(struct pglist_data *pgdat) |
480 | { | 502 | { |
481 | unsigned long limit = zone_dirty_limit(zone); | 503 | unsigned long limit = node_dirty_limit(pgdat); |
504 | unsigned long nr_pages = 0; | ||
505 | |||
506 | nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); | ||
507 | nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS); | ||
508 | nr_pages += node_page_state(pgdat, NR_WRITEBACK); | ||
482 | 509 | ||
483 | return zone_page_state(zone, NR_FILE_DIRTY) + | 510 | return nr_pages <= limit; |
484 | zone_page_state(zone, NR_UNSTABLE_NFS) + | ||
485 | zone_page_state(zone, NR_WRITEBACK) <= limit; | ||
486 | } | 511 | } |
487 | 512 | ||
488 | int dirty_background_ratio_handler(struct ctl_table *table, int write, | 513 | int dirty_background_ratio_handler(struct ctl_table *table, int write, |
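
node_dirty_limit() and node_dirty_ok() scale the global dirty settings down to the node's share of dirtyable memory and compare the node's dirty, unstable-NFS and writeback counters against that share. A compact userspace model of the vm_dirty_ratio path only (field names and plumbing are simplified):

#include <stdbool.h>
#include <stdio.h>

struct node_stats {
        unsigned long dirtyable;        /* free + file LRU pages */
        unsigned long dirty;            /* NR_FILE_DIRTY */
        unsigned long unstable_nfs;     /* NR_UNSTABLE_NFS */
        unsigned long writeback;        /* NR_WRITEBACK */
};

/* Per-node limit: the node's proportional share of the global ratio. */
static unsigned long node_dirty_limit(const struct node_stats *n,
                                      unsigned int vm_dirty_ratio)
{
        return vm_dirty_ratio * n->dirtyable / 100;
}

static bool node_dirty_ok(const struct node_stats *n,
                          unsigned int vm_dirty_ratio)
{
        unsigned long nr = n->dirty + n->unstable_nfs + n->writeback;

        return nr <= node_dirty_limit(n, vm_dirty_ratio);
}

int main(void)
{
        struct node_stats n = { .dirtyable = 100000, .dirty = 15000,
                                .unstable_nfs = 0, .writeback = 4000 };

        /* 19000 dirty-ish pages against a 20 percent limit of 20000 */
        printf("within limit: %d\n", node_dirty_ok(&n, 20));
        return 0;
}
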
@@ -1570,10 +1595,10 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
1570 | * written to the server's write cache, but has not yet | 1595 | * written to the server's write cache, but has not yet |
1571 | * been flushed to permanent storage. | 1596 | * been flushed to permanent storage. |
1572 | */ | 1597 | */ |
1573 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | 1598 | nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) + |
1574 | global_page_state(NR_UNSTABLE_NFS); | 1599 | global_node_page_state(NR_UNSTABLE_NFS); |
1575 | gdtc->avail = global_dirtyable_memory(); | 1600 | gdtc->avail = global_dirtyable_memory(); |
1576 | gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); | 1601 | gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); |
1577 | 1602 | ||
1578 | domain_dirty_limits(gdtc); | 1603 | domain_dirty_limits(gdtc); |
1579 | 1604 | ||
@@ -1910,8 +1935,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) | |||
1910 | * as we're trying to decide whether to put more under writeback. | 1935 | * as we're trying to decide whether to put more under writeback. |
1911 | */ | 1936 | */ |
1912 | gdtc->avail = global_dirtyable_memory(); | 1937 | gdtc->avail = global_dirtyable_memory(); |
1913 | gdtc->dirty = global_page_state(NR_FILE_DIRTY) + | 1938 | gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) + |
1914 | global_page_state(NR_UNSTABLE_NFS); | 1939 | global_node_page_state(NR_UNSTABLE_NFS); |
1915 | domain_dirty_limits(gdtc); | 1940 | domain_dirty_limits(gdtc); |
1916 | 1941 | ||
1917 | if (gdtc->dirty > gdtc->bg_thresh) | 1942 | if (gdtc->dirty > gdtc->bg_thresh) |
@@ -1955,8 +1980,8 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
1955 | */ | 1980 | */ |
1956 | dirty_thresh += dirty_thresh / 10; /* wheeee... */ | 1981 | dirty_thresh += dirty_thresh / 10; /* wheeee... */ |
1957 | 1982 | ||
1958 | if (global_page_state(NR_UNSTABLE_NFS) + | 1983 | if (global_node_page_state(NR_UNSTABLE_NFS) + |
1959 | global_page_state(NR_WRITEBACK) <= dirty_thresh) | 1984 | global_node_page_state(NR_WRITEBACK) <= dirty_thresh) |
1960 | break; | 1985 | break; |
1961 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1986 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1962 | 1987 | ||
@@ -1984,8 +2009,8 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, | |||
1984 | void laptop_mode_timer_fn(unsigned long data) | 2009 | void laptop_mode_timer_fn(unsigned long data) |
1985 | { | 2010 | { |
1986 | struct request_queue *q = (struct request_queue *)data; | 2011 | struct request_queue *q = (struct request_queue *)data; |
1987 | int nr_pages = global_page_state(NR_FILE_DIRTY) + | 2012 | int nr_pages = global_node_page_state(NR_FILE_DIRTY) + |
1988 | global_page_state(NR_UNSTABLE_NFS); | 2013 | global_node_page_state(NR_UNSTABLE_NFS); |
1989 | struct bdi_writeback *wb; | 2014 | struct bdi_writeback *wb; |
1990 | 2015 | ||
1991 | /* | 2016 | /* |
@@ -2436,8 +2461,9 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) | |||
2436 | wb = inode_to_wb(inode); | 2461 | wb = inode_to_wb(inode); |
2437 | 2462 | ||
2438 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); | 2463 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); |
2439 | __inc_zone_page_state(page, NR_FILE_DIRTY); | 2464 | __inc_node_page_state(page, NR_FILE_DIRTY); |
2440 | __inc_zone_page_state(page, NR_DIRTIED); | 2465 | __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); |
2466 | __inc_node_page_state(page, NR_DIRTIED); | ||
2441 | __inc_wb_stat(wb, WB_RECLAIMABLE); | 2467 | __inc_wb_stat(wb, WB_RECLAIMABLE); |
2442 | __inc_wb_stat(wb, WB_DIRTIED); | 2468 | __inc_wb_stat(wb, WB_DIRTIED); |
2443 | task_io_account_write(PAGE_SIZE); | 2469 | task_io_account_write(PAGE_SIZE); |
@@ -2457,7 +2483,8 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, | |||
2457 | { | 2483 | { |
2458 | if (mapping_cap_account_dirty(mapping)) { | 2484 | if (mapping_cap_account_dirty(mapping)) { |
2459 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); | 2485 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); |
2460 | dec_zone_page_state(page, NR_FILE_DIRTY); | 2486 | dec_node_page_state(page, NR_FILE_DIRTY); |
2487 | dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); | ||
2461 | dec_wb_stat(wb, WB_RECLAIMABLE); | 2488 | dec_wb_stat(wb, WB_RECLAIMABLE); |
2462 | task_io_account_cancelled_write(PAGE_SIZE); | 2489 | task_io_account_cancelled_write(PAGE_SIZE); |
2463 | } | 2490 | } |
@@ -2525,7 +2552,7 @@ void account_page_redirty(struct page *page) | |||
2525 | 2552 | ||
2526 | wb = unlocked_inode_to_wb_begin(inode, &locked); | 2553 | wb = unlocked_inode_to_wb_begin(inode, &locked); |
2527 | current->nr_dirtied--; | 2554 | current->nr_dirtied--; |
2528 | dec_zone_page_state(page, NR_DIRTIED); | 2555 | dec_node_page_state(page, NR_DIRTIED); |
2529 | dec_wb_stat(wb, WB_DIRTIED); | 2556 | dec_wb_stat(wb, WB_DIRTIED); |
2530 | unlocked_inode_to_wb_end(inode, locked); | 2557 | unlocked_inode_to_wb_end(inode, locked); |
2531 | } | 2558 | } |
@@ -2713,7 +2740,8 @@ int clear_page_dirty_for_io(struct page *page) | |||
2713 | wb = unlocked_inode_to_wb_begin(inode, &locked); | 2740 | wb = unlocked_inode_to_wb_begin(inode, &locked); |
2714 | if (TestClearPageDirty(page)) { | 2741 | if (TestClearPageDirty(page)) { |
2715 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); | 2742 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); |
2716 | dec_zone_page_state(page, NR_FILE_DIRTY); | 2743 | dec_node_page_state(page, NR_FILE_DIRTY); |
2744 | dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); | ||
2717 | dec_wb_stat(wb, WB_RECLAIMABLE); | 2745 | dec_wb_stat(wb, WB_RECLAIMABLE); |
2718 | ret = 1; | 2746 | ret = 1; |
2719 | } | 2747 | } |
@@ -2759,8 +2787,9 @@ int test_clear_page_writeback(struct page *page) | |||
2759 | } | 2787 | } |
2760 | if (ret) { | 2788 | if (ret) { |
2761 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); | 2789 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); |
2762 | dec_zone_page_state(page, NR_WRITEBACK); | 2790 | dec_node_page_state(page, NR_WRITEBACK); |
2763 | inc_zone_page_state(page, NR_WRITTEN); | 2791 | dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); |
2792 | inc_node_page_state(page, NR_WRITTEN); | ||
2764 | } | 2793 | } |
2765 | unlock_page_memcg(page); | 2794 | unlock_page_memcg(page); |
2766 | return ret; | 2795 | return ret; |
@@ -2813,7 +2842,8 @@ int __test_set_page_writeback(struct page *page, bool keep_write) | |||
2813 | } | 2842 | } |
2814 | if (!ret) { | 2843 | if (!ret) { |
2815 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); | 2844 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); |
2816 | inc_zone_page_state(page, NR_WRITEBACK); | 2845 | inc_node_page_state(page, NR_WRITEBACK); |
2846 | inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); | ||
2817 | } | 2847 | } |
2818 | unlock_page_memcg(page); | 2848 | unlock_page_memcg(page); |
2819 | return ret; | 2849 | return ret; |
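
The accounting hunks in page-writeback.c consistently update two levels at once: the node-wide NR_FILE_DIRTY / NR_WRITEBACK statistics plus a zone-level NR_ZONE_WRITE_PENDING, apparently kept so zone-granular checks still see a dirty-or-writeback figure. A toy model of keeping the two views balanced over a page's dirty-to-clean lifecycle (all names are illustrative):

#include <stdio.h>

struct node_ctr { long file_dirty, writeback; };
struct zone_ctr { struct node_ctr *node; long write_pending; };

/* Dirtying: node-level NR_FILE_DIRTY plus zone-level write-pending. */
static void account_dirtied(struct zone_ctr *z)
{
        z->node->file_dirty++;
        z->write_pending++;
}

/* Starting writeback: dirty becomes writeback, net write-pending unchanged. */
static void account_writeback_start(struct zone_ctr *z)
{
        z->node->file_dirty--;
        z->node->writeback++;
}

/* Completion: the page is clean in both the node and the zone view. */
static void account_writeback_end(struct zone_ctr *z)
{
        z->node->writeback--;
        z->write_pending--;
}

int main(void)
{
        struct node_ctr node = { 0, 0 };
        struct zone_ctr zone = { &node, 0 };

        account_dirtied(&zone);
        account_writeback_start(&zone);
        printf("dirty=%ld writeback=%ld pending=%ld\n",
               node.file_dirty, node.writeback, zone.write_pending);
        account_writeback_end(&zone);
        printf("dirty=%ld writeback=%ld pending=%ld\n",
               node.file_dirty, node.writeback, zone.write_pending);
        return 0;
}
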
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 452513bf02ce..ea759b935360 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -295,14 +295,6 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn) | |||
295 | return false; | 295 | return false; |
296 | } | 296 | } |
297 | 297 | ||
298 | static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) | ||
299 | { | ||
300 | if (pfn >= NODE_DATA(nid)->first_deferred_pfn) | ||
301 | return true; | ||
302 | |||
303 | return false; | ||
304 | } | ||
305 | |||
306 | /* | 298 | /* |
307 | * Returns false when the remaining initialisation should be deferred until | 299 | * Returns false when the remaining initialisation should be deferred until |
308 | * later in the boot cycle when it can be parallelised. | 300 | * later in the boot cycle when it can be parallelised. |
@@ -342,11 +334,6 @@ static inline bool early_page_uninitialised(unsigned long pfn) | |||
342 | return false; | 334 | return false; |
343 | } | 335 | } |
344 | 336 | ||
345 | static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) | ||
346 | { | ||
347 | return false; | ||
348 | } | ||
349 | |||
350 | static inline bool update_defer_init(pg_data_t *pgdat, | 337 | static inline bool update_defer_init(pg_data_t *pgdat, |
351 | unsigned long pfn, unsigned long zone_end, | 338 | unsigned long pfn, unsigned long zone_end, |
352 | unsigned long *nr_initialised) | 339 | unsigned long *nr_initialised) |
@@ -1091,9 +1078,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
1091 | 1078 | ||
1092 | spin_lock(&zone->lock); | 1079 | spin_lock(&zone->lock); |
1093 | isolated_pageblocks = has_isolate_pageblock(zone); | 1080 | isolated_pageblocks = has_isolate_pageblock(zone); |
1094 | nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); | 1081 | nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); |
1095 | if (nr_scanned) | 1082 | if (nr_scanned) |
1096 | __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); | 1083 | __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); |
1097 | 1084 | ||
1098 | while (count) { | 1085 | while (count) { |
1099 | struct page *page; | 1086 | struct page *page; |
@@ -1148,9 +1135,9 @@ static void free_one_page(struct zone *zone, | |||
1148 | { | 1135 | { |
1149 | unsigned long nr_scanned; | 1136 | unsigned long nr_scanned; |
1150 | spin_lock(&zone->lock); | 1137 | spin_lock(&zone->lock); |
1151 | nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); | 1138 | nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); |
1152 | if (nr_scanned) | 1139 | if (nr_scanned) |
1153 | __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); | 1140 | __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); |
1154 | 1141 | ||
1155 | if (unlikely(has_isolate_pageblock(zone) || | 1142 | if (unlikely(has_isolate_pageblock(zone) || |
1156 | is_migrate_isolate(migratetype))) { | 1143 | is_migrate_isolate(migratetype))) { |
@@ -2517,7 +2504,10 @@ int __isolate_free_page(struct page *page, unsigned int order) | |||
2517 | zone->free_area[order].nr_free--; | 2504 | zone->free_area[order].nr_free--; |
2518 | rmv_page_order(page); | 2505 | rmv_page_order(page); |
2519 | 2506 | ||
2520 | /* Set the pageblock if the isolated page is at least a pageblock */ | 2507 | /* |
2508 | * Set the pageblock if the isolated page is at least half of a | ||
2509 | * pageblock | ||
2510 | */ | ||
2521 | if (order >= pageblock_order - 1) { | 2511 | if (order >= pageblock_order - 1) { |
2522 | struct page *endpage = page + (1 << order) - 1; | 2512 | struct page *endpage = page + (1 << order) - 1; |
2523 | for (; page < endpage; page += pageblock_nr_pages) { | 2513 | for (; page < endpage; page += pageblock_nr_pages) { |
@@ -2597,7 +2587,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
2597 | else | 2587 | else |
2598 | page = list_first_entry(list, struct page, lru); | 2588 | page = list_first_entry(list, struct page, lru); |
2599 | 2589 | ||
2600 | __dec_zone_state(zone, NR_ALLOC_BATCH); | ||
2601 | list_del(&page->lru); | 2590 | list_del(&page->lru); |
2602 | pcp->count--; | 2591 | pcp->count--; |
2603 | 2592 | ||
@@ -2623,16 +2612,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
2623 | spin_unlock(&zone->lock); | 2612 | spin_unlock(&zone->lock); |
2624 | if (!page) | 2613 | if (!page) |
2625 | goto failed; | 2614 | goto failed; |
2626 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); | ||
2627 | __mod_zone_freepage_state(zone, -(1 << order), | 2615 | __mod_zone_freepage_state(zone, -(1 << order), |
2628 | get_pcppage_migratetype(page)); | 2616 | get_pcppage_migratetype(page)); |
2629 | } | 2617 | } |
2630 | 2618 | ||
2631 | if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && | 2619 | __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); |
2632 | !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) | ||
2633 | set_bit(ZONE_FAIR_DEPLETED, &zone->flags); | ||
2634 | |||
2635 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | ||
2636 | zone_statistics(preferred_zone, zone, gfp_flags); | 2620 | zone_statistics(preferred_zone, zone, gfp_flags); |
2637 | local_irq_restore(flags); | 2621 | local_irq_restore(flags); |
2638 | 2622 | ||
@@ -2842,40 +2826,18 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, | |||
2842 | } | 2826 | } |
2843 | 2827 | ||
2844 | #ifdef CONFIG_NUMA | 2828 | #ifdef CONFIG_NUMA |
2845 | static bool zone_local(struct zone *local_zone, struct zone *zone) | ||
2846 | { | ||
2847 | return local_zone->node == zone->node; | ||
2848 | } | ||
2849 | |||
2850 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | 2829 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) |
2851 | { | 2830 | { |
2852 | return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < | 2831 | return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < |
2853 | RECLAIM_DISTANCE; | 2832 | RECLAIM_DISTANCE; |
2854 | } | 2833 | } |
2855 | #else /* CONFIG_NUMA */ | 2834 | #else /* CONFIG_NUMA */ |
2856 | static bool zone_local(struct zone *local_zone, struct zone *zone) | ||
2857 | { | ||
2858 | return true; | ||
2859 | } | ||
2860 | |||
2861 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | 2835 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) |
2862 | { | 2836 | { |
2863 | return true; | 2837 | return true; |
2864 | } | 2838 | } |
2865 | #endif /* CONFIG_NUMA */ | 2839 | #endif /* CONFIG_NUMA */ |
2866 | 2840 | ||
2867 | static void reset_alloc_batches(struct zone *preferred_zone) | ||
2868 | { | ||
2869 | struct zone *zone = preferred_zone->zone_pgdat->node_zones; | ||
2870 | |||
2871 | do { | ||
2872 | mod_zone_page_state(zone, NR_ALLOC_BATCH, | ||
2873 | high_wmark_pages(zone) - low_wmark_pages(zone) - | ||
2874 | atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); | ||
2875 | clear_bit(ZONE_FAIR_DEPLETED, &zone->flags); | ||
2876 | } while (zone++ != preferred_zone); | ||
2877 | } | ||
2878 | |||
2879 | /* | 2841 | /* |
2880 | * get_page_from_freelist goes through the zonelist trying to allocate | 2842 | * get_page_from_freelist goes through the zonelist trying to allocate |
2881 | * a page. | 2843 | * a page. |
@@ -2886,10 +2848,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, | |||
2886 | { | 2848 | { |
2887 | struct zoneref *z = ac->preferred_zoneref; | 2849 | struct zoneref *z = ac->preferred_zoneref; |
2888 | struct zone *zone; | 2850 | struct zone *zone; |
2889 | bool fair_skipped = false; | 2851 | struct pglist_data *last_pgdat_dirty_limit = NULL; |
2890 | bool apply_fair = (alloc_flags & ALLOC_FAIR); | ||
2891 | 2852 | ||
2892 | zonelist_scan: | ||
2893 | /* | 2853 | /* |
2894 | * Scan zonelist, looking for a zone with enough free. | 2854 | * Scan zonelist, looking for a zone with enough free. |
2895 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. | 2855 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. |
@@ -2904,50 +2864,33 @@ zonelist_scan: | |||
2904 | !__cpuset_zone_allowed(zone, gfp_mask)) | 2864 | !__cpuset_zone_allowed(zone, gfp_mask)) |
2905 | continue; | 2865 | continue; |
2906 | /* | 2866 | /* |
2907 | * Distribute pages in proportion to the individual | ||
2908 | * zone size to ensure fair page aging. The zone a | ||
2909 | * page was allocated in should have no effect on the | ||
2910 | * time the page has in memory before being reclaimed. | ||
2911 | */ | ||
2912 | if (apply_fair) { | ||
2913 | if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { | ||
2914 | fair_skipped = true; | ||
2915 | continue; | ||
2916 | } | ||
2917 | if (!zone_local(ac->preferred_zoneref->zone, zone)) { | ||
2918 | if (fair_skipped) | ||
2919 | goto reset_fair; | ||
2920 | apply_fair = false; | ||
2921 | } | ||
2922 | } | ||
2923 | /* | ||
2924 | * When allocating a page cache page for writing, we | 2867 | * When allocating a page cache page for writing, we |
2925 | * want to get it from a zone that is within its dirty | 2868 | * want to get it from a node that is within its dirty |
2926 | * limit, such that no single zone holds more than its | 2869 | * limit, such that no single node holds more than its |
2927 | * proportional share of globally allowed dirty pages. | 2870 | * proportional share of globally allowed dirty pages. |
2928 | * The dirty limits take into account the zone's | 2871 | * The dirty limits take into account the node's |
2929 | * lowmem reserves and high watermark so that kswapd | 2872 | * lowmem reserves and high watermark so that kswapd |
2930 | * should be able to balance it without having to | 2873 | * should be able to balance it without having to |
2931 | * write pages from its LRU list. | 2874 | * write pages from its LRU list. |
2932 | * | 2875 | * |
2933 | * This may look like it could increase pressure on | ||
2934 | * lower zones by failing allocations in higher zones | ||
2935 | * before they are full. But the pages that do spill | ||
2936 | * over are limited as the lower zones are protected | ||
2937 | * by this very same mechanism. It should not become | ||
2938 | * a practical burden to them. | ||
2939 | * | ||
2940 | * XXX: For now, allow allocations to potentially | 2876 | * XXX: For now, allow allocations to potentially |
2941 | * exceed the per-zone dirty limit in the slowpath | 2877 | * exceed the per-node dirty limit in the slowpath |
2942 | * (spread_dirty_pages unset) before going into reclaim, | 2878 | * (spread_dirty_pages unset) before going into reclaim, |
2943 | * which is important when on a NUMA setup the allowed | 2879 | * which is important when on a NUMA setup the allowed |
2944 | * zones are together not big enough to reach the | 2880 | * nodes are together not big enough to reach the |
2945 | * global limit. The proper fix for these situations | 2881 | * global limit. The proper fix for these situations |
2946 | * will require awareness of zones in the | 2882 | * will require awareness of nodes in the |
2947 | * dirty-throttling and the flusher threads. | 2883 | * dirty-throttling and the flusher threads. |
2948 | */ | 2884 | */ |
2949 | if (ac->spread_dirty_pages && !zone_dirty_ok(zone)) | 2885 | if (ac->spread_dirty_pages) { |
2950 | continue; | 2886 | if (last_pgdat_dirty_limit == zone->zone_pgdat) |
2887 | continue; | ||
2888 | |||
2889 | if (!node_dirty_ok(zone->zone_pgdat)) { | ||
2890 | last_pgdat_dirty_limit = zone->zone_pgdat; | ||
2891 | continue; | ||
2892 | } | ||
2893 | } | ||
2951 | 2894 | ||
2952 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 2895 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
2953 | if (!zone_watermark_fast(zone, order, mark, | 2896 | if (!zone_watermark_fast(zone, order, mark, |
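
With the fairness batches gone, get_page_from_freelist() treats the dirty limit as a per-node property and remembers the last pgdat that failed it, so the remaining zones of that node are skipped without repeating the check. A minimal sketch of that memoization over a zonelist-like array (the structures are placeholders):

#include <stdbool.h>
#include <stdio.h>

struct node { bool over_dirty_limit; };
struct zone { struct node *node; };

static bool node_dirty_ok(const struct node *n)
{
        return !n->over_dirty_limit;    /* stand-in for the real check */
}

/* Walk the zones, consulting the per-node check at most once per node. */
static int count_usable_zones(const struct zone *zones, int nr)
{
        const struct node *last_failed = NULL;
        int usable = 0;

        for (int i = 0; i < nr; i++) {
                const struct node *n = zones[i].node;

                if (last_failed == n)
                        continue;               /* already known to be over */
                if (!node_dirty_ok(n)) {
                        last_failed = n;
                        continue;
                }
                usable++;
        }
        return usable;
}

int main(void)
{
        struct node n0 = { true }, n1 = { false };
        struct zone zones[] = { { &n0 }, { &n0 }, { &n1 } };

        printf("usable zones: %d\n", count_usable_zones(zones, 3));
        return 0;
}
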
@@ -2959,16 +2902,16 @@ zonelist_scan: | |||
2959 | if (alloc_flags & ALLOC_NO_WATERMARKS) | 2902 | if (alloc_flags & ALLOC_NO_WATERMARKS) |
2960 | goto try_this_zone; | 2903 | goto try_this_zone; |
2961 | 2904 | ||
2962 | if (zone_reclaim_mode == 0 || | 2905 | if (node_reclaim_mode == 0 || |
2963 | !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) | 2906 | !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) |
2964 | continue; | 2907 | continue; |
2965 | 2908 | ||
2966 | ret = zone_reclaim(zone, gfp_mask, order); | 2909 | ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); |
2967 | switch (ret) { | 2910 | switch (ret) { |
2968 | case ZONE_RECLAIM_NOSCAN: | 2911 | case NODE_RECLAIM_NOSCAN: |
2969 | /* did not scan */ | 2912 | /* did not scan */ |
2970 | continue; | 2913 | continue; |
2971 | case ZONE_RECLAIM_FULL: | 2914 | case NODE_RECLAIM_FULL: |
2972 | /* scanned but unreclaimable */ | 2915 | /* scanned but unreclaimable */ |
2973 | continue; | 2916 | continue; |
2974 | default: | 2917 | default: |
@@ -2998,23 +2941,6 @@ try_this_zone: | |||
2998 | } | 2941 | } |
2999 | } | 2942 | } |
3000 | 2943 | ||
3001 | /* | ||
3002 | * The first pass makes sure allocations are spread fairly within the | ||
3003 | * local node. However, the local node might have free pages left | ||
3004 | * after the fairness batches are exhausted, and remote zones haven't | ||
3005 | * even been considered yet. Try once more without fairness, and | ||
3006 | * include remote zones now, before entering the slowpath and waking | ||
3007 | * kswapd: prefer spilling to a remote zone over swapping locally. | ||
3008 | */ | ||
3009 | if (fair_skipped) { | ||
3010 | reset_fair: | ||
3011 | apply_fair = false; | ||
3012 | fair_skipped = false; | ||
3013 | reset_alloc_batches(ac->preferred_zoneref->zone); | ||
3014 | z = ac->preferred_zoneref; | ||
3015 | goto zonelist_scan; | ||
3016 | } | ||
3017 | |||
3018 | return NULL; | 2944 | return NULL; |
3019 | } | 2945 | } |
3020 | 2946 | ||
@@ -3159,7 +3085,6 @@ out: | |||
3159 | return page; | 3085 | return page; |
3160 | } | 3086 | } |
3161 | 3087 | ||
3162 | |||
3163 | /* | 3088 | /* |
3164 | * Maximum number of compaction retries with progress before OOM | 3089 | * Maximum number of compaction retries with progress before OOM |
3165 | * killer is considered the only way to move forward. | 3090 | * killer is considered the only way to move forward. |
@@ -3171,17 +3096,16 @@ out: | |||
3171 | static struct page * | 3096 | static struct page * |
3172 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 3097 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
3173 | unsigned int alloc_flags, const struct alloc_context *ac, | 3098 | unsigned int alloc_flags, const struct alloc_context *ac, |
3174 | enum migrate_mode mode, enum compact_result *compact_result) | 3099 | enum compact_priority prio, enum compact_result *compact_result) |
3175 | { | 3100 | { |
3176 | struct page *page; | 3101 | struct page *page; |
3177 | int contended_compaction; | ||
3178 | 3102 | ||
3179 | if (!order) | 3103 | if (!order) |
3180 | return NULL; | 3104 | return NULL; |
3181 | 3105 | ||
3182 | current->flags |= PF_MEMALLOC; | 3106 | current->flags |= PF_MEMALLOC; |
3183 | *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, | 3107 | *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, |
3184 | mode, &contended_compaction); | 3108 | prio); |
3185 | current->flags &= ~PF_MEMALLOC; | 3109 | current->flags &= ~PF_MEMALLOC; |
3186 | 3110 | ||
3187 | if (*compact_result <= COMPACT_INACTIVE) | 3111 | if (*compact_result <= COMPACT_INACTIVE) |
@@ -3193,8 +3117,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
3193 | */ | 3117 | */ |
3194 | count_vm_event(COMPACTSTALL); | 3118 | count_vm_event(COMPACTSTALL); |
3195 | 3119 | ||
3196 | page = get_page_from_freelist(gfp_mask, order, | 3120 | page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); |
3197 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); | ||
3198 | 3121 | ||
3199 | if (page) { | 3122 | if (page) { |
3200 | struct zone *zone = page_zone(page); | 3123 | struct zone *zone = page_zone(page); |
@@ -3211,24 +3134,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
3211 | */ | 3134 | */ |
3212 | count_vm_event(COMPACTFAIL); | 3135 | count_vm_event(COMPACTFAIL); |
3213 | 3136 | ||
3214 | /* | ||
3215 | * In all zones where compaction was attempted (and not | ||
3216 | * deferred or skipped), lock contention has been detected. | ||
3217 | * For THP allocation we do not want to disrupt the others | ||
3218 | * so we fallback to base pages instead. | ||
3219 | */ | ||
3220 | if (contended_compaction == COMPACT_CONTENDED_LOCK) | ||
3221 | *compact_result = COMPACT_CONTENDED; | ||
3222 | |||
3223 | /* | ||
3224 | * If compaction was aborted due to need_resched(), we do not | ||
3225 | * want to further increase allocation latency, unless it is | ||
3226 | * khugepaged trying to collapse. | ||
3227 | */ | ||
3228 | if (contended_compaction == COMPACT_CONTENDED_SCHED | ||
3229 | && !(current->flags & PF_KTHREAD)) | ||
3230 | *compact_result = COMPACT_CONTENDED; | ||
3231 | |||
3232 | cond_resched(); | 3137 | cond_resched(); |
3233 | 3138 | ||
3234 | return NULL; | 3139 | return NULL; |
@@ -3236,7 +3141,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
3236 | 3141 | ||
3237 | static inline bool | 3142 | static inline bool |
3238 | should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, | 3143 | should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, |
3239 | enum compact_result compact_result, enum migrate_mode *migrate_mode, | 3144 | enum compact_result compact_result, |
3145 | enum compact_priority *compact_priority, | ||
3240 | int compaction_retries) | 3146 | int compaction_retries) |
3241 | { | 3147 | { |
3242 | int max_retries = MAX_COMPACT_RETRIES; | 3148 | int max_retries = MAX_COMPACT_RETRIES; |
@@ -3247,11 +3153,11 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, | |||
3247 | /* | 3153 | /* |
3248 | * compaction considers all the zones as desperately out of memory | 3154 | * compaction considers all the zones as desperately out of memory |
3249 | * so it doesn't really make much sense to retry except when the | 3155 | * so it doesn't really make much sense to retry except when the |
3250 | * failure could be caused by weak migration mode. | 3156 | * failure could be caused by insufficient priority |
3251 | */ | 3157 | */ |
3252 | if (compaction_failed(compact_result)) { | 3158 | if (compaction_failed(compact_result)) { |
3253 | if (*migrate_mode == MIGRATE_ASYNC) { | 3159 | if (*compact_priority > MIN_COMPACT_PRIORITY) { |
3254 | *migrate_mode = MIGRATE_SYNC_LIGHT; | 3160 | (*compact_priority)--; |
3255 | return true; | 3161 | return true; |
3256 | } | 3162 | } |
3257 | return false; | 3163 | return false; |
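A quick illustration of the new retry rule above: instead of escalating a migrate_mode, the caller now steps a compact_priority down by one on each hard compaction failure until it reaches the minimum. The standalone C sketch below models only that decision; the enum values, MIN_COMPACT_PRIORITY and MAX_COMPACT_RETRIES are illustrative stand-ins, not the kernel's definitions.

/*
 * Model of the priority-decrement retry rule (stand-in names/values).
 */
#include <stdbool.h>
#include <stdio.h>

enum compact_priority {                 /* lower value == tries harder */
    COMPACT_PRIO_SYNC_LIGHT,
    COMPACT_PRIO_ASYNC,
};
#define MIN_COMPACT_PRIORITY  COMPACT_PRIO_SYNC_LIGHT
#define MAX_COMPACT_RETRIES   16

/* Another attempt is worthwhile only at a higher priority after a hard
 * failure; otherwise retries are simply bounded. */
static bool should_retry(bool failed_hard, enum compact_priority *prio,
                         int retries)
{
    if (failed_hard) {
        if (*prio > MIN_COMPACT_PRIORITY) {
            (*prio)--;                  /* try harder next time */
            return true;
        }
        return false;                   /* already at the highest priority */
    }
    return retries <= MAX_COMPACT_RETRIES;
}

int main(void)
{
    enum compact_priority prio = COMPACT_PRIO_ASYNC;
    int retries = 0;

    while (should_retry(true, &prio, retries))
        printf("retry %d at priority %d\n", ++retries, (int)prio);
    return 0;
}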
@@ -3285,7 +3191,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, | |||
3285 | static inline struct page * | 3191 | static inline struct page * |
3286 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 3192 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
3287 | unsigned int alloc_flags, const struct alloc_context *ac, | 3193 | unsigned int alloc_flags, const struct alloc_context *ac, |
3288 | enum migrate_mode mode, enum compact_result *compact_result) | 3194 | enum compact_priority prio, enum compact_result *compact_result) |
3289 | { | 3195 | { |
3290 | *compact_result = COMPACT_SKIPPED; | 3196 | *compact_result = COMPACT_SKIPPED; |
3291 | return NULL; | 3197 | return NULL; |
@@ -3294,7 +3200,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
3294 | static inline bool | 3200 | static inline bool |
3295 | should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, | 3201 | should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, |
3296 | enum compact_result compact_result, | 3202 | enum compact_result compact_result, |
3297 | enum migrate_mode *migrate_mode, | 3203 | enum compact_priority *compact_priority, |
3298 | int compaction_retries) | 3204 | int compaction_retries) |
3299 | { | 3205 | { |
3300 | struct zone *zone; | 3206 | struct zone *zone; |
@@ -3362,8 +3268,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
3362 | return NULL; | 3268 | return NULL; |
3363 | 3269 | ||
3364 | retry: | 3270 | retry: |
3365 | page = get_page_from_freelist(gfp_mask, order, | 3271 | page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); |
3366 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); | ||
3367 | 3272 | ||
3368 | /* | 3273 | /* |
3369 | * If an allocation failed after direct reclaim, it could be because | 3274 | * If an allocation failed after direct reclaim, it could be because |
@@ -3384,10 +3289,14 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) | |||
3384 | { | 3289 | { |
3385 | struct zoneref *z; | 3290 | struct zoneref *z; |
3386 | struct zone *zone; | 3291 | struct zone *zone; |
3292 | pg_data_t *last_pgdat = NULL; | ||
3387 | 3293 | ||
3388 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, | 3294 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, |
3389 | ac->high_zoneidx, ac->nodemask) | 3295 | ac->high_zoneidx, ac->nodemask) { |
3390 | wakeup_kswapd(zone, order, ac_classzone_idx(ac)); | 3296 | if (last_pgdat != zone->zone_pgdat) |
3297 | wakeup_kswapd(zone, order, ac->high_zoneidx); | ||
3298 | last_pgdat = zone->zone_pgdat; | ||
3299 | } | ||
3391 | } | 3300 | } |
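The last_pgdat check above relies on a zonelist listing a node's zones back to back, so remembering the previous node is enough to wake each node's kswapd only once. A minimal userspace model of that de-duplication, with simplified zone/node types standing in for the kernel's:

#include <stdio.h>

struct node { int id; };
struct zone { struct node *node; const char *name; };

static void wakeup_kswapd(struct node *n, const char *why)
{
    printf("wake kswapd on node %d (%s)\n", n->id, why);
}

int main(void)
{
    struct node n0 = { 0 }, n1 = { 1 };
    struct zone zonelist[] = {
        { &n0, "Normal" }, { &n0, "DMA32" },    /* node 0 zones, adjacent */
        { &n1, "Normal" },                      /* node 1 zone */
    };
    struct node *last = NULL;

    for (unsigned i = 0; i < sizeof(zonelist) / sizeof(zonelist[0]); i++) {
        if (last != zonelist[i].node)   /* skip repeats of the same node */
            wakeup_kswapd(zonelist[i].node, zonelist[i].name);
        last = zonelist[i].node;
    }
    return 0;
}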
3392 | 3301 | ||
3393 | static inline unsigned int | 3302 | static inline unsigned int |
@@ -3421,16 +3330,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
3421 | } else if (unlikely(rt_task(current)) && !in_interrupt()) | 3330 | } else if (unlikely(rt_task(current)) && !in_interrupt()) |
3422 | alloc_flags |= ALLOC_HARDER; | 3331 | alloc_flags |= ALLOC_HARDER; |
3423 | 3332 | ||
3424 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | ||
3425 | if (gfp_mask & __GFP_MEMALLOC) | ||
3426 | alloc_flags |= ALLOC_NO_WATERMARKS; | ||
3427 | else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) | ||
3428 | alloc_flags |= ALLOC_NO_WATERMARKS; | ||
3429 | else if (!in_interrupt() && | ||
3430 | ((current->flags & PF_MEMALLOC) || | ||
3431 | unlikely(test_thread_flag(TIF_MEMDIE)))) | ||
3432 | alloc_flags |= ALLOC_NO_WATERMARKS; | ||
3433 | } | ||
3434 | #ifdef CONFIG_CMA | 3333 | #ifdef CONFIG_CMA |
3435 | if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | 3334 | if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
3436 | alloc_flags |= ALLOC_CMA; | 3335 | alloc_flags |= ALLOC_CMA; |
@@ -3440,12 +3339,19 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
3440 | 3339 | ||
3441 | bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | 3340 | bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) |
3442 | { | 3341 | { |
3443 | return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); | 3342 | if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) |
3444 | } | 3343 | return false; |
3445 | 3344 | ||
3446 | static inline bool is_thp_gfp_mask(gfp_t gfp_mask) | 3345 | if (gfp_mask & __GFP_MEMALLOC) |
3447 | { | 3346 | return true; |
3448 | return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; | 3347 | if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) |
3348 | return true; | ||
3349 | if (!in_interrupt() && | ||
3350 | ((current->flags & PF_MEMALLOC) || | ||
3351 | unlikely(test_thread_flag(TIF_MEMDIE)))) | ||
3352 | return true; | ||
3353 | |||
3354 | return false; | ||
3449 | } | 3355 | } |
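gfp_pfmemalloc_allowed() now answers the "may this caller dip into reserves?" question directly instead of routing it through ALLOC_NO_WATERMARKS in gfp_to_alloc_flags(). A userspace sketch of the decision order, with made-up flag bits and context booleans replacing the real gfp flags and task state:

#include <stdbool.h>
#include <stdio.h>

#define GFP_NOMEMALLOC  0x1u
#define GFP_MEMALLOC    0x2u

struct ctx {
    bool in_serving_softirq;
    bool in_interrupt;
    bool pf_memalloc;   /* models current->flags & PF_MEMALLOC */
    bool tif_memdie;    /* models an OOM victim */
};

static bool pfmemalloc_allowed(unsigned gfp, const struct ctx *c)
{
    if (gfp & GFP_NOMEMALLOC)
        return false;               /* caller explicitly opted out */
    if (gfp & GFP_MEMALLOC)
        return true;                /* caller explicitly opted in */
    if (c->in_serving_softirq && c->pf_memalloc)
        return true;                /* softirq on behalf of a reclaimer */
    if (!c->in_interrupt && (c->pf_memalloc || c->tif_memdie))
        return true;                /* reclaimer or OOM victim in task context */
    return false;
}

int main(void)
{
    struct ctx reclaimer = { .pf_memalloc = true };

    printf("%d %d\n",
           pfmemalloc_allowed(0, &reclaimer),
           pfmemalloc_allowed(GFP_NOMEMALLOC, &reclaimer));    /* 1 0 */
    return 0;
}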
3450 | 3356 | ||
3451 | /* | 3357 | /* |
@@ -3481,10 +3387,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, | |||
3481 | return false; | 3387 | return false; |
3482 | 3388 | ||
3483 | /* | 3389 | /* |
3484 | * Keep reclaiming pages while there is a chance this will lead somewhere. | 3390 | * Keep reclaiming pages while there is a chance this will lead |
3485 | * If none of the target zones can satisfy our allocation request even | 3391 | * somewhere. If none of the target zones can satisfy our allocation |
3486 | * if all reclaimable pages are considered then we are screwed and have | 3392 | * request even if all reclaimable pages are considered then we are |
3487 | * to go OOM. | 3393 | * screwed and have to go OOM. |
3488 | */ | 3394 | */ |
3489 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, | 3395 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, |
3490 | ac->nodemask) { | 3396 | ac->nodemask) { |
@@ -3509,14 +3415,12 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, | |||
3509 | * prevent from pre mature OOM | 3415 | * prevent from pre mature OOM |
3510 | */ | 3416 | */ |
3511 | if (!did_some_progress) { | 3417 | if (!did_some_progress) { |
3512 | unsigned long writeback; | 3418 | unsigned long write_pending; |
3513 | unsigned long dirty; | ||
3514 | 3419 | ||
3515 | writeback = zone_page_state_snapshot(zone, | 3420 | write_pending = zone_page_state_snapshot(zone, |
3516 | NR_WRITEBACK); | 3421 | NR_ZONE_WRITE_PENDING); |
3517 | dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY); | ||
3518 | 3422 | ||
3519 | if (2*(writeback + dirty) > reclaimable) { | 3423 | if (2 * write_pending > reclaimable) { |
3520 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 3424 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
3521 | return true; | 3425 | return true; |
3522 | } | 3426 | } |
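The back-off test above is now a single comparison against the new NR_ZONE_WRITE_PENDING counter (roughly dirty plus writeback). A tiny sketch of the heuristic, with plain integers standing in for the per-zone vmstat counters:

#include <stdbool.h>
#include <stdio.h>

/* With no reclaim progress, sleeping briefly beats another reclaim pass
 * once more than half of the reclaimable pages are waiting on writeback. */
static bool should_throttle(unsigned long write_pending,
                            unsigned long reclaimable)
{
    return 2 * write_pending > reclaimable;
}

int main(void)
{
    /* 600 of 1000 reclaimable pages are dirty/under writeback: wait. */
    printf("%s\n", should_throttle(600, 1000) ? "congestion_wait" : "retry");
    /* Only 100 of 1000: keep reclaiming. */
    printf("%s\n", should_throttle(100, 1000) ? "congestion_wait" : "retry");
    return 0;
}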
@@ -3551,7 +3455,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
3551 | struct page *page = NULL; | 3455 | struct page *page = NULL; |
3552 | unsigned int alloc_flags; | 3456 | unsigned int alloc_flags; |
3553 | unsigned long did_some_progress; | 3457 | unsigned long did_some_progress; |
3554 | enum migrate_mode migration_mode = MIGRATE_ASYNC; | 3458 | enum compact_priority compact_priority = DEF_COMPACT_PRIORITY; |
3555 | enum compact_result compact_result; | 3459 | enum compact_result compact_result; |
3556 | int compaction_retries = 0; | 3460 | int compaction_retries = 0; |
3557 | int no_progress_loops = 0; | 3461 | int no_progress_loops = 0; |
@@ -3575,42 +3479,88 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
3575 | (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) | 3479 | (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) |
3576 | gfp_mask &= ~__GFP_ATOMIC; | 3480 | gfp_mask &= ~__GFP_ATOMIC; |
3577 | 3481 | ||
3578 | retry: | 3482 | /* |
3483 | * The fast path uses conservative alloc_flags to succeed only until | ||
3484 | * kswapd needs to be woken up, and to avoid the cost of setting up | ||
3485 | * alloc_flags precisely. So we do that now. | ||
3486 | */ | ||
3487 | alloc_flags = gfp_to_alloc_flags(gfp_mask); | ||
3488 | |||
3579 | if (gfp_mask & __GFP_KSWAPD_RECLAIM) | 3489 | if (gfp_mask & __GFP_KSWAPD_RECLAIM) |
3580 | wake_all_kswapds(order, ac); | 3490 | wake_all_kswapds(order, ac); |
3581 | 3491 | ||
3582 | /* | 3492 | /* |
3583 | * OK, we're below the kswapd watermark and have kicked background | 3493 | * The adjusted alloc_flags might result in immediate success, so try |
3584 | * reclaim. Now things get more complex, so set up alloc_flags according | 3494 | * that first |
3585 | * to how we want to proceed. | ||
3586 | */ | 3495 | */ |
3587 | alloc_flags = gfp_to_alloc_flags(gfp_mask); | 3496 | page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); |
3497 | if (page) | ||
3498 | goto got_pg; | ||
3499 | |||
3500 | /* | ||
3501 | * For costly allocations, try direct compaction first, as it's likely | ||
3502 | * that we have enough base pages and don't need to reclaim. Don't try | ||
3503 | * that for allocations that are allowed to ignore watermarks, as the | ||
3504 | * ALLOC_NO_WATERMARKS attempt didn't yet happen. | ||
3505 | */ | ||
3506 | if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER && | ||
3507 | !gfp_pfmemalloc_allowed(gfp_mask)) { | ||
3508 | page = __alloc_pages_direct_compact(gfp_mask, order, | ||
3509 | alloc_flags, ac, | ||
3510 | INIT_COMPACT_PRIORITY, | ||
3511 | &compact_result); | ||
3512 | if (page) | ||
3513 | goto got_pg; | ||
3514 | |||
3515 | /* | ||
3516 | * Checks for costly allocations with __GFP_NORETRY, which | ||
3517 | * includes THP page fault allocations | ||
3518 | */ | ||
3519 | if (gfp_mask & __GFP_NORETRY) { | ||
3520 | /* | ||
3521 | * If compaction is deferred for high-order allocations, | ||
3522 | * it is because sync compaction recently failed. If | ||
3523 | * this is the case and the caller requested a THP | ||
3524 | * allocation, we do not want to heavily disrupt the | ||
3525 | * system, so we fail the allocation instead of entering | ||
3526 | * direct reclaim. | ||
3527 | */ | ||
3528 | if (compact_result == COMPACT_DEFERRED) | ||
3529 | goto nopage; | ||
3530 | |||
3531 | /* | ||
3532 | * Looks like reclaim/compaction is worth trying, but | ||
3533 | * sync compaction could be very expensive, so keep | ||
3534 | * using async compaction. | ||
3535 | */ | ||
3536 | compact_priority = INIT_COMPACT_PRIORITY; | ||
3537 | } | ||
3538 | } | ||
3539 | |||
3540 | retry: | ||
3541 | /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ | ||
3542 | if (gfp_mask & __GFP_KSWAPD_RECLAIM) | ||
3543 | wake_all_kswapds(order, ac); | ||
3544 | |||
3545 | if (gfp_pfmemalloc_allowed(gfp_mask)) | ||
3546 | alloc_flags = ALLOC_NO_WATERMARKS; | ||
3588 | 3547 | ||
3589 | /* | 3548 | /* |
3590 | * Reset the zonelist iterators if memory policies can be ignored. | 3549 | * Reset the zonelist iterators if memory policies can be ignored. |
3591 | * These allocations are high priority and system rather than user | 3550 | * These allocations are high priority and system rather than user |
3592 | * orientated. | 3551 | * orientated. |
3593 | */ | 3552 | */ |
3594 | if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) { | 3553 | if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) { |
3595 | ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); | 3554 | ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); |
3596 | ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, | 3555 | ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, |
3597 | ac->high_zoneidx, ac->nodemask); | 3556 | ac->high_zoneidx, ac->nodemask); |
3598 | } | 3557 | } |
3599 | 3558 | ||
3600 | /* This is the last chance, in general, before the goto nopage. */ | 3559 | /* Attempt with potentially adjusted zonelist and alloc_flags */ |
3601 | page = get_page_from_freelist(gfp_mask, order, | 3560 | page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); |
3602 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); | ||
3603 | if (page) | 3561 | if (page) |
3604 | goto got_pg; | 3562 | goto got_pg; |
3605 | 3563 | ||
3606 | /* Allocate without watermarks if the context allows */ | ||
3607 | if (alloc_flags & ALLOC_NO_WATERMARKS) { | ||
3608 | page = get_page_from_freelist(gfp_mask, order, | ||
3609 | ALLOC_NO_WATERMARKS, ac); | ||
3610 | if (page) | ||
3611 | goto got_pg; | ||
3612 | } | ||
3613 | |||
3614 | /* Caller is not willing to reclaim, we can't balance anything */ | 3564 | /* Caller is not willing to reclaim, we can't balance anything */ |
3615 | if (!can_direct_reclaim) { | 3565 | if (!can_direct_reclaim) { |
3616 | /* | 3566 | /* |
@@ -3640,38 +3590,6 @@ retry: | |||
3640 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) | 3590 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) |
3641 | goto nopage; | 3591 | goto nopage; |
3642 | 3592 | ||
3643 | /* | ||
3644 | * Try direct compaction. The first pass is asynchronous. Subsequent | ||
3645 | * attempts after direct reclaim are synchronous | ||
3646 | */ | ||
3647 | page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, | ||
3648 | migration_mode, | ||
3649 | &compact_result); | ||
3650 | if (page) | ||
3651 | goto got_pg; | ||
3652 | |||
3653 | /* Checks for THP-specific high-order allocations */ | ||
3654 | if (is_thp_gfp_mask(gfp_mask)) { | ||
3655 | /* | ||
3656 | * If compaction is deferred for high-order allocations, it is | ||
3657 | * because sync compaction recently failed. If this is the case | ||
3658 | * and the caller requested a THP allocation, we do not want | ||
3659 | * to heavily disrupt the system, so we fail the allocation | ||
3660 | * instead of entering direct reclaim. | ||
3661 | */ | ||
3662 | if (compact_result == COMPACT_DEFERRED) | ||
3663 | goto nopage; | ||
3664 | |||
3665 | /* | ||
3666 | * Compaction is contended so rather back off than cause | ||
3667 | * excessive stalls. | ||
3668 | */ | ||
3669 | if(compact_result == COMPACT_CONTENDED) | ||
3670 | goto nopage; | ||
3671 | } | ||
3672 | |||
3673 | if (order && compaction_made_progress(compact_result)) | ||
3674 | compaction_retries++; | ||
3675 | 3593 | ||
3676 | /* Try direct reclaim and then allocating */ | 3594 | /* Try direct reclaim and then allocating */ |
3677 | page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, | 3595 | page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, |
@@ -3679,16 +3597,25 @@ retry: | |||
3679 | if (page) | 3597 | if (page) |
3680 | goto got_pg; | 3598 | goto got_pg; |
3681 | 3599 | ||
3600 | /* Try direct compaction and then allocating */ | ||
3601 | page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, | ||
3602 | compact_priority, &compact_result); | ||
3603 | if (page) | ||
3604 | goto got_pg; | ||
3605 | |||
3606 | if (order && compaction_made_progress(compact_result)) | ||
3607 | compaction_retries++; | ||
3608 | |||
3682 | /* Do not loop if specifically requested */ | 3609 | /* Do not loop if specifically requested */ |
3683 | if (gfp_mask & __GFP_NORETRY) | 3610 | if (gfp_mask & __GFP_NORETRY) |
3684 | goto noretry; | 3611 | goto nopage; |
3685 | 3612 | ||
3686 | /* | 3613 | /* |
3687 | * Do not retry costly high order allocations unless they are | 3614 | * Do not retry costly high order allocations unless they are |
3688 | * __GFP_REPEAT | 3615 | * __GFP_REPEAT |
3689 | */ | 3616 | */ |
3690 | if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) | 3617 | if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) |
3691 | goto noretry; | 3618 | goto nopage; |
3692 | 3619 | ||
3693 | /* | 3620 | /* |
3694 | * Costly allocations might have made a progress but this doesn't mean | 3621 | * Costly allocations might have made a progress but this doesn't mean |
@@ -3712,7 +3639,7 @@ retry: | |||
3712 | */ | 3639 | */ |
3713 | if (did_some_progress > 0 && | 3640 | if (did_some_progress > 0 && |
3714 | should_compact_retry(ac, order, alloc_flags, | 3641 | should_compact_retry(ac, order, alloc_flags, |
3715 | compact_result, &migration_mode, | 3642 | compact_result, &compact_priority, |
3716 | compaction_retries)) | 3643 | compaction_retries)) |
3717 | goto retry; | 3644 | goto retry; |
3718 | 3645 | ||
@@ -3727,25 +3654,6 @@ retry: | |||
3727 | goto retry; | 3654 | goto retry; |
3728 | } | 3655 | } |
3729 | 3656 | ||
3730 | noretry: | ||
3731 | /* | ||
3732 | * High-order allocations do not necessarily loop after direct reclaim | ||
3733 | * and reclaim/compaction depends on compaction being called after | ||
3734 | * reclaim so call directly if necessary. | ||
3735 | * It can become very expensive to allocate transparent hugepages at | ||
3736 | * fault, so use asynchronous memory compaction for THP unless it is | ||
3737 | * khugepaged trying to collapse. All other requests should tolerate | ||
3738 | * at least light sync migration. | ||
3739 | */ | ||
3740 | if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD)) | ||
3741 | migration_mode = MIGRATE_ASYNC; | ||
3742 | else | ||
3743 | migration_mode = MIGRATE_SYNC_LIGHT; | ||
3744 | page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, | ||
3745 | ac, migration_mode, | ||
3746 | &compact_result); | ||
3747 | if (page) | ||
3748 | goto got_pg; | ||
3749 | nopage: | 3657 | nopage: |
3750 | warn_alloc_failed(gfp_mask, order, NULL); | 3658 | warn_alloc_failed(gfp_mask, order, NULL); |
3751 | got_pg: | 3659 | got_pg: |
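For orientation, here is a heavily condensed model of the restructured slowpath order shown in this hunk: alloc_flags and a kswapd wake first, an immediate freelist retry, an early compaction pass for costly orders, then the retry loop with reclaim followed by compaction. Every helper below is a stub that only records the attempt order; watermark checks, priority bookkeeping and the full __GFP_NORETRY handling are deliberately elided.

#include <stdbool.h>
#include <stdio.h>

static bool try_freelist(const char *why) { printf("freelist (%s)\n", why); return false; }
static bool try_compact(const char *prio) { printf("direct compaction (%s)\n", prio); return false; }
static bool try_reclaim(void)             { printf("direct reclaim\n"); return false; }
static void wake_kswapds(void)            { printf("wake kswapd\n"); }

static bool slowpath_model(bool costly_order, bool noretry, int max_loops)
{
    wake_kswapds();
    if (try_freelist("adjusted alloc_flags"))   /* may already succeed */
        return true;

    /* Costly orders try compaction at initial priority before reclaim. */
    if (costly_order) {
        if (try_compact("initial priority"))
            return true;
        if (noretry)
            return false;                       /* e.g. a THP fault path */
    }

    for (int loop = 0; loop < max_loops; loop++) {  /* the "retry:" label */
        wake_kswapds();
        if (try_freelist("retry"))
            return true;
        if (try_reclaim())
            return true;
        if (try_compact("current priority"))
            return true;
        if (noretry)
            break;
        /* should_reclaim_retry()/should_compact_retry() decide here. */
    }
    return false;                               /* nopage */
}

int main(void)
{
    return slowpath_model(true, false, 2) ? 0 : 1;
}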
@@ -3761,7 +3669,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
3761 | { | 3669 | { |
3762 | struct page *page; | 3670 | struct page *page; |
3763 | unsigned int cpuset_mems_cookie; | 3671 | unsigned int cpuset_mems_cookie; |
3764 | unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; | 3672 | unsigned int alloc_flags = ALLOC_WMARK_LOW; |
3765 | gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ | 3673 | gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ |
3766 | struct alloc_context ac = { | 3674 | struct alloc_context ac = { |
3767 | .high_zoneidx = gfp_zone(gfp_mask), | 3675 | .high_zoneidx = gfp_zone(gfp_mask), |
@@ -4192,7 +4100,7 @@ EXPORT_SYMBOL_GPL(si_mem_available); | |||
4192 | void si_meminfo(struct sysinfo *val) | 4100 | void si_meminfo(struct sysinfo *val) |
4193 | { | 4101 | { |
4194 | val->totalram = totalram_pages; | 4102 | val->totalram = totalram_pages; |
4195 | val->sharedram = global_page_state(NR_SHMEM); | 4103 | val->sharedram = global_node_page_state(NR_SHMEM); |
4196 | val->freeram = global_page_state(NR_FREE_PAGES); | 4104 | val->freeram = global_page_state(NR_FREE_PAGES); |
4197 | val->bufferram = nr_blockdev_pages(); | 4105 | val->bufferram = nr_blockdev_pages(); |
4198 | val->totalhigh = totalhigh_pages; | 4106 | val->totalhigh = totalhigh_pages; |
@@ -4214,8 +4122,8 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
4214 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) | 4122 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) |
4215 | managed_pages += pgdat->node_zones[zone_type].managed_pages; | 4123 | managed_pages += pgdat->node_zones[zone_type].managed_pages; |
4216 | val->totalram = managed_pages; | 4124 | val->totalram = managed_pages; |
4217 | val->sharedram = node_page_state(nid, NR_SHMEM); | 4125 | val->sharedram = node_page_state(pgdat, NR_SHMEM); |
4218 | val->freeram = node_page_state(nid, NR_FREE_PAGES); | 4126 | val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); |
4219 | #ifdef CONFIG_HIGHMEM | 4127 | #ifdef CONFIG_HIGHMEM |
4220 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { | 4128 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { |
4221 | struct zone *zone = &pgdat->node_zones[zone_type]; | 4129 | struct zone *zone = &pgdat->node_zones[zone_type]; |
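The two accessors this hunk switches between behave differently: node-level counters are read straight from the pgdat, while zone-level counters still have to be summed over that node's zones. A simplified sketch, with plain fields in place of the per-cpu vmstat machinery:

#include <stdio.h>

#define MAX_ZONES 3

struct zone  { long nr_free; };
struct pgdat { long nr_shmem; struct zone zones[MAX_ZONES]; };

static long node_page_state(struct pgdat *p)            /* node-level counter */
{
    return p->nr_shmem;
}

static long sum_zone_node_page_state(struct pgdat *p)   /* zone-level counter */
{
    long sum = 0;

    for (int i = 0; i < MAX_ZONES; i++)
        sum += p->zones[i].nr_free;
    return sum;
}

int main(void)
{
    struct pgdat node = { .nr_shmem = 42,
                          .zones = { { 100 }, { 200 }, { 0 } } };

    printf("sharedram=%ld freeram=%ld\n",
           node_page_state(&node), sum_zone_node_page_state(&node));
    return 0;
}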
@@ -4298,6 +4206,7 @@ void show_free_areas(unsigned int filter) | |||
4298 | unsigned long free_pcp = 0; | 4206 | unsigned long free_pcp = 0; |
4299 | int cpu; | 4207 | int cpu; |
4300 | struct zone *zone; | 4208 | struct zone *zone; |
4209 | pg_data_t *pgdat; | ||
4301 | 4210 | ||
4302 | for_each_populated_zone(zone) { | 4211 | for_each_populated_zone(zone) { |
4303 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | 4212 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
@@ -4312,35 +4221,74 @@ void show_free_areas(unsigned int filter) | |||
4312 | " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" | 4221 | " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" |
4313 | " slab_reclaimable:%lu slab_unreclaimable:%lu\n" | 4222 | " slab_reclaimable:%lu slab_unreclaimable:%lu\n" |
4314 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" | 4223 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" |
4315 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
4316 | " anon_thp: %lu shmem_thp: %lu shmem_pmdmapped: %lu\n" | ||
4317 | #endif | ||
4318 | " free:%lu free_pcp:%lu free_cma:%lu\n", | 4224 | " free:%lu free_pcp:%lu free_cma:%lu\n", |
4319 | global_page_state(NR_ACTIVE_ANON), | 4225 | global_node_page_state(NR_ACTIVE_ANON), |
4320 | global_page_state(NR_INACTIVE_ANON), | 4226 | global_node_page_state(NR_INACTIVE_ANON), |
4321 | global_page_state(NR_ISOLATED_ANON), | 4227 | global_node_page_state(NR_ISOLATED_ANON), |
4322 | global_page_state(NR_ACTIVE_FILE), | 4228 | global_node_page_state(NR_ACTIVE_FILE), |
4323 | global_page_state(NR_INACTIVE_FILE), | 4229 | global_node_page_state(NR_INACTIVE_FILE), |
4324 | global_page_state(NR_ISOLATED_FILE), | 4230 | global_node_page_state(NR_ISOLATED_FILE), |
4325 | global_page_state(NR_UNEVICTABLE), | 4231 | global_node_page_state(NR_UNEVICTABLE), |
4326 | global_page_state(NR_FILE_DIRTY), | 4232 | global_node_page_state(NR_FILE_DIRTY), |
4327 | global_page_state(NR_WRITEBACK), | 4233 | global_node_page_state(NR_WRITEBACK), |
4328 | global_page_state(NR_UNSTABLE_NFS), | 4234 | global_node_page_state(NR_UNSTABLE_NFS), |
4329 | global_page_state(NR_SLAB_RECLAIMABLE), | 4235 | global_page_state(NR_SLAB_RECLAIMABLE), |
4330 | global_page_state(NR_SLAB_UNRECLAIMABLE), | 4236 | global_page_state(NR_SLAB_UNRECLAIMABLE), |
4331 | global_page_state(NR_FILE_MAPPED), | 4237 | global_node_page_state(NR_FILE_MAPPED), |
4332 | global_page_state(NR_SHMEM), | 4238 | global_node_page_state(NR_SHMEM), |
4333 | global_page_state(NR_PAGETABLE), | 4239 | global_page_state(NR_PAGETABLE), |
4334 | global_page_state(NR_BOUNCE), | 4240 | global_page_state(NR_BOUNCE), |
4335 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
4336 | global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR, | ||
4337 | global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR, | ||
4338 | global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR, | ||
4339 | #endif | ||
4340 | global_page_state(NR_FREE_PAGES), | 4241 | global_page_state(NR_FREE_PAGES), |
4341 | free_pcp, | 4242 | free_pcp, |
4342 | global_page_state(NR_FREE_CMA_PAGES)); | 4243 | global_page_state(NR_FREE_CMA_PAGES)); |
4343 | 4244 | ||
4245 | for_each_online_pgdat(pgdat) { | ||
4246 | printk("Node %d" | ||
4247 | " active_anon:%lukB" | ||
4248 | " inactive_anon:%lukB" | ||
4249 | " active_file:%lukB" | ||
4250 | " inactive_file:%lukB" | ||
4251 | " unevictable:%lukB" | ||
4252 | " isolated(anon):%lukB" | ||
4253 | " isolated(file):%lukB" | ||
4254 | " mapped:%lukB" | ||
4255 | " dirty:%lukB" | ||
4256 | " writeback:%lukB" | ||
4257 | " shmem:%lukB" | ||
4258 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
4259 | " shmem_thp: %lukB" | ||
4260 | " shmem_pmdmapped: %lukB" | ||
4261 | " anon_thp: %lukB" | ||
4262 | #endif | ||
4263 | " writeback_tmp:%lukB" | ||
4264 | " unstable:%lukB" | ||
4265 | " pages_scanned:%lu" | ||
4266 | " all_unreclaimable? %s" | ||
4267 | "\n", | ||
4268 | pgdat->node_id, | ||
4269 | K(node_page_state(pgdat, NR_ACTIVE_ANON)), | ||
4270 | K(node_page_state(pgdat, NR_INACTIVE_ANON)), | ||
4271 | K(node_page_state(pgdat, NR_ACTIVE_FILE)), | ||
4272 | K(node_page_state(pgdat, NR_INACTIVE_FILE)), | ||
4273 | K(node_page_state(pgdat, NR_UNEVICTABLE)), | ||
4274 | K(node_page_state(pgdat, NR_ISOLATED_ANON)), | ||
4275 | K(node_page_state(pgdat, NR_ISOLATED_FILE)), | ||
4276 | K(node_page_state(pgdat, NR_FILE_MAPPED)), | ||
4277 | K(node_page_state(pgdat, NR_FILE_DIRTY)), | ||
4278 | K(node_page_state(pgdat, NR_WRITEBACK)), | ||
4279 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
4280 | K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), | ||
4281 | K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) | ||
4282 | * HPAGE_PMD_NR), | ||
4283 | K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), | ||
4284 | #endif | ||
4285 | K(node_page_state(pgdat, NR_SHMEM)), | ||
4286 | K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), | ||
4287 | K(node_page_state(pgdat, NR_UNSTABLE_NFS)), | ||
4288 | node_page_state(pgdat, NR_PAGES_SCANNED), | ||
4289 | !pgdat_reclaimable(pgdat) ? "yes" : "no"); | ||
4290 | } | ||
4291 | |||
4344 | for_each_populated_zone(zone) { | 4292 | for_each_populated_zone(zone) { |
4345 | int i; | 4293 | int i; |
4346 | 4294 | ||
@@ -4362,72 +4310,41 @@ void show_free_areas(unsigned int filter) | |||
4362 | " active_file:%lukB" | 4310 | " active_file:%lukB" |
4363 | " inactive_file:%lukB" | 4311 | " inactive_file:%lukB" |
4364 | " unevictable:%lukB" | 4312 | " unevictable:%lukB" |
4365 | " isolated(anon):%lukB" | 4313 | " writepending:%lukB" |
4366 | " isolated(file):%lukB" | ||
4367 | " present:%lukB" | 4314 | " present:%lukB" |
4368 | " managed:%lukB" | 4315 | " managed:%lukB" |
4369 | " mlocked:%lukB" | 4316 | " mlocked:%lukB" |
4370 | " dirty:%lukB" | ||
4371 | " writeback:%lukB" | ||
4372 | " mapped:%lukB" | ||
4373 | " shmem:%lukB" | ||
4374 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
4375 | " shmem_thp: %lukB" | ||
4376 | " shmem_pmdmapped: %lukB" | ||
4377 | " anon_thp: %lukB" | ||
4378 | #endif | ||
4379 | " slab_reclaimable:%lukB" | 4317 | " slab_reclaimable:%lukB" |
4380 | " slab_unreclaimable:%lukB" | 4318 | " slab_unreclaimable:%lukB" |
4381 | " kernel_stack:%lukB" | 4319 | " kernel_stack:%lukB" |
4382 | " pagetables:%lukB" | 4320 | " pagetables:%lukB" |
4383 | " unstable:%lukB" | ||
4384 | " bounce:%lukB" | 4321 | " bounce:%lukB" |
4385 | " free_pcp:%lukB" | 4322 | " free_pcp:%lukB" |
4386 | " local_pcp:%ukB" | 4323 | " local_pcp:%ukB" |
4387 | " free_cma:%lukB" | 4324 | " free_cma:%lukB" |
4388 | " writeback_tmp:%lukB" | ||
4389 | " pages_scanned:%lu" | ||
4390 | " all_unreclaimable? %s" | ||
4391 | "\n", | 4325 | "\n", |
4392 | zone->name, | 4326 | zone->name, |
4393 | K(zone_page_state(zone, NR_FREE_PAGES)), | 4327 | K(zone_page_state(zone, NR_FREE_PAGES)), |
4394 | K(min_wmark_pages(zone)), | 4328 | K(min_wmark_pages(zone)), |
4395 | K(low_wmark_pages(zone)), | 4329 | K(low_wmark_pages(zone)), |
4396 | K(high_wmark_pages(zone)), | 4330 | K(high_wmark_pages(zone)), |
4397 | K(zone_page_state(zone, NR_ACTIVE_ANON)), | 4331 | K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), |
4398 | K(zone_page_state(zone, NR_INACTIVE_ANON)), | 4332 | K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), |
4399 | K(zone_page_state(zone, NR_ACTIVE_FILE)), | 4333 | K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), |
4400 | K(zone_page_state(zone, NR_INACTIVE_FILE)), | 4334 | K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), |
4401 | K(zone_page_state(zone, NR_UNEVICTABLE)), | 4335 | K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), |
4402 | K(zone_page_state(zone, NR_ISOLATED_ANON)), | 4336 | K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), |
4403 | K(zone_page_state(zone, NR_ISOLATED_FILE)), | ||
4404 | K(zone->present_pages), | 4337 | K(zone->present_pages), |
4405 | K(zone->managed_pages), | 4338 | K(zone->managed_pages), |
4406 | K(zone_page_state(zone, NR_MLOCK)), | 4339 | K(zone_page_state(zone, NR_MLOCK)), |
4407 | K(zone_page_state(zone, NR_FILE_DIRTY)), | ||
4408 | K(zone_page_state(zone, NR_WRITEBACK)), | ||
4409 | K(zone_page_state(zone, NR_FILE_MAPPED)), | ||
4410 | K(zone_page_state(zone, NR_SHMEM)), | ||
4411 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
4412 | K(zone_page_state(zone, NR_SHMEM_THPS) * HPAGE_PMD_NR), | ||
4413 | K(zone_page_state(zone, NR_SHMEM_PMDMAPPED) | ||
4414 | * HPAGE_PMD_NR), | ||
4415 | K(zone_page_state(zone, NR_ANON_THPS) * HPAGE_PMD_NR), | ||
4416 | #endif | ||
4417 | K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), | 4340 | K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), |
4418 | K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), | 4341 | K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), |
4419 | zone_page_state(zone, NR_KERNEL_STACK) * | 4342 | zone_page_state(zone, NR_KERNEL_STACK_KB), |
4420 | THREAD_SIZE / 1024, | ||
4421 | K(zone_page_state(zone, NR_PAGETABLE)), | 4343 | K(zone_page_state(zone, NR_PAGETABLE)), |
4422 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), | ||
4423 | K(zone_page_state(zone, NR_BOUNCE)), | 4344 | K(zone_page_state(zone, NR_BOUNCE)), |
4424 | K(free_pcp), | 4345 | K(free_pcp), |
4425 | K(this_cpu_read(zone->pageset->pcp.count)), | 4346 | K(this_cpu_read(zone->pageset->pcp.count)), |
4426 | K(zone_page_state(zone, NR_FREE_CMA_PAGES)), | 4347 | K(zone_page_state(zone, NR_FREE_CMA_PAGES))); |
4427 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | ||
4428 | K(zone_page_state(zone, NR_PAGES_SCANNED)), | ||
4429 | (!zone_reclaimable(zone) ? "yes" : "no") | ||
4430 | ); | ||
4431 | printk("lowmem_reserve[]:"); | 4348 | printk("lowmem_reserve[]:"); |
4432 | for (i = 0; i < MAX_NR_ZONES; i++) | 4349 | for (i = 0; i < MAX_NR_ZONES; i++) |
4433 | printk(" %ld", zone->lowmem_reserve[i]); | 4350 | printk(" %ld", zone->lowmem_reserve[i]); |
@@ -4469,7 +4386,7 @@ void show_free_areas(unsigned int filter) | |||
4469 | 4386 | ||
4470 | hugetlb_show_meminfo(); | 4387 | hugetlb_show_meminfo(); |
4471 | 4388 | ||
4472 | printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); | 4389 | printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); |
4473 | 4390 | ||
4474 | show_swap_cache_info(); | 4391 | show_swap_cache_info(); |
4475 | } | 4392 | } |
@@ -5340,6 +5257,11 @@ static void __meminit setup_zone_pageset(struct zone *zone) | |||
5340 | zone->pageset = alloc_percpu(struct per_cpu_pageset); | 5257 | zone->pageset = alloc_percpu(struct per_cpu_pageset); |
5341 | for_each_possible_cpu(cpu) | 5258 | for_each_possible_cpu(cpu) |
5342 | zone_pageset_init(zone, cpu); | 5259 | zone_pageset_init(zone, cpu); |
5260 | |||
5261 | if (!zone->zone_pgdat->per_cpu_nodestats) { | ||
5262 | zone->zone_pgdat->per_cpu_nodestats = | ||
5263 | alloc_percpu(struct per_cpu_nodestat); | ||
5264 | } | ||
5343 | } | 5265 | } |
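The per-node stats are allocated lazily here: the first zone of a node that sets up its pagesets also allocates the node-wide percpu area, and later zones of the same node find the pointer already set. A userspace sketch of that pattern, with calloc() standing in for alloc_percpu() and error handling elided as in the original:

#include <stdio.h>
#include <stdlib.h>

struct nodestat { long stat[4]; };
struct pgdat    { struct nodestat *per_cpu_nodestats; };
struct zone     { struct pgdat *zone_pgdat; };

static void setup_zone_pageset(struct zone *zone)
{
    /* ... per-zone pageset setup would go here ... */
    if (!zone->zone_pgdat->per_cpu_nodestats)
        zone->zone_pgdat->per_cpu_nodestats =
            calloc(1, sizeof(struct nodestat));
}

int main(void)
{
    struct pgdat node = { NULL };
    struct zone dma = { &node }, normal = { &node };

    setup_zone_pageset(&dma);       /* allocates the node stats */
    setup_zone_pageset(&normal);    /* reuses the existing allocation */
    printf("allocated once: %p\n", (void *)node.per_cpu_nodestats);
    free(node.per_cpu_nodestats);
    return 0;
}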
5344 | 5266 | ||
5345 | /* | 5267 | /* |
@@ -5909,6 +5831,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
5909 | init_waitqueue_head(&pgdat->kcompactd_wait); | 5831 | init_waitqueue_head(&pgdat->kcompactd_wait); |
5910 | #endif | 5832 | #endif |
5911 | pgdat_page_ext_init(pgdat); | 5833 | pgdat_page_ext_init(pgdat); |
5834 | spin_lock_init(&pgdat->lru_lock); | ||
5835 | lruvec_init(node_lruvec(pgdat)); | ||
5912 | 5836 | ||
5913 | for (j = 0; j < MAX_NR_ZONES; j++) { | 5837 | for (j = 0; j < MAX_NR_ZONES; j++) { |
5914 | struct zone *zone = pgdat->node_zones + j; | 5838 | struct zone *zone = pgdat->node_zones + j; |
@@ -5958,21 +5882,16 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
5958 | zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; | 5882 | zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; |
5959 | #ifdef CONFIG_NUMA | 5883 | #ifdef CONFIG_NUMA |
5960 | zone->node = nid; | 5884 | zone->node = nid; |
5961 | zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) | 5885 | pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio) |
5962 | / 100; | 5886 | / 100; |
5963 | zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; | 5887 | pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100; |
5964 | #endif | 5888 | #endif |
5965 | zone->name = zone_names[j]; | 5889 | zone->name = zone_names[j]; |
5890 | zone->zone_pgdat = pgdat; | ||
5966 | spin_lock_init(&zone->lock); | 5891 | spin_lock_init(&zone->lock); |
5967 | spin_lock_init(&zone->lru_lock); | ||
5968 | zone_seqlock_init(zone); | 5892 | zone_seqlock_init(zone); |
5969 | zone->zone_pgdat = pgdat; | ||
5970 | zone_pcp_init(zone); | 5893 | zone_pcp_init(zone); |
5971 | 5894 | ||
5972 | /* For bootup, initialized properly in watermark setup */ | ||
5973 | mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); | ||
5974 | |||
5975 | lruvec_init(&zone->lruvec); | ||
5976 | if (!size) | 5895 | if (!size) |
5977 | continue; | 5896 | continue; |
5978 | 5897 | ||
@@ -6038,11 +5957,12 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
6038 | unsigned long end_pfn = 0; | 5957 | unsigned long end_pfn = 0; |
6039 | 5958 | ||
6040 | /* pg_data_t should be reset to zero when it's allocated */ | 5959 | /* pg_data_t should be reset to zero when it's allocated */ |
6041 | WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); | 5960 | WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); |
6042 | 5961 | ||
6043 | reset_deferred_meminit(pgdat); | 5962 | reset_deferred_meminit(pgdat); |
6044 | pgdat->node_id = nid; | 5963 | pgdat->node_id = nid; |
6045 | pgdat->node_start_pfn = node_start_pfn; | 5964 | pgdat->node_start_pfn = node_start_pfn; |
5965 | pgdat->per_cpu_nodestats = NULL; | ||
6046 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 5966 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
6047 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | 5967 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
6048 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, | 5968 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, |
@@ -6699,6 +6619,9 @@ static void calculate_totalreserve_pages(void) | |||
6699 | enum zone_type i, j; | 6619 | enum zone_type i, j; |
6700 | 6620 | ||
6701 | for_each_online_pgdat(pgdat) { | 6621 | for_each_online_pgdat(pgdat) { |
6622 | |||
6623 | pgdat->totalreserve_pages = 0; | ||
6624 | |||
6702 | for (i = 0; i < MAX_NR_ZONES; i++) { | 6625 | for (i = 0; i < MAX_NR_ZONES; i++) { |
6703 | struct zone *zone = pgdat->node_zones + i; | 6626 | struct zone *zone = pgdat->node_zones + i; |
6704 | long max = 0; | 6627 | long max = 0; |
@@ -6715,7 +6638,7 @@ static void calculate_totalreserve_pages(void) | |||
6715 | if (max > zone->managed_pages) | 6638 | if (max > zone->managed_pages) |
6716 | max = zone->managed_pages; | 6639 | max = zone->managed_pages; |
6717 | 6640 | ||
6718 | zone->totalreserve_pages = max; | 6641 | pgdat->totalreserve_pages += max; |
6719 | 6642 | ||
6720 | reserve_pages += max; | 6643 | reserve_pages += max; |
6721 | } | 6644 | } |
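Because the reserve is now kept per node rather than per zone, it has to be zeroed at the top of each node and then accumulated zone by zone. A small model of that accumulation; the maximum over lowmem_reserve[] is collapsed into a single field here for brevity:

#include <stdio.h>

#define NR_ZONES 2

struct zone { long managed_pages, high_wmark, max_lowmem_reserve; };

static long node_totalreserve(struct zone zones[NR_ZONES])
{
    long total = 0;                     /* pgdat->totalreserve_pages = 0 */

    for (int i = 0; i < NR_ZONES; i++) {
        long max = zones[i].max_lowmem_reserve + zones[i].high_wmark;

        if (max > zones[i].managed_pages)
            max = zones[i].managed_pages;
        total += max;                   /* accumulate instead of a per-zone store */
    }
    return total;
}

int main(void)
{
    struct zone zones[NR_ZONES] = {
        { .managed_pages = 1000, .high_wmark = 50, .max_lowmem_reserve = 200 },
        { .managed_pages = 4000, .high_wmark = 80, .max_lowmem_reserve = 0 },
    };

    printf("totalreserve_pages = %ld\n", node_totalreserve(zones));
    return 0;
}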
@@ -6816,10 +6739,6 @@ static void __setup_per_zone_wmarks(void) | |||
6816 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; | 6739 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; |
6817 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; | 6740 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; |
6818 | 6741 | ||
6819 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, | ||
6820 | high_wmark_pages(zone) - low_wmark_pages(zone) - | ||
6821 | atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); | ||
6822 | |||
6823 | spin_unlock_irqrestore(&zone->lock, flags); | 6742 | spin_unlock_irqrestore(&zone->lock, flags); |
6824 | } | 6743 | } |
6825 | 6744 | ||
@@ -6930,6 +6849,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, | |||
6930 | int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, | 6849 | int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, |
6931 | void __user *buffer, size_t *length, loff_t *ppos) | 6850 | void __user *buffer, size_t *length, loff_t *ppos) |
6932 | { | 6851 | { |
6852 | struct pglist_data *pgdat; | ||
6933 | struct zone *zone; | 6853 | struct zone *zone; |
6934 | int rc; | 6854 | int rc; |
6935 | 6855 | ||
@@ -6937,8 +6857,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, | |||
6937 | if (rc) | 6857 | if (rc) |
6938 | return rc; | 6858 | return rc; |
6939 | 6859 | ||
6860 | for_each_online_pgdat(pgdat) | ||
6861 | pgdat->min_slab_pages = 0; | ||
6862 | |||
6940 | for_each_zone(zone) | 6863 | for_each_zone(zone) |
6941 | zone->min_unmapped_pages = (zone->managed_pages * | 6864 | zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * |
6942 | sysctl_min_unmapped_ratio) / 100; | 6865 | sysctl_min_unmapped_ratio) / 100; |
6943 | return 0; | 6866 | return 0; |
6944 | } | 6867 | } |
@@ -6946,6 +6869,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, | |||
6946 | int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, | 6869 | int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, |
6947 | void __user *buffer, size_t *length, loff_t *ppos) | 6870 | void __user *buffer, size_t *length, loff_t *ppos) |
6948 | { | 6871 | { |
6872 | struct pglist_data *pgdat; | ||
6949 | struct zone *zone; | 6873 | struct zone *zone; |
6950 | int rc; | 6874 | int rc; |
6951 | 6875 | ||
@@ -6953,8 +6877,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, | |||
6953 | if (rc) | 6877 | if (rc) |
6954 | return rc; | 6878 | return rc; |
6955 | 6879 | ||
6880 | for_each_online_pgdat(pgdat) | ||
6881 | pgdat->min_slab_pages = 0; | ||
6882 | |||
6956 | for_each_zone(zone) | 6883 | for_each_zone(zone) |
6957 | zone->min_slab_pages = (zone->managed_pages * | 6884 | zone->zone_pgdat->min_slab_pages += (zone->managed_pages * |
6958 | sysctl_min_slab_ratio) / 100; | 6885 | sysctl_min_slab_ratio) / 100; |
6959 | return 0; | 6886 | return 0; |
6960 | } | 6887 | } |
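Both sysctl handlers now follow the same reset-then-accumulate pattern: clear the per-node threshold, then let every zone add its share of managed pages scaled by the ratio. A sketch of that pattern with made-up sizes and a 1% ratio:

#include <stdio.h>

#define NR_ZONES 2

struct pgdat { long min_unmapped_pages; };
struct zone  { struct pgdat *pgdat; long managed_pages; };

static void recompute_min_unmapped(struct zone zones[NR_ZONES],
                                   struct pgdat *node, int ratio)
{
    node->min_unmapped_pages = 0;           /* for_each_online_pgdat reset */

    for (int i = 0; i < NR_ZONES; i++)      /* for_each_zone accumulate */
        zones[i].pgdat->min_unmapped_pages +=
            zones[i].managed_pages * ratio / 100;
}

int main(void)
{
    struct pgdat node = { 0 };
    struct zone zones[NR_ZONES] = {
        { &node, 100000 }, { &node, 300000 },
    };

    recompute_min_unmapped(zones, &node, 1);    /* 1% of each zone */
    printf("min_unmapped_pages = %ld\n", node.min_unmapped_pages);
    return 0;
}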
diff --git a/mm/page_idle.c b/mm/page_idle.c index 4ea9c4ef5146..ae11aa914e55 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c | |||
@@ -41,12 +41,12 @@ static struct page *page_idle_get_page(unsigned long pfn) | |||
41 | return NULL; | 41 | return NULL; |
42 | 42 | ||
43 | zone = page_zone(page); | 43 | zone = page_zone(page); |
44 | spin_lock_irq(&zone->lru_lock); | 44 | spin_lock_irq(zone_lru_lock(zone)); |
45 | if (unlikely(!PageLRU(page))) { | 45 | if (unlikely(!PageLRU(page))) { |
46 | put_page(page); | 46 | put_page(page); |
47 | page = NULL; | 47 | page = NULL; |
48 | } | 48 | } |
49 | spin_unlock_irq(&zone->lru_lock); | 49 | spin_unlock_irq(zone_lru_lock(zone)); |
50 | return page; | 50 | return page; |
51 | } | 51 | } |
52 | 52 | ||
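zone_lru_lock() hides the fact that the LRU lock now lives in the zone's pgdat, so "locking a zone's LRU" really locks the whole node. A userspace sketch of that indirection, using a pthread mutex as the stand-in spinlock:

#include <pthread.h>
#include <stdio.h>

struct pgdat { pthread_mutex_t lru_lock; };
struct zone  { struct pgdat *zone_pgdat; };

static pthread_mutex_t *zone_lru_lock(struct zone *zone)
{
    return &zone->zone_pgdat->lru_lock;     /* one lock per node */
}

int main(void)
{
    struct pgdat node;
    struct zone dma = { &node }, normal = { &node };

    pthread_mutex_init(&node.lru_lock, NULL);
    pthread_mutex_lock(zone_lru_lock(&dma));
    /* Both zones resolve to the same node-wide lock. */
    printf("same lock: %d\n", zone_lru_lock(&dma) == zone_lru_lock(&normal));
    pthread_mutex_unlock(zone_lru_lock(&dma));
    pthread_mutex_destroy(&node.lru_lock);
    return 0;
}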
diff --git a/mm/page_io.c b/mm/page_io.c index dcc5d3769608..fb1fa269d3a0 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -166,6 +166,8 @@ int generic_swapfile_activate(struct swap_info_struct *sis, | |||
166 | unsigned block_in_page; | 166 | unsigned block_in_page; |
167 | sector_t first_block; | 167 | sector_t first_block; |
168 | 168 | ||
169 | cond_resched(); | ||
170 | |||
169 | first_block = bmap(inode, probe_block); | 171 | first_block = bmap(inode, probe_block); |
170 | if (first_block == 0) | 172 | if (first_block == 0) |
171 | goto bad_bmap; | 173 | goto bad_bmap; |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -27,7 +27,7 @@ | |||

27 | * mapping->i_mmap_rwsem | 27 | * mapping->i_mmap_rwsem |
28 | * anon_vma->rwsem | 28 | * anon_vma->rwsem |
29 | * mm->page_table_lock or pte_lock | 29 | * mm->page_table_lock or pte_lock |
30 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 30 | * zone_lru_lock (in mark_page_accessed, isolate_lru_page) |
31 | * swap_lock (in swap_duplicate, swap_info_get) | 31 | * swap_lock (in swap_duplicate, swap_info_get) |
32 | * mmlist_lock (in mmput, drain_mmlist and others) | 32 | * mmlist_lock (in mmput, drain_mmlist and others) |
33 | * mapping->private_lock (in __set_page_dirty_buffers) | 33 | * mapping->private_lock (in __set_page_dirty_buffers) |
@@ -1213,8 +1213,8 @@ void do_page_add_anon_rmap(struct page *page, | |||
1213 | * disabled. | 1213 | * disabled. |
1214 | */ | 1214 | */ |
1215 | if (compound) | 1215 | if (compound) |
1216 | __inc_zone_page_state(page, NR_ANON_THPS); | 1216 | __inc_node_page_state(page, NR_ANON_THPS); |
1217 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); | 1217 | __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); |
1218 | } | 1218 | } |
1219 | if (unlikely(PageKsm(page))) | 1219 | if (unlikely(PageKsm(page))) |
1220 | return; | 1220 | return; |
@@ -1251,14 +1251,14 @@ void page_add_new_anon_rmap(struct page *page, | |||
1251 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | 1251 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
1252 | /* increment count (starts at -1) */ | 1252 | /* increment count (starts at -1) */ |
1253 | atomic_set(compound_mapcount_ptr(page), 0); | 1253 | atomic_set(compound_mapcount_ptr(page), 0); |
1254 | __inc_zone_page_state(page, NR_ANON_THPS); | 1254 | __inc_node_page_state(page, NR_ANON_THPS); |
1255 | } else { | 1255 | } else { |
1256 | /* Anon THP always mapped first with PMD */ | 1256 | /* Anon THP always mapped first with PMD */ |
1257 | VM_BUG_ON_PAGE(PageTransCompound(page), page); | 1257 | VM_BUG_ON_PAGE(PageTransCompound(page), page); |
1258 | /* increment count (starts at -1) */ | 1258 | /* increment count (starts at -1) */ |
1259 | atomic_set(&page->_mapcount, 0); | 1259 | atomic_set(&page->_mapcount, 0); |
1260 | } | 1260 | } |
1261 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); | 1261 | __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); |
1262 | __page_set_anon_rmap(page, vma, address, 1); | 1262 | __page_set_anon_rmap(page, vma, address, 1); |
1263 | } | 1263 | } |
1264 | 1264 | ||
@@ -1282,7 +1282,7 @@ void page_add_file_rmap(struct page *page, bool compound) | |||
1282 | if (!atomic_inc_and_test(compound_mapcount_ptr(page))) | 1282 | if (!atomic_inc_and_test(compound_mapcount_ptr(page))) |
1283 | goto out; | 1283 | goto out; |
1284 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | 1284 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); |
1285 | __inc_zone_page_state(page, NR_SHMEM_PMDMAPPED); | 1285 | __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); |
1286 | } else { | 1286 | } else { |
1287 | if (PageTransCompound(page)) { | 1287 | if (PageTransCompound(page)) { |
1288 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 1288 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
@@ -1293,7 +1293,7 @@ void page_add_file_rmap(struct page *page, bool compound) | |||
1293 | if (!atomic_inc_and_test(&page->_mapcount)) | 1293 | if (!atomic_inc_and_test(&page->_mapcount)) |
1294 | goto out; | 1294 | goto out; |
1295 | } | 1295 | } |
1296 | __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, nr); | 1296 | __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); |
1297 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); | 1297 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); |
1298 | out: | 1298 | out: |
1299 | unlock_page_memcg(page); | 1299 | unlock_page_memcg(page); |
@@ -1322,18 +1322,18 @@ static void page_remove_file_rmap(struct page *page, bool compound) | |||
1322 | if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) | 1322 | if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) |
1323 | goto out; | 1323 | goto out; |
1324 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | 1324 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); |
1325 | __dec_zone_page_state(page, NR_SHMEM_PMDMAPPED); | 1325 | __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); |
1326 | } else { | 1326 | } else { |
1327 | if (!atomic_add_negative(-1, &page->_mapcount)) | 1327 | if (!atomic_add_negative(-1, &page->_mapcount)) |
1328 | goto out; | 1328 | goto out; |
1329 | } | 1329 | } |
1330 | 1330 | ||
1331 | /* | 1331 | /* |
1332 | * We use the irq-unsafe __{inc|mod}_zone_page_stat because | 1332 | * We use the irq-unsafe __{inc|mod}_zone_page_state because |
1333 | * these counters are not modified in interrupt context, and | 1333 | * these counters are not modified in interrupt context, and |
1334 | * pte lock(a spinlock) is held, which implies preemption disabled. | 1334 | * pte lock(a spinlock) is held, which implies preemption disabled. |
1335 | */ | 1335 | */ |
1336 | __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, -nr); | 1336 | __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); |
1337 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); | 1337 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); |
1338 | 1338 | ||
1339 | if (unlikely(PageMlocked(page))) | 1339 | if (unlikely(PageMlocked(page))) |
@@ -1356,7 +1356,7 @@ static void page_remove_anon_compound_rmap(struct page *page) | |||
1356 | if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) | 1356 | if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) |
1357 | return; | 1357 | return; |
1358 | 1358 | ||
1359 | __dec_zone_page_state(page, NR_ANON_THPS); | 1359 | __dec_node_page_state(page, NR_ANON_THPS); |
1360 | 1360 | ||
1361 | if (TestClearPageDoubleMap(page)) { | 1361 | if (TestClearPageDoubleMap(page)) { |
1362 | /* | 1362 | /* |
@@ -1375,7 +1375,7 @@ static void page_remove_anon_compound_rmap(struct page *page) | |||
1375 | clear_page_mlock(page); | 1375 | clear_page_mlock(page); |
1376 | 1376 | ||
1377 | if (nr) { | 1377 | if (nr) { |
1378 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr); | 1378 | __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr); |
1379 | deferred_split_huge_page(page); | 1379 | deferred_split_huge_page(page); |
1380 | } | 1380 | } |
1381 | } | 1381 | } |
@@ -1404,7 +1404,7 @@ void page_remove_rmap(struct page *page, bool compound) | |||
1404 | * these counters are not modified in interrupt context, and | 1404 | * these counters are not modified in interrupt context, and |
1405 | * pte lock(a spinlock) is held, which implies preemption disabled. | 1405 | * pte lock(a spinlock) is held, which implies preemption disabled. |
1406 | */ | 1406 | */ |
1407 | __dec_zone_page_state(page, NR_ANON_PAGES); | 1407 | __dec_node_page_state(page, NR_ANON_MAPPED); |
1408 | 1408 | ||
1409 | if (unlikely(PageMlocked(page))) | 1409 | if (unlikely(PageMlocked(page))) |
1410 | clear_page_mlock(page); | 1410 | clear_page_mlock(page); |
diff --git a/mm/shmem.c b/mm/shmem.c index 62e42c7d544c..2ac19a61d565 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -575,9 +575,9 @@ static int shmem_add_to_page_cache(struct page *page, | |||
575 | if (!error) { | 575 | if (!error) { |
576 | mapping->nrpages += nr; | 576 | mapping->nrpages += nr; |
577 | if (PageTransHuge(page)) | 577 | if (PageTransHuge(page)) |
578 | __inc_zone_page_state(page, NR_SHMEM_THPS); | 578 | __inc_node_page_state(page, NR_SHMEM_THPS); |
579 | __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, nr); | 579 | __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); |
580 | __mod_zone_page_state(page_zone(page), NR_SHMEM, nr); | 580 | __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); |
581 | spin_unlock_irq(&mapping->tree_lock); | 581 | spin_unlock_irq(&mapping->tree_lock); |
582 | } else { | 582 | } else { |
583 | page->mapping = NULL; | 583 | page->mapping = NULL; |
@@ -601,8 +601,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) | |||
601 | error = shmem_radix_tree_replace(mapping, page->index, page, radswap); | 601 | error = shmem_radix_tree_replace(mapping, page->index, page, radswap); |
602 | page->mapping = NULL; | 602 | page->mapping = NULL; |
603 | mapping->nrpages--; | 603 | mapping->nrpages--; |
604 | __dec_zone_page_state(page, NR_FILE_PAGES); | 604 | __dec_node_page_state(page, NR_FILE_PAGES); |
605 | __dec_zone_page_state(page, NR_SHMEM); | 605 | __dec_node_page_state(page, NR_SHMEM); |
606 | spin_unlock_irq(&mapping->tree_lock); | 606 | spin_unlock_irq(&mapping->tree_lock); |
607 | put_page(page); | 607 | put_page(page); |
608 | BUG_ON(error); | 608 | BUG_ON(error); |
@@ -1493,8 +1493,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, | |||
1493 | error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, | 1493 | error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, |
1494 | newpage); | 1494 | newpage); |
1495 | if (!error) { | 1495 | if (!error) { |
1496 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | 1496 | __inc_node_page_state(newpage, NR_FILE_PAGES); |
1497 | __dec_zone_page_state(oldpage, NR_FILE_PAGES); | 1497 | __dec_node_page_state(oldpage, NR_FILE_PAGES); |
1498 | } | 1498 | } |
1499 | spin_unlock_irq(&swap_mapping->tree_lock); | 1499 | spin_unlock_irq(&swap_mapping->tree_lock); |
1500 | 1500 | ||
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
@@ -369,6 +369,8 @@ static inline size_t slab_ksize(const struct kmem_cache *s) | |||
369 | if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) | 369 | if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) |
370 | return s->object_size; | 370 | return s->object_size; |
371 | # endif | 371 | # endif |
372 | if (s->flags & SLAB_KASAN) | ||
373 | return s->object_size; | ||
372 | /* | 374 | /* |
373 | * If we have the need to store the freelist pointer | 375 | * If we have the need to store the freelist pointer |
374 | * back there or track user information then we can | 376 | * back there or track user information then we can |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -124,7 +124,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s) | |||
124 | #endif | 124 | #endif |
125 | } | 125 | } |
126 | 126 | ||
127 | static inline void *fixup_red_left(struct kmem_cache *s, void *p) | 127 | inline void *fixup_red_left(struct kmem_cache *s, void *p) |
128 | { | 128 | { |
129 | if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) | 129 | if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) |
130 | p += s->red_left_pad; | 130 | p += s->red_left_pad; |
@@ -454,8 +454,6 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) | |||
454 | */ | 454 | */ |
455 | #if defined(CONFIG_SLUB_DEBUG_ON) | 455 | #if defined(CONFIG_SLUB_DEBUG_ON) |
456 | static int slub_debug = DEBUG_DEFAULT_FLAGS; | 456 | static int slub_debug = DEBUG_DEFAULT_FLAGS; |
457 | #elif defined(CONFIG_KASAN) | ||
458 | static int slub_debug = SLAB_STORE_USER; | ||
459 | #else | 457 | #else |
460 | static int slub_debug; | 458 | static int slub_debug; |
461 | #endif | 459 | #endif |
@@ -660,6 +658,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
660 | if (s->flags & SLAB_STORE_USER) | 658 | if (s->flags & SLAB_STORE_USER) |
661 | off += 2 * sizeof(struct track); | 659 | off += 2 * sizeof(struct track); |
662 | 660 | ||
661 | off += kasan_metadata_size(s); | ||
662 | |||
663 | if (off != size_from_object(s)) | 663 | if (off != size_from_object(s)) |
664 | /* Beginning of the filler is the free pointer */ | 664 | /* Beginning of the filler is the free pointer */ |
665 | print_section("Padding ", p + off, size_from_object(s) - off); | 665 | print_section("Padding ", p + off, size_from_object(s) - off); |
@@ -787,6 +787,8 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) | |||
787 | /* We also have user information there */ | 787 | /* We also have user information there */ |
788 | off += 2 * sizeof(struct track); | 788 | off += 2 * sizeof(struct track); |
789 | 789 | ||
790 | off += kasan_metadata_size(s); | ||
791 | |||
790 | if (size_from_object(s) == off) | 792 | if (size_from_object(s) == off) |
791 | return 1; | 793 | return 1; |
792 | 794 | ||
@@ -1322,8 +1324,10 @@ static inline void kfree_hook(const void *x) | |||
1322 | kasan_kfree_large(x); | 1324 | kasan_kfree_large(x); |
1323 | } | 1325 | } |
1324 | 1326 | ||
1325 | static inline void slab_free_hook(struct kmem_cache *s, void *x) | 1327 | static inline void *slab_free_hook(struct kmem_cache *s, void *x) |
1326 | { | 1328 | { |
1329 | void *freeptr; | ||
1330 | |||
1327 | kmemleak_free_recursive(x, s->flags); | 1331 | kmemleak_free_recursive(x, s->flags); |
1328 | 1332 | ||
1329 | /* | 1333 | /* |
@@ -1344,7 +1348,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
1344 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 1348 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
1345 | debug_check_no_obj_freed(x, s->object_size); | 1349 | debug_check_no_obj_freed(x, s->object_size); |
1346 | 1350 | ||
1351 | freeptr = get_freepointer(s, x); | ||
1352 | /* | ||
1353 | * kasan_slab_free() may put x into memory quarantine, delaying its | ||
1354 | * reuse. In this case the object's freelist pointer is changed. | ||
1355 | */ | ||
1347 | kasan_slab_free(s, x); | 1356 | kasan_slab_free(s, x); |
1357 | return freeptr; | ||
1348 | } | 1358 | } |
1349 | 1359 | ||
1350 | static inline void slab_free_freelist_hook(struct kmem_cache *s, | 1360 | static inline void slab_free_freelist_hook(struct kmem_cache *s, |
@@ -1362,11 +1372,11 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s, | |||
1362 | 1372 | ||
1363 | void *object = head; | 1373 | void *object = head; |
1364 | void *tail_obj = tail ? : head; | 1374 | void *tail_obj = tail ? : head; |
1375 | void *freeptr; | ||
1365 | 1376 | ||
1366 | do { | 1377 | do { |
1367 | slab_free_hook(s, object); | 1378 | freeptr = slab_free_hook(s, object); |
1368 | } while ((object != tail_obj) && | 1379 | } while ((object != tail_obj) && (object = freeptr)); |
1369 | (object = get_freepointer(s, object))); | ||
1370 | #endif | 1380 | #endif |
1371 | } | 1381 | } |
1372 | 1382 | ||
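The reason slab_free_hook() now returns the next free pointer: a hook such as KASAN's quarantine may repurpose the object's freelist link, so the walker must capture the successor before the hook runs. A minimal linked-list illustration of that capture-before-callback pattern (not the SLUB data structures themselves):

#include <stdio.h>

struct obj { struct obj *next; int id; };

/* Hook that may repurpose the object's link field, as quarantining does. */
static struct obj *free_hook(struct obj *o)
{
    struct obj *next = o->next;     /* capture before the link is clobbered */

    o->next = NULL;                 /* object now belongs to the hook */
    printf("hooked object %d\n", o->id);
    return next;
}

int main(void)
{
    struct obj c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
    struct obj *head = &a, *tail = &c;

    /* Walk head..tail using the pointer returned by the hook. */
    for (struct obj *o = head; o; ) {
        struct obj *next = free_hook(o);

        o = (o != tail) ? next : NULL;
    }
    return 0;
}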
@@ -2878,16 +2888,13 @@ slab_empty: | |||
2878 | * same page) possible by specifying head and tail ptr, plus objects | 2888 | * same page) possible by specifying head and tail ptr, plus objects |
2879 | * count (cnt). Bulk free indicated by tail pointer being set. | 2889 | * count (cnt). Bulk free indicated by tail pointer being set. |
2880 | */ | 2890 | */ |
2881 | static __always_inline void slab_free(struct kmem_cache *s, struct page *page, | 2891 | static __always_inline void do_slab_free(struct kmem_cache *s, |
2882 | void *head, void *tail, int cnt, | 2892 | struct page *page, void *head, void *tail, |
2883 | unsigned long addr) | 2893 | int cnt, unsigned long addr) |
2884 | { | 2894 | { |
2885 | void *tail_obj = tail ? : head; | 2895 | void *tail_obj = tail ? : head; |
2886 | struct kmem_cache_cpu *c; | 2896 | struct kmem_cache_cpu *c; |
2887 | unsigned long tid; | 2897 | unsigned long tid; |
2888 | |||
2889 | slab_free_freelist_hook(s, head, tail); | ||
2890 | |||
2891 | redo: | 2898 | redo: |
2892 | /* | 2899 | /* |
2893 | * Determine the currently cpus per cpu slab. | 2900 | * Determine the currently cpus per cpu slab. |
@@ -2921,6 +2928,27 @@ redo: | |||
2921 | 2928 | ||
2922 | } | 2929 | } |
2923 | 2930 | ||
2931 | static __always_inline void slab_free(struct kmem_cache *s, struct page *page, | ||
2932 | void *head, void *tail, int cnt, | ||
2933 | unsigned long addr) | ||
2934 | { | ||
2935 | slab_free_freelist_hook(s, head, tail); | ||
2936 | /* | ||
2937 | * slab_free_freelist_hook() could have put the items into quarantine. | ||
2938 | * If so, no need to free them. | ||
2939 | */ | ||
2940 | if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU)) | ||
2941 | return; | ||
2942 | do_slab_free(s, page, head, tail, cnt, addr); | ||
2943 | } | ||
2944 | |||
2945 | #ifdef CONFIG_KASAN | ||
2946 | void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) | ||
2947 | { | ||
2948 | do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr); | ||
2949 | } | ||
2950 | #endif | ||
2951 | |||
2924 | void kmem_cache_free(struct kmem_cache *s, void *x) | 2952 | void kmem_cache_free(struct kmem_cache *s, void *x) |
2925 | { | 2953 | { |
2926 | s = cache_from_obj(s, x); | 2954 | s = cache_from_obj(s, x); |
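The slab_free()/do_slab_free() split can be pictured as a front door that runs the hooks and may divert the object into a quarantine, plus a back door (modeled on ___cache_free()) that the quarantine later uses to really release it. A userspace model with a tiny array as the quarantine; the SLAB_KASAN/SLAB_DESTROY_BY_RCU details are reduced to a single boolean here:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define QUARANTINE_SLOTS 8

static void *quarantine[QUARANTINE_SLOTS];
static int   q_count;

static void do_slab_free(void *x)           /* the "real" free path */
{
    printf("freeing %p\n", x);
    free(x);
}

static bool quarantine_put(void *x)         /* models the KASAN hook */
{
    if (q_count < QUARANTINE_SLOTS) {
        quarantine[q_count++] = x;
        return true;                        /* object delayed, not freed */
    }
    return false;
}

static void slab_free(void *x, bool kasan_enabled)
{
    /* hooks run first; with KASAN the object may now be quarantined */
    if (kasan_enabled && quarantine_put(x))
        return;
    do_slab_free(x);
}

static void quarantine_drain(void)          /* later release of delayed objects */
{
    while (q_count)
        do_slab_free(quarantine[--q_count]);
}

int main(void)
{
    slab_free(malloc(32), true);    /* goes to the quarantine */
    slab_free(malloc(32), false);   /* freed immediately */
    quarantine_drain();             /* quarantined object freed late */
    return 0;
}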
@@ -3363,7 +3391,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) | |||
3363 | static int calculate_sizes(struct kmem_cache *s, int forced_order) | 3391 | static int calculate_sizes(struct kmem_cache *s, int forced_order) |
3364 | { | 3392 | { |
3365 | unsigned long flags = s->flags; | 3393 | unsigned long flags = s->flags; |
3366 | unsigned long size = s->object_size; | 3394 | size_t size = s->object_size; |
3367 | int order; | 3395 | int order; |
3368 | 3396 | ||
3369 | /* | 3397 | /* |
@@ -3422,7 +3450,10 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
3422 | * the object. | 3450 | * the object. |
3423 | */ | 3451 | */ |
3424 | size += 2 * sizeof(struct track); | 3452 | size += 2 * sizeof(struct track); |
3453 | #endif | ||
3425 | 3454 | ||
3455 | kasan_cache_create(s, &size, &s->flags); | ||
3456 | #ifdef CONFIG_SLUB_DEBUG | ||
3426 | if (flags & SLAB_RED_ZONE) { | 3457 | if (flags & SLAB_RED_ZONE) { |
3427 | /* | 3458 | /* |
3428 | * Add some empty padding so that we can catch | 3459 | * Add some empty padding so that we can catch |
diff --git a/mm/sparse.c b/mm/sparse.c index 5d0cf4540364..36d7bbb80e49 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -100,11 +100,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) | |||
100 | } | 100 | } |
101 | #endif | 101 | #endif |
102 | 102 | ||
103 | /* | 103 | #ifdef CONFIG_SPARSEMEM_EXTREME |
104 | * Although written for the SPARSEMEM_EXTREME case, this happens | ||
105 | * to also work for the flat array case because | ||
106 | * NR_SECTION_ROOTS==NR_MEM_SECTIONS. | ||
107 | */ | ||
108 | int __section_nr(struct mem_section* ms) | 104 | int __section_nr(struct mem_section* ms) |
109 | { | 105 | { |
110 | unsigned long root_nr; | 106 | unsigned long root_nr; |
@@ -123,6 +119,12 @@ int __section_nr(struct mem_section* ms) | |||
123 | 119 | ||
124 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); | 120 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); |
125 | } | 121 | } |
122 | #else | ||
123 | int __section_nr(struct mem_section* ms) | ||
124 | { | ||
125 | return (int)(ms - mem_section[0]); | ||
126 | } | ||
127 | #endif | ||
126 | 128 | ||
127 | /* | 129 | /* |
128 | * During early boot, before section_mem_map is used for an actual | 130 | * During early boot, before section_mem_map is used for an actual |
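For orientation, a toy userspace illustration of why the new #else branch is cheaper: with a flat section array the section number is a single pointer subtraction, while the SPARSEMEM_EXTREME case first has to locate the owning root. The arrays and sizes below are invented stand-ins, not the kernel's mem_section layout.

#include <stdio.h>

#define SECTIONS_PER_ROOT 4             /* toy value */
#define NR_ROOTS          3

struct mock_section { int dummy; };

/* SPARSEMEM_EXTREME-like layout: an array of root pointers. */
static struct mock_section root_storage[NR_ROOTS][SECTIONS_PER_ROOT];
static struct mock_section *roots[NR_ROOTS] = {
        root_storage[0], root_storage[1], root_storage[2],
};

/* Flat layout: one contiguous array. */
static struct mock_section flat[NR_ROOTS * SECTIONS_PER_ROOT];

/* Extreme case: scan the roots, then offset within the matching root. */
static int section_nr_extreme(struct mock_section *ms)
{
        for (int r = 0; r < NR_ROOTS; r++) {
                struct mock_section *root = roots[r];

                if (ms >= root && ms < root + SECTIONS_PER_ROOT)
                        return r * SECTIONS_PER_ROOT + (int)(ms - root);
        }
        return -1;
}

/* Flat case: a single subtraction, as in the new #else branch. */
static int section_nr_flat(struct mock_section *ms)
{
        return (int)(ms - flat);
}

int main(void)
{
        printf("extreme: %d\n", section_nr_extreme(&root_storage[2][1])); /* 9 */
        printf("flat:    %d\n", section_nr_flat(&flat[9]));               /* 9 */
        return 0;
}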
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -62,12 +62,12 @@ static void __page_cache_release(struct page *page) | |||
62 | struct lruvec *lruvec; | 62 | struct lruvec *lruvec; |
63 | unsigned long flags; | 63 | unsigned long flags; |
64 | 64 | ||
65 | spin_lock_irqsave(&zone->lru_lock, flags); | 65 | spin_lock_irqsave(zone_lru_lock(zone), flags); |
66 | lruvec = mem_cgroup_page_lruvec(page, zone); | 66 | lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); |
67 | VM_BUG_ON_PAGE(!PageLRU(page), page); | 67 | VM_BUG_ON_PAGE(!PageLRU(page), page); |
68 | __ClearPageLRU(page); | 68 | __ClearPageLRU(page); |
69 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); | 69 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); |
70 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 70 | spin_unlock_irqrestore(zone_lru_lock(zone), flags); |
71 | } | 71 | } |
72 | mem_cgroup_uncharge(page); | 72 | mem_cgroup_uncharge(page); |
73 | } | 73 | } |
@@ -179,26 +179,26 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, | |||
179 | void *arg) | 179 | void *arg) |
180 | { | 180 | { |
181 | int i; | 181 | int i; |
182 | struct zone *zone = NULL; | 182 | struct pglist_data *pgdat = NULL; |
183 | struct lruvec *lruvec; | 183 | struct lruvec *lruvec; |
184 | unsigned long flags = 0; | 184 | unsigned long flags = 0; |
185 | 185 | ||
186 | for (i = 0; i < pagevec_count(pvec); i++) { | 186 | for (i = 0; i < pagevec_count(pvec); i++) { |
187 | struct page *page = pvec->pages[i]; | 187 | struct page *page = pvec->pages[i]; |
188 | struct zone *pagezone = page_zone(page); | 188 | struct pglist_data *pagepgdat = page_pgdat(page); |
189 | 189 | ||
190 | if (pagezone != zone) { | 190 | if (pagepgdat != pgdat) { |
191 | if (zone) | 191 | if (pgdat) |
192 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 192 | spin_unlock_irqrestore(&pgdat->lru_lock, flags); |
193 | zone = pagezone; | 193 | pgdat = pagepgdat; |
194 | spin_lock_irqsave(&zone->lru_lock, flags); | 194 | spin_lock_irqsave(&pgdat->lru_lock, flags); |
195 | } | 195 | } |
196 | 196 | ||
197 | lruvec = mem_cgroup_page_lruvec(page, zone); | 197 | lruvec = mem_cgroup_page_lruvec(page, pgdat); |
198 | (*move_fn)(page, lruvec, arg); | 198 | (*move_fn)(page, lruvec, arg); |
199 | } | 199 | } |
200 | if (zone) | 200 | if (pgdat) |
201 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 201 | spin_unlock_irqrestore(&pgdat->lru_lock, flags); |
202 | release_pages(pvec->pages, pvec->nr, pvec->cold); | 202 | release_pages(pvec->pages, pvec->nr, pvec->cold); |
203 | pagevec_reinit(pvec); | 203 | pagevec_reinit(pvec); |
204 | } | 204 | } |
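Illustration: the loop above takes the node's lru_lock lazily, dropping and retaking it only when consecutive pages belong to a different pgdat. A standalone sketch of that batching pattern with mock pages and a mock lock (nothing below is kernel API):

#include <stdio.h>
#include <stddef.h>

struct mock_node { int id; int locked; };
struct mock_page { struct mock_node *node; };

static void lock_node(struct mock_node *n)   { n->locked = 1; printf("lock node %d\n", n->id); }
static void unlock_node(struct mock_node *n) { n->locked = 0; printf("unlock node %d\n", n->id); }

/* Process a batch of pages, holding at most one node lock at a time and
 * switching locks only when the owning node actually changes. */
static void move_batch(struct mock_page *pages, size_t n)
{
        struct mock_node *locked = NULL;

        for (size_t i = 0; i < n; i++) {
                struct mock_node *node = pages[i].node;

                if (node != locked) {
                        if (locked)
                                unlock_node(locked);
                        locked = node;
                        lock_node(locked);
                }
                printf("  move page %zu on node %d\n", i, node->id);
        }
        if (locked)
                unlock_node(locked);
}

int main(void)
{
        struct mock_node n0 = { 0, 0 }, n1 = { 1, 0 };
        struct mock_page pages[] = { {&n0}, {&n0}, {&n1}, {&n1}, {&n0} };

        move_batch(pages, sizeof(pages) / sizeof(pages[0]));
        return 0;
}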
@@ -318,9 +318,9 @@ void activate_page(struct page *page) | |||
318 | struct zone *zone = page_zone(page); | 318 | struct zone *zone = page_zone(page); |
319 | 319 | ||
320 | page = compound_head(page); | 320 | page = compound_head(page); |
321 | spin_lock_irq(&zone->lru_lock); | 321 | spin_lock_irq(zone_lru_lock(zone)); |
322 | __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); | 322 | __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL); |
323 | spin_unlock_irq(&zone->lru_lock); | 323 | spin_unlock_irq(zone_lru_lock(zone)); |
324 | } | 324 | } |
325 | #endif | 325 | #endif |
326 | 326 | ||
@@ -445,16 +445,16 @@ void lru_cache_add(struct page *page) | |||
445 | */ | 445 | */ |
446 | void add_page_to_unevictable_list(struct page *page) | 446 | void add_page_to_unevictable_list(struct page *page) |
447 | { | 447 | { |
448 | struct zone *zone = page_zone(page); | 448 | struct pglist_data *pgdat = page_pgdat(page); |
449 | struct lruvec *lruvec; | 449 | struct lruvec *lruvec; |
450 | 450 | ||
451 | spin_lock_irq(&zone->lru_lock); | 451 | spin_lock_irq(&pgdat->lru_lock); |
452 | lruvec = mem_cgroup_page_lruvec(page, zone); | 452 | lruvec = mem_cgroup_page_lruvec(page, pgdat); |
453 | ClearPageActive(page); | 453 | ClearPageActive(page); |
454 | SetPageUnevictable(page); | 454 | SetPageUnevictable(page); |
455 | SetPageLRU(page); | 455 | SetPageLRU(page); |
456 | add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); | 456 | add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); |
457 | spin_unlock_irq(&zone->lru_lock); | 457 | spin_unlock_irq(&pgdat->lru_lock); |
458 | } | 458 | } |
459 | 459 | ||
460 | /** | 460 | /** |
@@ -730,7 +730,7 @@ void release_pages(struct page **pages, int nr, bool cold) | |||
730 | { | 730 | { |
731 | int i; | 731 | int i; |
732 | LIST_HEAD(pages_to_free); | 732 | LIST_HEAD(pages_to_free); |
733 | struct zone *zone = NULL; | 733 | struct pglist_data *locked_pgdat = NULL; |
734 | struct lruvec *lruvec; | 734 | struct lruvec *lruvec; |
735 | unsigned long uninitialized_var(flags); | 735 | unsigned long uninitialized_var(flags); |
736 | unsigned int uninitialized_var(lock_batch); | 736 | unsigned int uninitialized_var(lock_batch); |
@@ -741,11 +741,11 @@ void release_pages(struct page **pages, int nr, bool cold) | |||
741 | /* | 741 | /* |
742 | * Make sure the IRQ-safe lock-holding time does not get | 742 | * Make sure the IRQ-safe lock-holding time does not get |
743 | * excessive with a continuous string of pages from the | 743 | * excessive with a continuous string of pages from the |
744 | * same zone. The lock is held only if zone != NULL. | 744 | * same pgdat. The lock is held only if pgdat != NULL. |
745 | */ | 745 | */ |
746 | if (zone && ++lock_batch == SWAP_CLUSTER_MAX) { | 746 | if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) { |
747 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 747 | spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); |
748 | zone = NULL; | 748 | locked_pgdat = NULL; |
749 | } | 749 | } |
750 | 750 | ||
751 | if (is_huge_zero_page(page)) { | 751 | if (is_huge_zero_page(page)) { |
@@ -758,27 +758,27 @@ void release_pages(struct page **pages, int nr, bool cold) | |||
758 | continue; | 758 | continue; |
759 | 759 | ||
760 | if (PageCompound(page)) { | 760 | if (PageCompound(page)) { |
761 | if (zone) { | 761 | if (locked_pgdat) { |
762 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 762 | spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); |
763 | zone = NULL; | 763 | locked_pgdat = NULL; |
764 | } | 764 | } |
765 | __put_compound_page(page); | 765 | __put_compound_page(page); |
766 | continue; | 766 | continue; |
767 | } | 767 | } |
768 | 768 | ||
769 | if (PageLRU(page)) { | 769 | if (PageLRU(page)) { |
770 | struct zone *pagezone = page_zone(page); | 770 | struct pglist_data *pgdat = page_pgdat(page); |
771 | 771 | ||
772 | if (pagezone != zone) { | 772 | if (pgdat != locked_pgdat) { |
773 | if (zone) | 773 | if (locked_pgdat) |
774 | spin_unlock_irqrestore(&zone->lru_lock, | 774 | spin_unlock_irqrestore(&locked_pgdat->lru_lock, |
775 | flags); | 775 | flags); |
776 | lock_batch = 0; | 776 | lock_batch = 0; |
777 | zone = pagezone; | 777 | locked_pgdat = pgdat; |
778 | spin_lock_irqsave(&zone->lru_lock, flags); | 778 | spin_lock_irqsave(&locked_pgdat->lru_lock, flags); |
779 | } | 779 | } |
780 | 780 | ||
781 | lruvec = mem_cgroup_page_lruvec(page, zone); | 781 | lruvec = mem_cgroup_page_lruvec(page, locked_pgdat); |
782 | VM_BUG_ON_PAGE(!PageLRU(page), page); | 782 | VM_BUG_ON_PAGE(!PageLRU(page), page); |
783 | __ClearPageLRU(page); | 783 | __ClearPageLRU(page); |
784 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); | 784 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); |
@@ -789,8 +789,8 @@ void release_pages(struct page **pages, int nr, bool cold) | |||
789 | 789 | ||
790 | list_add(&page->lru, &pages_to_free); | 790 | list_add(&page->lru, &pages_to_free); |
791 | } | 791 | } |
792 | if (zone) | 792 | if (locked_pgdat) |
793 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 793 | spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); |
794 | 794 | ||
795 | mem_cgroup_uncharge_list(&pages_to_free); | 795 | mem_cgroup_uncharge_list(&pages_to_free); |
796 | free_hot_cold_page_list(&pages_to_free, cold); | 796 | free_hot_cold_page_list(&pages_to_free, cold); |
@@ -826,7 +826,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, | |||
826 | VM_BUG_ON_PAGE(PageCompound(page_tail), page); | 826 | VM_BUG_ON_PAGE(PageCompound(page_tail), page); |
827 | VM_BUG_ON_PAGE(PageLRU(page_tail), page); | 827 | VM_BUG_ON_PAGE(PageLRU(page_tail), page); |
828 | VM_BUG_ON(NR_CPUS != 1 && | 828 | VM_BUG_ON(NR_CPUS != 1 && |
829 | !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); | 829 | !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock)); |
830 | 830 | ||
831 | if (!list) | 831 | if (!list) |
832 | SetPageLRU(page_tail); | 832 | SetPageLRU(page_tail); |
diff --git a/mm/swap_state.c b/mm/swap_state.c index c99463ac02fb..c8310a37be3a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -95,7 +95,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) | |||
95 | entry.val, page); | 95 | entry.val, page); |
96 | if (likely(!error)) { | 96 | if (likely(!error)) { |
97 | address_space->nrpages++; | 97 | address_space->nrpages++; |
98 | __inc_zone_page_state(page, NR_FILE_PAGES); | 98 | __inc_node_page_state(page, NR_FILE_PAGES); |
99 | INC_CACHE_INFO(add_total); | 99 | INC_CACHE_INFO(add_total); |
100 | } | 100 | } |
101 | spin_unlock_irq(&address_space->tree_lock); | 101 | spin_unlock_irq(&address_space->tree_lock); |
@@ -147,7 +147,7 @@ void __delete_from_swap_cache(struct page *page) | |||
147 | set_page_private(page, 0); | 147 | set_page_private(page, 0); |
148 | ClearPageSwapCache(page); | 148 | ClearPageSwapCache(page); |
149 | address_space->nrpages--; | 149 | address_space->nrpages--; |
150 | __dec_zone_page_state(page, NR_FILE_PAGES); | 150 | __dec_node_page_state(page, NR_FILE_PAGES); |
151 | INC_CACHE_INFO(del_total); | 151 | INC_CACHE_INFO(del_total); |
152 | } | 152 | } |
153 | 153 | ||
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -528,7 +528,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
528 | 528 | ||
529 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 529 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
530 | free = global_page_state(NR_FREE_PAGES); | 530 | free = global_page_state(NR_FREE_PAGES); |
531 | free += global_page_state(NR_FILE_PAGES); | 531 | free += global_node_page_state(NR_FILE_PAGES); |
532 | 532 | ||
533 | /* | 533 | /* |
534 | * shmem pages shouldn't be counted as free in this | 534 | * shmem pages shouldn't be counted as free in this |
@@ -536,7 +536,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
536 | * that won't affect the overall amount of available | 536 | * that won't affect the overall amount of available |
537 | * memory in the system. | 537 | * memory in the system. |
538 | */ | 538 | */ |
539 | free -= global_page_state(NR_SHMEM); | 539 | free -= global_node_page_state(NR_SHMEM); |
540 | 540 | ||
541 | free += get_nr_swap_pages(); | 541 | free += get_nr_swap_pages(); |
542 | 542 | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index 21d417ccff69..650d26832569 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -84,6 +84,9 @@ struct scan_control { | |||
84 | /* Scan (total_size >> priority) pages at once */ | 84 | /* Scan (total_size >> priority) pages at once */ |
85 | int priority; | 85 | int priority; |
86 | 86 | ||
87 | /* The highest zone to isolate pages for reclaim from */ | ||
88 | enum zone_type reclaim_idx; | ||
89 | |||
87 | unsigned int may_writepage:1; | 90 | unsigned int may_writepage:1; |
88 | 91 | ||
89 | /* Can mapped pages be reclaimed? */ | 92 | /* Can mapped pages be reclaimed? */ |
@@ -191,26 +194,44 @@ static bool sane_reclaim(struct scan_control *sc) | |||
191 | } | 194 | } |
192 | #endif | 195 | #endif |
193 | 196 | ||
197 | /* | ||
198 | * This misses isolated pages which are not accounted for to save counters. | ||
199 | * As the data only determines if reclaim or compaction continues, it is | ||
200 | * not expected that isolated pages will be a dominating factor. | ||
201 | */ | ||
194 | unsigned long zone_reclaimable_pages(struct zone *zone) | 202 | unsigned long zone_reclaimable_pages(struct zone *zone) |
195 | { | 203 | { |
196 | unsigned long nr; | 204 | unsigned long nr; |
197 | 205 | ||
198 | nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) + | 206 | nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + |
199 | zone_page_state_snapshot(zone, NR_INACTIVE_FILE) + | 207 | zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); |
200 | zone_page_state_snapshot(zone, NR_ISOLATED_FILE); | 208 | if (get_nr_swap_pages() > 0) |
209 | nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + | ||
210 | zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); | ||
211 | |||
212 | return nr; | ||
213 | } | ||
214 | |||
215 | unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) | ||
216 | { | ||
217 | unsigned long nr; | ||
218 | |||
219 | nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) + | ||
220 | node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) + | ||
221 | node_page_state_snapshot(pgdat, NR_ISOLATED_FILE); | ||
201 | 222 | ||
202 | if (get_nr_swap_pages() > 0) | 223 | if (get_nr_swap_pages() > 0) |
203 | nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) + | 224 | nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) + |
204 | zone_page_state_snapshot(zone, NR_INACTIVE_ANON) + | 225 | node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) + |
205 | zone_page_state_snapshot(zone, NR_ISOLATED_ANON); | 226 | node_page_state_snapshot(pgdat, NR_ISOLATED_ANON); |
206 | 227 | ||
207 | return nr; | 228 | return nr; |
208 | } | 229 | } |
209 | 230 | ||
210 | bool zone_reclaimable(struct zone *zone) | 231 | bool pgdat_reclaimable(struct pglist_data *pgdat) |
211 | { | 232 | { |
212 | return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) < | 233 | return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) < |
213 | zone_reclaimable_pages(zone) * 6; | 234 | pgdat_reclaimable_pages(pgdat) * 6; |
214 | } | 235 | } |
215 | 236 | ||
216 | unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) | 237 | unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) |
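Aside: pgdat_reclaimable() above keeps a node in the "reclaimable" state while the pages scanned stay below six times the estimated reclaimable pages. A tiny sketch of that threshold with made-up numbers (plain C, no kernel types):

#include <stdio.h>
#include <stdbool.h>

/* Mirror of the 6x heuristic: a node counts as reclaimable while the
 * scan effort is still under six times the reclaimable-page estimate. */
static bool node_reclaimable(unsigned long scanned, unsigned long reclaimable)
{
        return scanned < reclaimable * 6;
}

int main(void)
{
        unsigned long reclaimable = 1000;   /* file LRU pages, plus anon if swap exists */

        printf("scanned=5000 -> %d\n", node_reclaimable(5000, reclaimable));  /* 1 */
        printf("scanned=6000 -> %d\n", node_reclaimable(6000, reclaimable));  /* 0 */
        return 0;
}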
@@ -218,7 +239,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) | |||
218 | if (!mem_cgroup_disabled()) | 239 | if (!mem_cgroup_disabled()) |
219 | return mem_cgroup_get_lru_size(lruvec, lru); | 240 | return mem_cgroup_get_lru_size(lruvec, lru); |
220 | 241 | ||
221 | return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); | 242 | return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); |
222 | } | 243 | } |
223 | 244 | ||
224 | /* | 245 | /* |
@@ -593,7 +614,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
593 | ClearPageReclaim(page); | 614 | ClearPageReclaim(page); |
594 | } | 615 | } |
595 | trace_mm_vmscan_writepage(page); | 616 | trace_mm_vmscan_writepage(page); |
596 | inc_zone_page_state(page, NR_VMSCAN_WRITE); | 617 | inc_node_page_state(page, NR_VMSCAN_WRITE); |
597 | return PAGE_SUCCESS; | 618 | return PAGE_SUCCESS; |
598 | } | 619 | } |
599 | 620 | ||
@@ -877,7 +898,7 @@ static void page_check_dirty_writeback(struct page *page, | |||
877 | * shrink_page_list() returns the number of reclaimed pages | 898 | * shrink_page_list() returns the number of reclaimed pages |
878 | */ | 899 | */ |
879 | static unsigned long shrink_page_list(struct list_head *page_list, | 900 | static unsigned long shrink_page_list(struct list_head *page_list, |
880 | struct zone *zone, | 901 | struct pglist_data *pgdat, |
881 | struct scan_control *sc, | 902 | struct scan_control *sc, |
882 | enum ttu_flags ttu_flags, | 903 | enum ttu_flags ttu_flags, |
883 | unsigned long *ret_nr_dirty, | 904 | unsigned long *ret_nr_dirty, |
@@ -917,7 +938,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
917 | goto keep; | 938 | goto keep; |
918 | 939 | ||
919 | VM_BUG_ON_PAGE(PageActive(page), page); | 940 | VM_BUG_ON_PAGE(PageActive(page), page); |
920 | VM_BUG_ON_PAGE(page_zone(page) != zone, page); | ||
921 | 941 | ||
922 | sc->nr_scanned++; | 942 | sc->nr_scanned++; |
923 | 943 | ||
@@ -996,7 +1016,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
996 | /* Case 1 above */ | 1016 | /* Case 1 above */ |
997 | if (current_is_kswapd() && | 1017 | if (current_is_kswapd() && |
998 | PageReclaim(page) && | 1018 | PageReclaim(page) && |
999 | test_bit(ZONE_WRITEBACK, &zone->flags)) { | 1019 | test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { |
1000 | nr_immediate++; | 1020 | nr_immediate++; |
1001 | goto keep_locked; | 1021 | goto keep_locked; |
1002 | 1022 | ||
@@ -1092,14 +1112,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1092 | */ | 1112 | */ |
1093 | if (page_is_file_cache(page) && | 1113 | if (page_is_file_cache(page) && |
1094 | (!current_is_kswapd() || | 1114 | (!current_is_kswapd() || |
1095 | !test_bit(ZONE_DIRTY, &zone->flags))) { | 1115 | !test_bit(PGDAT_DIRTY, &pgdat->flags))) { |
1096 | /* | 1116 | /* |
1097 | * Immediately reclaim when written back. | 1117 | * Immediately reclaim when written back. |
1098 | * Similar in principle to deactivate_page() | 1118 | * Similar in principle to deactivate_page() |
1099 | * except we already have the page isolated | 1119 | * except we already have the page isolated |
1100 | * and know it's dirty | 1120 | * and know it's dirty |
1101 | */ | 1121 | */ |
1102 | inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); | 1122 | inc_node_page_state(page, NR_VMSCAN_IMMEDIATE); |
1103 | SetPageReclaim(page); | 1123 | SetPageReclaim(page); |
1104 | 1124 | ||
1105 | goto keep_locked; | 1125 | goto keep_locked; |
@@ -1266,11 +1286,11 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
1266 | } | 1286 | } |
1267 | } | 1287 | } |
1268 | 1288 | ||
1269 | ret = shrink_page_list(&clean_pages, zone, &sc, | 1289 | ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, |
1270 | TTU_UNMAP|TTU_IGNORE_ACCESS, | 1290 | TTU_UNMAP|TTU_IGNORE_ACCESS, |
1271 | &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); | 1291 | &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); |
1272 | list_splice(&clean_pages, page_list); | 1292 | list_splice(&clean_pages, page_list); |
1273 | mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); | 1293 | mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); |
1274 | return ret; | 1294 | return ret; |
1275 | } | 1295 | } |
1276 | 1296 | ||
@@ -1348,8 +1368,31 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) | |||
1348 | return ret; | 1368 | return ret; |
1349 | } | 1369 | } |
1350 | 1370 | ||
1371 | |||
1351 | /* | 1372 | /* |
1352 | * zone->lru_lock is heavily contended. Some of the functions that | 1373 | * Update LRU sizes after isolating pages. The LRU size updates must |
1374 | * be complete before mem_cgroup_update_lru_size due to a sanity check. | ||
1375 | */ | ||
1376 | static __always_inline void update_lru_sizes(struct lruvec *lruvec, | ||
1377 | enum lru_list lru, unsigned long *nr_zone_taken, | ||
1378 | unsigned long nr_taken) | ||
1379 | { | ||
1380 | int zid; | ||
1381 | |||
1382 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | ||
1383 | if (!nr_zone_taken[zid]) | ||
1384 | continue; | ||
1385 | |||
1386 | __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); | ||
1387 | } | ||
1388 | |||
1389 | #ifdef CONFIG_MEMCG | ||
1390 | mem_cgroup_update_lru_size(lruvec, lru, -nr_taken); | ||
1391 | #endif | ||
1392 | } | ||
1393 | |||
1394 | /* | ||
1395 | * zone_lru_lock is heavily contended. Some of the functions that | ||
1353 | * shrink the lists perform better by taking out a batch of pages | 1396 | * shrink the lists perform better by taking out a batch of pages |
1354 | * and working on them outside the LRU lock. | 1397 | * and working on them outside the LRU lock. |
1355 | * | 1398 | * |
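For orientation: update_lru_sizes(), added above, subtracts the isolated pages from each zone's LRU counter before making the single memcg-wide adjustment. A condensed standalone version with mock counters (not kernel code):

#include <stdio.h>

#define MOCK_NR_ZONES 4

/* Per-zone LRU counters plus one aggregate, all mock values. */
static long zone_lru[MOCK_NR_ZONES];
static long memcg_lru;

/* Subtract what was taken from each zone, then adjust the aggregate once,
 * mirroring the ordering the comment in the patch insists on. */
static void update_lru_sizes(const unsigned long *nr_zone_taken,
                             unsigned long nr_taken)
{
        for (int zid = 0; zid < MOCK_NR_ZONES; zid++) {
                if (!nr_zone_taken[zid])
                        continue;
                zone_lru[zid] -= (long)nr_zone_taken[zid];
        }
        memcg_lru -= (long)nr_taken;
}

int main(void)
{
        unsigned long taken[MOCK_NR_ZONES] = { 0, 8, 24, 0 };

        zone_lru[1] = 100;
        zone_lru[2] = 100;
        memcg_lru = 200;
        update_lru_sizes(taken, 32);

        /* prints zone1=92 zone2=76 memcg=168 */
        printf("zone1=%ld zone2=%ld memcg=%ld\n", zone_lru[1], zone_lru[2], memcg_lru);
        return 0;
}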
@@ -1375,10 +1418,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1375 | { | 1418 | { |
1376 | struct list_head *src = &lruvec->lists[lru]; | 1419 | struct list_head *src = &lruvec->lists[lru]; |
1377 | unsigned long nr_taken = 0; | 1420 | unsigned long nr_taken = 0; |
1378 | unsigned long scan; | 1421 | unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; |
1422 | unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; | ||
1423 | unsigned long scan, nr_pages; | ||
1424 | LIST_HEAD(pages_skipped); | ||
1379 | 1425 | ||
1380 | for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && | 1426 | for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && |
1381 | !list_empty(src); scan++) { | 1427 | !list_empty(src);) { |
1382 | struct page *page; | 1428 | struct page *page; |
1383 | 1429 | ||
1384 | page = lru_to_page(src); | 1430 | page = lru_to_page(src); |
@@ -1386,9 +1432,23 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1386 | 1432 | ||
1387 | VM_BUG_ON_PAGE(!PageLRU(page), page); | 1433 | VM_BUG_ON_PAGE(!PageLRU(page), page); |
1388 | 1434 | ||
1435 | if (page_zonenum(page) > sc->reclaim_idx) { | ||
1436 | list_move(&page->lru, &pages_skipped); | ||
1437 | nr_skipped[page_zonenum(page)]++; | ||
1438 | continue; | ||
1439 | } | ||
1440 | |||
1441 | /* | ||
1442 | * Account for scanned and skipped separately to avoid the pgdat | ||
1443 | * being prematurely marked unreclaimable by pgdat_reclaimable. | ||
1444 | */ | ||
1445 | scan++; | ||
1446 | |||
1389 | switch (__isolate_lru_page(page, mode)) { | 1447 | switch (__isolate_lru_page(page, mode)) { |
1390 | case 0: | 1448 | case 0: |
1391 | nr_taken += hpage_nr_pages(page); | 1449 | nr_pages = hpage_nr_pages(page); |
1450 | nr_taken += nr_pages; | ||
1451 | nr_zone_taken[page_zonenum(page)] += nr_pages; | ||
1392 | list_move(&page->lru, dst); | 1452 | list_move(&page->lru, dst); |
1393 | break; | 1453 | break; |
1394 | 1454 | ||
@@ -1402,9 +1462,38 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1402 | } | 1462 | } |
1403 | } | 1463 | } |
1404 | 1464 | ||
1465 | /* | ||
1466 | * Splice any skipped pages to the start of the LRU list. Note that | ||
1467 | * this disrupts the LRU order when reclaiming for lower zones but | ||
1468 | * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX | ||
1469 | * scanning would soon rescan the same pages to skip and put the | ||
1470 | * system at risk of premature OOM. | ||
1471 | */ | ||
1472 | if (!list_empty(&pages_skipped)) { | ||
1473 | int zid; | ||
1474 | unsigned long total_skipped = 0; | ||
1475 | |||
1476 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | ||
1477 | if (!nr_skipped[zid]) | ||
1478 | continue; | ||
1479 | |||
1480 | __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); | ||
1481 | total_skipped += nr_skipped[zid]; | ||
1482 | } | ||
1483 | |||
1484 | /* | ||
1485 | * Account skipped pages as a partial scan as the pgdat may be | ||
1486 | * close to unreclaimable. If the LRU list is empty, account | ||
1487 | * skipped pages as a full scan. | ||
1488 | */ | ||
1489 | scan += list_empty(src) ? total_skipped : total_skipped >> 2; | ||
1490 | |||
1491 | list_splice(&pages_skipped, src); | ||
1492 | } | ||
1405 | *nr_scanned = scan; | 1493 | *nr_scanned = scan; |
1406 | trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, | 1494 | trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, |
1407 | nr_taken, mode, is_file_lru(lru)); | 1495 | nr_taken, mode, is_file_lru(lru)); |
1496 | update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken); | ||
1408 | return nr_taken; | 1497 | return nr_taken; |
1409 | } | 1498 | } |
1410 | 1499 | ||
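Aside: the isolation loop now skips pages sitting in zones above sc->reclaim_idx and splices them back to the head of the LRU afterwards. Below is a userspace sketch of that skip-and-requeue pattern using an array-backed mock LRU; all names are invented, and the partial-scan accounting for skipped pages is omitted for brevity.

#include <stdio.h>
#include <string.h>

#define RECLAIM_IDX 1   /* only zones 0..1 are eligible, like sc->reclaim_idx */

struct mock_page { int id; int zone; };

/* Scan up to nr_to_scan pages from the tail of src: eligible pages are
 * isolated into dst, pages from higher zones are set aside and later
 * spliced back to the head of src so they are not rescanned immediately. */
static int isolate(struct mock_page *src, int *src_len,
                   struct mock_page *dst, int nr_to_scan)
{
        struct mock_page skipped[16];
        int nr_skipped = 0, nr_taken = 0, scanned = 0;

        while (scanned < nr_to_scan && *src_len > 0) {
                struct mock_page page = src[--(*src_len)];      /* take from the tail */

                if (page.zone > RECLAIM_IDX) {
                        skipped[nr_skipped++] = page;           /* ineligible zone */
                        continue;
                }
                scanned++;
                dst[nr_taken++] = page;
        }

        /* Splice the skipped pages back to the head of the source list. */
        memmove(src + nr_skipped, src, (size_t)*src_len * sizeof(*src));
        memcpy(src, skipped, (size_t)nr_skipped * sizeof(*skipped));
        *src_len += nr_skipped;

        return nr_taken;
}

int main(void)
{
        struct mock_page lru[16] = {
                { 1, 0 }, { 2, 3 }, { 3, 1 }, { 4, 2 }, { 5, 0 },
        };
        struct mock_page taken[16];
        int lru_len = 5;
        int nr = isolate(lru, &lru_len, taken, 3);

        printf("isolated %d, %d left on lru (head zone %d)\n",
               nr, lru_len, lru[0].zone);
        return 0;
}

Splicing to the head rather than the tail is the trade-off the patch comment describes: it disturbs LRU order, but keeps the next batch from immediately rescanning the same ineligible pages.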
@@ -1444,8 +1533,8 @@ int isolate_lru_page(struct page *page) | |||
1444 | struct zone *zone = page_zone(page); | 1533 | struct zone *zone = page_zone(page); |
1445 | struct lruvec *lruvec; | 1534 | struct lruvec *lruvec; |
1446 | 1535 | ||
1447 | spin_lock_irq(&zone->lru_lock); | 1536 | spin_lock_irq(zone_lru_lock(zone)); |
1448 | lruvec = mem_cgroup_page_lruvec(page, zone); | 1537 | lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); |
1449 | if (PageLRU(page)) { | 1538 | if (PageLRU(page)) { |
1450 | int lru = page_lru(page); | 1539 | int lru = page_lru(page); |
1451 | get_page(page); | 1540 | get_page(page); |
@@ -1453,7 +1542,7 @@ int isolate_lru_page(struct page *page) | |||
1453 | del_page_from_lru_list(page, lruvec, lru); | 1542 | del_page_from_lru_list(page, lruvec, lru); |
1454 | ret = 0; | 1543 | ret = 0; |
1455 | } | 1544 | } |
1456 | spin_unlock_irq(&zone->lru_lock); | 1545 | spin_unlock_irq(zone_lru_lock(zone)); |
1457 | } | 1546 | } |
1458 | return ret; | 1547 | return ret; |
1459 | } | 1548 | } |
@@ -1465,7 +1554,7 @@ int isolate_lru_page(struct page *page) | |||
1465 | * the LRU list will go small and be scanned faster than necessary, leading to | 1554 | * the LRU list will go small and be scanned faster than necessary, leading to |
1466 | * unnecessary swapping, thrashing and OOM. | 1555 | * unnecessary swapping, thrashing and OOM. |
1467 | */ | 1556 | */ |
1468 | static int too_many_isolated(struct zone *zone, int file, | 1557 | static int too_many_isolated(struct pglist_data *pgdat, int file, |
1469 | struct scan_control *sc) | 1558 | struct scan_control *sc) |
1470 | { | 1559 | { |
1471 | unsigned long inactive, isolated; | 1560 | unsigned long inactive, isolated; |
@@ -1477,11 +1566,11 @@ static int too_many_isolated(struct zone *zone, int file, | |||
1477 | return 0; | 1566 | return 0; |
1478 | 1567 | ||
1479 | if (file) { | 1568 | if (file) { |
1480 | inactive = zone_page_state(zone, NR_INACTIVE_FILE); | 1569 | inactive = node_page_state(pgdat, NR_INACTIVE_FILE); |
1481 | isolated = zone_page_state(zone, NR_ISOLATED_FILE); | 1570 | isolated = node_page_state(pgdat, NR_ISOLATED_FILE); |
1482 | } else { | 1571 | } else { |
1483 | inactive = zone_page_state(zone, NR_INACTIVE_ANON); | 1572 | inactive = node_page_state(pgdat, NR_INACTIVE_ANON); |
1484 | isolated = zone_page_state(zone, NR_ISOLATED_ANON); | 1573 | isolated = node_page_state(pgdat, NR_ISOLATED_ANON); |
1485 | } | 1574 | } |
1486 | 1575 | ||
1487 | /* | 1576 | /* |
@@ -1499,7 +1588,7 @@ static noinline_for_stack void | |||
1499 | putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) | 1588 | putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) |
1500 | { | 1589 | { |
1501 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; | 1590 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
1502 | struct zone *zone = lruvec_zone(lruvec); | 1591 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); |
1503 | LIST_HEAD(pages_to_free); | 1592 | LIST_HEAD(pages_to_free); |
1504 | 1593 | ||
1505 | /* | 1594 | /* |
@@ -1512,13 +1601,13 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) | |||
1512 | VM_BUG_ON_PAGE(PageLRU(page), page); | 1601 | VM_BUG_ON_PAGE(PageLRU(page), page); |
1513 | list_del(&page->lru); | 1602 | list_del(&page->lru); |
1514 | if (unlikely(!page_evictable(page))) { | 1603 | if (unlikely(!page_evictable(page))) { |
1515 | spin_unlock_irq(&zone->lru_lock); | 1604 | spin_unlock_irq(&pgdat->lru_lock); |
1516 | putback_lru_page(page); | 1605 | putback_lru_page(page); |
1517 | spin_lock_irq(&zone->lru_lock); | 1606 | spin_lock_irq(&pgdat->lru_lock); |
1518 | continue; | 1607 | continue; |
1519 | } | 1608 | } |
1520 | 1609 | ||
1521 | lruvec = mem_cgroup_page_lruvec(page, zone); | 1610 | lruvec = mem_cgroup_page_lruvec(page, pgdat); |
1522 | 1611 | ||
1523 | SetPageLRU(page); | 1612 | SetPageLRU(page); |
1524 | lru = page_lru(page); | 1613 | lru = page_lru(page); |
@@ -1535,10 +1624,10 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) | |||
1535 | del_page_from_lru_list(page, lruvec, lru); | 1624 | del_page_from_lru_list(page, lruvec, lru); |
1536 | 1625 | ||
1537 | if (unlikely(PageCompound(page))) { | 1626 | if (unlikely(PageCompound(page))) { |
1538 | spin_unlock_irq(&zone->lru_lock); | 1627 | spin_unlock_irq(&pgdat->lru_lock); |
1539 | mem_cgroup_uncharge(page); | 1628 | mem_cgroup_uncharge(page); |
1540 | (*get_compound_page_dtor(page))(page); | 1629 | (*get_compound_page_dtor(page))(page); |
1541 | spin_lock_irq(&zone->lru_lock); | 1630 | spin_lock_irq(&pgdat->lru_lock); |
1542 | } else | 1631 | } else |
1543 | list_add(&page->lru, &pages_to_free); | 1632 | list_add(&page->lru, &pages_to_free); |
1544 | } | 1633 | } |
@@ -1563,8 +1652,32 @@ static int current_may_throttle(void) | |||
1563 | bdi_write_congested(current->backing_dev_info); | 1652 | bdi_write_congested(current->backing_dev_info); |
1564 | } | 1653 | } |
1565 | 1654 | ||
1655 | static bool inactive_reclaimable_pages(struct lruvec *lruvec, | ||
1656 | struct scan_control *sc, enum lru_list lru) | ||
1657 | { | ||
1658 | int zid; | ||
1659 | struct zone *zone; | ||
1660 | int file = is_file_lru(lru); | ||
1661 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); | ||
1662 | |||
1663 | if (!global_reclaim(sc)) | ||
1664 | return true; | ||
1665 | |||
1666 | for (zid = sc->reclaim_idx; zid >= 0; zid--) { | ||
1667 | zone = &pgdat->node_zones[zid]; | ||
1668 | if (!populated_zone(zone)) | ||
1669 | continue; | ||
1670 | |||
1671 | if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE + | ||
1672 | LRU_FILE * file) >= SWAP_CLUSTER_MAX) | ||
1673 | return true; | ||
1674 | } | ||
1675 | |||
1676 | return false; | ||
1677 | } | ||
1678 | |||
1566 | /* | 1679 | /* |
1567 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number | 1680 | * shrink_inactive_list() is a helper for shrink_node(). It returns the number |
1568 | * of reclaimed pages | 1681 | * of reclaimed pages |
1569 | */ | 1682 | */ |
1570 | static noinline_for_stack unsigned long | 1683 | static noinline_for_stack unsigned long |
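Aside: inactive_reclaimable_pages(), added above, lets global reclaim skip an LRU when no eligible zone holds at least SWAP_CLUSTER_MAX pages on it. A minimal sketch of that eligibility walk with mock per-zone counts:

#include <stdio.h>
#include <stdbool.h>

#define MOCK_NR_ZONES    4
#define SWAP_CLUSTER_MAX 32

/* Walk the eligible zones (0..reclaim_idx) and report whether any of them
 * has enough pages on this LRU to make a scan worthwhile. */
static bool worth_scanning(const unsigned long *zone_lru_pages, int reclaim_idx)
{
        for (int zid = reclaim_idx; zid >= 0; zid--)
                if (zone_lru_pages[zid] >= SWAP_CLUSTER_MAX)
                        return true;
        return false;
}

int main(void)
{
        unsigned long lru[MOCK_NR_ZONES] = { 10, 5, 500, 0 };

        /* Allocation constrained to zones 0..1: too little there, skip the scan. */
        printf("reclaim_idx=1 -> %d\n", worth_scanning(lru, 1));   /* 0 */
        /* Zone 2 allowed as well: plenty of pages, scanning is worthwhile. */
        printf("reclaim_idx=2 -> %d\n", worth_scanning(lru, 2));   /* 1 */
        return 0;
}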
@@ -1582,10 +1695,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1582 | unsigned long nr_immediate = 0; | 1695 | unsigned long nr_immediate = 0; |
1583 | isolate_mode_t isolate_mode = 0; | 1696 | isolate_mode_t isolate_mode = 0; |
1584 | int file = is_file_lru(lru); | 1697 | int file = is_file_lru(lru); |
1585 | struct zone *zone = lruvec_zone(lruvec); | 1698 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); |
1586 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; | 1699 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
1587 | 1700 | ||
1588 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1701 | if (!inactive_reclaimable_pages(lruvec, sc, lru)) |
1702 | return 0; | ||
1703 | |||
1704 | while (unlikely(too_many_isolated(pgdat, file, sc))) { | ||
1589 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1705 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1590 | 1706 | ||
1591 | /* We are about to die and free our memory. Return now. */ | 1707 | /* We are about to die and free our memory. Return now. */ |
@@ -1600,48 +1716,45 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1600 | if (!sc->may_writepage) | 1716 | if (!sc->may_writepage) |
1601 | isolate_mode |= ISOLATE_CLEAN; | 1717 | isolate_mode |= ISOLATE_CLEAN; |
1602 | 1718 | ||
1603 | spin_lock_irq(&zone->lru_lock); | 1719 | spin_lock_irq(&pgdat->lru_lock); |
1604 | 1720 | ||
1605 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, | 1721 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, |
1606 | &nr_scanned, sc, isolate_mode, lru); | 1722 | &nr_scanned, sc, isolate_mode, lru); |
1607 | 1723 | ||
1608 | update_lru_size(lruvec, lru, -nr_taken); | 1724 | __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); |
1609 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); | ||
1610 | reclaim_stat->recent_scanned[file] += nr_taken; | 1725 | reclaim_stat->recent_scanned[file] += nr_taken; |
1611 | 1726 | ||
1612 | if (global_reclaim(sc)) { | 1727 | if (global_reclaim(sc)) { |
1613 | __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); | 1728 | __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); |
1614 | if (current_is_kswapd()) | 1729 | if (current_is_kswapd()) |
1615 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); | 1730 | __count_vm_events(PGSCAN_KSWAPD, nr_scanned); |
1616 | else | 1731 | else |
1617 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); | 1732 | __count_vm_events(PGSCAN_DIRECT, nr_scanned); |
1618 | } | 1733 | } |
1619 | spin_unlock_irq(&zone->lru_lock); | 1734 | spin_unlock_irq(&pgdat->lru_lock); |
1620 | 1735 | ||
1621 | if (nr_taken == 0) | 1736 | if (nr_taken == 0) |
1622 | return 0; | 1737 | return 0; |
1623 | 1738 | ||
1624 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, | 1739 | nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, |
1625 | &nr_dirty, &nr_unqueued_dirty, &nr_congested, | 1740 | &nr_dirty, &nr_unqueued_dirty, &nr_congested, |
1626 | &nr_writeback, &nr_immediate, | 1741 | &nr_writeback, &nr_immediate, |
1627 | false); | 1742 | false); |
1628 | 1743 | ||
1629 | spin_lock_irq(&zone->lru_lock); | 1744 | spin_lock_irq(&pgdat->lru_lock); |
1630 | 1745 | ||
1631 | if (global_reclaim(sc)) { | 1746 | if (global_reclaim(sc)) { |
1632 | if (current_is_kswapd()) | 1747 | if (current_is_kswapd()) |
1633 | __count_zone_vm_events(PGSTEAL_KSWAPD, zone, | 1748 | __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); |
1634 | nr_reclaimed); | ||
1635 | else | 1749 | else |
1636 | __count_zone_vm_events(PGSTEAL_DIRECT, zone, | 1750 | __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); |
1637 | nr_reclaimed); | ||
1638 | } | 1751 | } |
1639 | 1752 | ||
1640 | putback_inactive_pages(lruvec, &page_list); | 1753 | putback_inactive_pages(lruvec, &page_list); |
1641 | 1754 | ||
1642 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); | 1755 | __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); |
1643 | 1756 | ||
1644 | spin_unlock_irq(&zone->lru_lock); | 1757 | spin_unlock_irq(&pgdat->lru_lock); |
1645 | 1758 | ||
1646 | mem_cgroup_uncharge_list(&page_list); | 1759 | mem_cgroup_uncharge_list(&page_list); |
1647 | free_hot_cold_page_list(&page_list, true); | 1760 | free_hot_cold_page_list(&page_list, true); |
@@ -1661,7 +1774,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1661 | * are encountered in the nr_immediate check below. | 1774 | * are encountered in the nr_immediate check below. |
1662 | */ | 1775 | */ |
1663 | if (nr_writeback && nr_writeback == nr_taken) | 1776 | if (nr_writeback && nr_writeback == nr_taken) |
1664 | set_bit(ZONE_WRITEBACK, &zone->flags); | 1777 | set_bit(PGDAT_WRITEBACK, &pgdat->flags); |
1665 | 1778 | ||
1666 | /* | 1779 | /* |
1667 | * Legacy memcg will stall in page writeback so avoid forcibly | 1780 | * Legacy memcg will stall in page writeback so avoid forcibly |
@@ -1673,16 +1786,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1673 | * backed by a congested BDI and wait_iff_congested will stall. | 1786 | * backed by a congested BDI and wait_iff_congested will stall. |
1674 | */ | 1787 | */ |
1675 | if (nr_dirty && nr_dirty == nr_congested) | 1788 | if (nr_dirty && nr_dirty == nr_congested) |
1676 | set_bit(ZONE_CONGESTED, &zone->flags); | 1789 | set_bit(PGDAT_CONGESTED, &pgdat->flags); |
1677 | 1790 | ||
1678 | /* | 1791 | /* |
1679 | * If dirty pages are scanned that are not queued for IO, it | 1792 | * If dirty pages are scanned that are not queued for IO, it |
1680 | * implies that flushers are not keeping up. In this case, flag | 1793 | * implies that flushers are not keeping up. In this case, flag |
1681 | * the zone ZONE_DIRTY and kswapd will start writing pages from | 1794 | * the pgdat PGDAT_DIRTY and kswapd will start writing pages from |
1682 | * reclaim context. | 1795 | * reclaim context. |
1683 | */ | 1796 | */ |
1684 | if (nr_unqueued_dirty == nr_taken) | 1797 | if (nr_unqueued_dirty == nr_taken) |
1685 | set_bit(ZONE_DIRTY, &zone->flags); | 1798 | set_bit(PGDAT_DIRTY, &pgdat->flags); |
1686 | 1799 | ||
1687 | /* | 1800 | /* |
1688 | * If kswapd scans pages marked for immediate | 1801 | |
@@ -1701,9 +1814,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1701 | */ | 1814 | */ |
1702 | if (!sc->hibernation_mode && !current_is_kswapd() && | 1815 | if (!sc->hibernation_mode && !current_is_kswapd() && |
1703 | current_may_throttle()) | 1816 | current_may_throttle()) |
1704 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); | 1817 | wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10); |
1705 | 1818 | ||
1706 | trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed, | 1819 | trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, |
1820 | nr_scanned, nr_reclaimed, | ||
1707 | sc->priority, file); | 1821 | sc->priority, file); |
1708 | return nr_reclaimed; | 1822 | return nr_reclaimed; |
1709 | } | 1823 | } |
@@ -1715,9 +1829,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1715 | * processes, from rmap. | 1829 | * processes, from rmap. |
1716 | * | 1830 | * |
1717 | * If the pages are mostly unmapped, the processing is fast and it is | 1831 | * If the pages are mostly unmapped, the processing is fast and it is |
1718 | * appropriate to hold zone->lru_lock across the whole operation. But if | 1832 | * appropriate to hold zone_lru_lock across the whole operation. But if |
1719 | * the pages are mapped, the processing is slow (page_referenced()) so we | 1833 | * the pages are mapped, the processing is slow (page_referenced()) so we |
1720 | * should drop zone->lru_lock around each page. It's impossible to balance | 1834 | * should drop zone_lru_lock around each page. It's impossible to balance |
1721 | * this, so instead we remove the pages from the LRU while processing them. | 1835 | * this, so instead we remove the pages from the LRU while processing them. |
1722 | * It is safe to rely on PG_active against the non-LRU pages in here because | 1836 | * It is safe to rely on PG_active against the non-LRU pages in here because |
1723 | * nobody will play with that bit on a non-LRU page. | 1837 | * nobody will play with that bit on a non-LRU page. |
@@ -1731,20 +1845,20 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, | |||
1731 | struct list_head *pages_to_free, | 1845 | struct list_head *pages_to_free, |
1732 | enum lru_list lru) | 1846 | enum lru_list lru) |
1733 | { | 1847 | { |
1734 | struct zone *zone = lruvec_zone(lruvec); | 1848 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); |
1735 | unsigned long pgmoved = 0; | 1849 | unsigned long pgmoved = 0; |
1736 | struct page *page; | 1850 | struct page *page; |
1737 | int nr_pages; | 1851 | int nr_pages; |
1738 | 1852 | ||
1739 | while (!list_empty(list)) { | 1853 | while (!list_empty(list)) { |
1740 | page = lru_to_page(list); | 1854 | page = lru_to_page(list); |
1741 | lruvec = mem_cgroup_page_lruvec(page, zone); | 1855 | lruvec = mem_cgroup_page_lruvec(page, pgdat); |
1742 | 1856 | ||
1743 | VM_BUG_ON_PAGE(PageLRU(page), page); | 1857 | VM_BUG_ON_PAGE(PageLRU(page), page); |
1744 | SetPageLRU(page); | 1858 | SetPageLRU(page); |
1745 | 1859 | ||
1746 | nr_pages = hpage_nr_pages(page); | 1860 | nr_pages = hpage_nr_pages(page); |
1747 | update_lru_size(lruvec, lru, nr_pages); | 1861 | update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); |
1748 | list_move(&page->lru, &lruvec->lists[lru]); | 1862 | list_move(&page->lru, &lruvec->lists[lru]); |
1749 | pgmoved += nr_pages; | 1863 | pgmoved += nr_pages; |
1750 | 1864 | ||
@@ -1754,10 +1868,10 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, | |||
1754 | del_page_from_lru_list(page, lruvec, lru); | 1868 | del_page_from_lru_list(page, lruvec, lru); |
1755 | 1869 | ||
1756 | if (unlikely(PageCompound(page))) { | 1870 | if (unlikely(PageCompound(page))) { |
1757 | spin_unlock_irq(&zone->lru_lock); | 1871 | spin_unlock_irq(&pgdat->lru_lock); |
1758 | mem_cgroup_uncharge(page); | 1872 | mem_cgroup_uncharge(page); |
1759 | (*get_compound_page_dtor(page))(page); | 1873 | (*get_compound_page_dtor(page))(page); |
1760 | spin_lock_irq(&zone->lru_lock); | 1874 | spin_lock_irq(&pgdat->lru_lock); |
1761 | } else | 1875 | } else |
1762 | list_add(&page->lru, pages_to_free); | 1876 | list_add(&page->lru, pages_to_free); |
1763 | } | 1877 | } |
@@ -1783,7 +1897,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1783 | unsigned long nr_rotated = 0; | 1897 | unsigned long nr_rotated = 0; |
1784 | isolate_mode_t isolate_mode = 0; | 1898 | isolate_mode_t isolate_mode = 0; |
1785 | int file = is_file_lru(lru); | 1899 | int file = is_file_lru(lru); |
1786 | struct zone *zone = lruvec_zone(lruvec); | 1900 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); |
1787 | 1901 | ||
1788 | lru_add_drain(); | 1902 | lru_add_drain(); |
1789 | 1903 | ||
@@ -1792,20 +1906,19 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1792 | if (!sc->may_writepage) | 1906 | if (!sc->may_writepage) |
1793 | isolate_mode |= ISOLATE_CLEAN; | 1907 | isolate_mode |= ISOLATE_CLEAN; |
1794 | 1908 | ||
1795 | spin_lock_irq(&zone->lru_lock); | 1909 | spin_lock_irq(&pgdat->lru_lock); |
1796 | 1910 | ||
1797 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, | 1911 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, |
1798 | &nr_scanned, sc, isolate_mode, lru); | 1912 | &nr_scanned, sc, isolate_mode, lru); |
1799 | 1913 | ||
1800 | update_lru_size(lruvec, lru, -nr_taken); | 1914 | __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); |
1801 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); | ||
1802 | reclaim_stat->recent_scanned[file] += nr_taken; | 1915 | reclaim_stat->recent_scanned[file] += nr_taken; |
1803 | 1916 | ||
1804 | if (global_reclaim(sc)) | 1917 | if (global_reclaim(sc)) |
1805 | __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); | 1918 | __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); |
1806 | __count_zone_vm_events(PGREFILL, zone, nr_scanned); | 1919 | __count_vm_events(PGREFILL, nr_scanned); |
1807 | 1920 | ||
1808 | spin_unlock_irq(&zone->lru_lock); | 1921 | spin_unlock_irq(&pgdat->lru_lock); |
1809 | 1922 | ||
1810 | while (!list_empty(&l_hold)) { | 1923 | while (!list_empty(&l_hold)) { |
1811 | cond_resched(); | 1924 | cond_resched(); |
@@ -1850,7 +1963,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1850 | /* | 1963 | /* |
1851 | * Move pages back to the lru list. | 1964 | * Move pages back to the lru list. |
1852 | */ | 1965 | */ |
1853 | spin_lock_irq(&zone->lru_lock); | 1966 | spin_lock_irq(&pgdat->lru_lock); |
1854 | /* | 1967 | /* |
1855 | * Count referenced pages from currently used mappings as rotated, | 1968 | * Count referenced pages from currently used mappings as rotated, |
1856 | * even though only some of them are actually re-activated. This | 1969 | * even though only some of them are actually re-activated. This |
@@ -1861,8 +1974,8 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1861 | 1974 | ||
1862 | move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); | 1975 | move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); |
1863 | move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); | 1976 | move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); |
1864 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); | 1977 | __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); |
1865 | spin_unlock_irq(&zone->lru_lock); | 1978 | spin_unlock_irq(&pgdat->lru_lock); |
1866 | 1979 | ||
1867 | mem_cgroup_uncharge_list(&l_hold); | 1980 | mem_cgroup_uncharge_list(&l_hold); |
1868 | free_hot_cold_page_list(&l_hold, true); | 1981 | free_hot_cold_page_list(&l_hold, true); |
@@ -1894,12 +2007,15 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1894 | * 1TB 101 10GB | 2007 | * 1TB 101 10GB |
1895 | * 10TB 320 32GB | 2008 | * 10TB 320 32GB |
1896 | */ | 2009 | */ |
1897 | static bool inactive_list_is_low(struct lruvec *lruvec, bool file) | 2010 | static bool inactive_list_is_low(struct lruvec *lruvec, bool file, |
2011 | struct scan_control *sc) | ||
1898 | { | 2012 | { |
1899 | unsigned long inactive_ratio; | 2013 | unsigned long inactive_ratio; |
1900 | unsigned long inactive; | 2014 | unsigned long inactive; |
1901 | unsigned long active; | 2015 | unsigned long active; |
1902 | unsigned long gb; | 2016 | unsigned long gb; |
2017 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); | ||
2018 | int zid; | ||
1903 | 2019 | ||
1904 | /* | 2020 | /* |
1905 | * If we don't have swap space, anonymous page deactivation | 2021 | * If we don't have swap space, anonymous page deactivation |
@@ -1911,6 +2027,27 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file) | |||
1911 | inactive = lruvec_lru_size(lruvec, file * LRU_FILE); | 2027 | inactive = lruvec_lru_size(lruvec, file * LRU_FILE); |
1912 | active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); | 2028 | active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); |
1913 | 2029 | ||
2030 | /* | ||
2031 | * For zone-constrained allocations, it is necessary to check if | ||
2032 | * deactivations are required for lowmem to be reclaimed. This | ||
2033 | * calculates the inactive/active pages available in eligible zones. | ||
2034 | */ | ||
2035 | for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) { | ||
2036 | struct zone *zone = &pgdat->node_zones[zid]; | ||
2037 | unsigned long inactive_zone, active_zone; | ||
2038 | |||
2039 | if (!populated_zone(zone)) | ||
2040 | continue; | ||
2041 | |||
2042 | inactive_zone = zone_page_state(zone, | ||
2043 | NR_ZONE_LRU_BASE + (file * LRU_FILE)); | ||
2044 | active_zone = zone_page_state(zone, | ||
2045 | NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE); | ||
2046 | |||
2047 | inactive -= min(inactive, inactive_zone); | ||
2048 | active -= min(active, active_zone); | ||
2049 | } | ||
2050 | |||
1914 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | 2051 | gb = (inactive + active) >> (30 - PAGE_SHIFT); |
1915 | if (gb) | 2052 | if (gb) |
1916 | inactive_ratio = int_sqrt(10 * gb); | 2053 | inactive_ratio = int_sqrt(10 * gb); |
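For orientation: the inactive/active balance uses a ratio that grows roughly as the square root of ten times the list size in gigabytes (the table in the comment above), and the hunk above now excludes pages in zones beyond sc->reclaim_idx before computing it. A standalone sketch of the ratio arithmetic, assuming 4KiB pages and using a simple integer square root in place of int_sqrt():

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4KiB pages for the arithmetic below */

/* Small integer square root, standing in for the kernel's int_sqrt(). */
static unsigned long isqrt(unsigned long x)
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

/* The inactive list counts as "low" once active > inactive * ratio,
 * where ratio ~ sqrt(10 * list size in GB), matching the table above. */
static int low_on_inactive(unsigned long inactive, unsigned long active)
{
        unsigned long gb = (inactive + active) >> (30 - PAGE_SHIFT);
        unsigned long ratio = gb ? isqrt(10 * gb) : 1;

        return inactive * ratio < active;
}

int main(void)
{
        /* ~10GB of eligible file pages -> ratio of about 10. */
        printf("low=%d\n", low_on_inactive(200000, 2421440));  /* 1 */
        printf("low=%d\n", low_on_inactive(500000, 2121440));  /* 0 */
        return 0;
}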
@@ -1924,7 +2061,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | |||
1924 | struct lruvec *lruvec, struct scan_control *sc) | 2061 | struct lruvec *lruvec, struct scan_control *sc) |
1925 | { | 2062 | { |
1926 | if (is_active_lru(lru)) { | 2063 | if (is_active_lru(lru)) { |
1927 | if (inactive_list_is_low(lruvec, is_file_lru(lru))) | 2064 | if (inactive_list_is_low(lruvec, is_file_lru(lru), sc)) |
1928 | shrink_active_list(nr_to_scan, lruvec, sc, lru); | 2065 | shrink_active_list(nr_to_scan, lruvec, sc, lru); |
1929 | return 0; | 2066 | return 0; |
1930 | } | 2067 | } |
@@ -1956,7 +2093,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, | |||
1956 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; | 2093 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
1957 | u64 fraction[2]; | 2094 | u64 fraction[2]; |
1958 | u64 denominator = 0; /* gcc */ | 2095 | u64 denominator = 0; /* gcc */ |
1959 | struct zone *zone = lruvec_zone(lruvec); | 2096 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); |
1960 | unsigned long anon_prio, file_prio; | 2097 | unsigned long anon_prio, file_prio; |
1961 | enum scan_balance scan_balance; | 2098 | enum scan_balance scan_balance; |
1962 | unsigned long anon, file; | 2099 | unsigned long anon, file; |
@@ -1977,7 +2114,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, | |||
1977 | * well. | 2114 | * well. |
1978 | */ | 2115 | */ |
1979 | if (current_is_kswapd()) { | 2116 | if (current_is_kswapd()) { |
1980 | if (!zone_reclaimable(zone)) | 2117 | if (!pgdat_reclaimable(pgdat)) |
1981 | force_scan = true; | 2118 | force_scan = true; |
1982 | if (!mem_cgroup_online(memcg)) | 2119 | if (!mem_cgroup_online(memcg)) |
1983 | force_scan = true; | 2120 | force_scan = true; |
@@ -2023,14 +2160,24 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, | |||
2023 | * anon pages. Try to detect this based on file LRU size. | 2160 | * anon pages. Try to detect this based on file LRU size. |
2024 | */ | 2161 | */ |
2025 | if (global_reclaim(sc)) { | 2162 | if (global_reclaim(sc)) { |
2026 | unsigned long zonefile; | 2163 | unsigned long pgdatfile; |
2027 | unsigned long zonefree; | 2164 | unsigned long pgdatfree; |
2165 | int z; | ||
2166 | unsigned long total_high_wmark = 0; | ||
2028 | 2167 | ||
2029 | zonefree = zone_page_state(zone, NR_FREE_PAGES); | 2168 | pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); |
2030 | zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + | 2169 | pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) + |
2031 | zone_page_state(zone, NR_INACTIVE_FILE); | 2170 | node_page_state(pgdat, NR_INACTIVE_FILE); |
2032 | 2171 | ||
2033 | if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { | 2172 | for (z = 0; z < MAX_NR_ZONES; z++) { |
2173 | struct zone *zone = &pgdat->node_zones[z]; | ||
2174 | if (!populated_zone(zone)) | ||
2175 | continue; | ||
2176 | |||
2177 | total_high_wmark += high_wmark_pages(zone); | ||
2178 | } | ||
2179 | |||
2180 | if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { | ||
2034 | scan_balance = SCAN_ANON; | 2181 | scan_balance = SCAN_ANON; |
2035 | goto out; | 2182 | goto out; |
2036 | } | 2183 | } |
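Aside: the hunk above replaces the per-zone "file + free pages under the high watermark" test with a sum of the high watermarks of the node's populated zones. A condensed sketch of that decision with mock watermark values (no kernel types; force_scan_anon is an invented name standing in for the SCAN_ANON branch):

#include <stdio.h>
#include <stdbool.h>

#define MOCK_NR_ZONES 3

struct mock_zone { bool populated; unsigned long high_wmark; };

/* Force anon scanning when the node's free + file pages no longer cover
 * the summed high watermarks of its populated zones. */
static bool force_scan_anon(const struct mock_zone *zones,
                            unsigned long node_free, unsigned long node_file)
{
        unsigned long total_high_wmark = 0;

        for (int z = 0; z < MOCK_NR_ZONES; z++) {
                if (!zones[z].populated)
                        continue;
                total_high_wmark += zones[z].high_wmark;
        }
        return node_file + node_free <= total_high_wmark;
}

int main(void)
{
        struct mock_zone zones[MOCK_NR_ZONES] = {
                { true, 1000 }, { true, 4000 }, { false, 0 },
        };

        printf("%d\n", force_scan_anon(zones, 3000, 1500));   /* 1: lean on anon */
        printf("%d\n", force_scan_anon(zones, 3000, 9000));   /* 0: file has room */
        return 0;
}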
@@ -2045,7 +2192,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, | |||
2045 | * lruvec even if it has plenty of old anonymous pages unless the | 2192 | * lruvec even if it has plenty of old anonymous pages unless the |
2046 | * system is under heavy pressure. | 2193 | * system is under heavy pressure. |
2047 | */ | 2194 | */ |
2048 | if (!inactive_list_is_low(lruvec, true) && | 2195 | if (!inactive_list_is_low(lruvec, true, sc) && |
2049 | lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { | 2196 | lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { |
2050 | scan_balance = SCAN_FILE; | 2197 | scan_balance = SCAN_FILE; |
2051 | goto out; | 2198 | goto out; |
@@ -2077,7 +2224,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, | |||
2077 | file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + | 2224 | file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + |
2078 | lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); | 2225 | lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); |
2079 | 2226 | ||
2080 | spin_lock_irq(&zone->lru_lock); | 2227 | spin_lock_irq(&pgdat->lru_lock); |
2081 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { | 2228 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { |
2082 | reclaim_stat->recent_scanned[0] /= 2; | 2229 | reclaim_stat->recent_scanned[0] /= 2; |
2083 | reclaim_stat->recent_rotated[0] /= 2; | 2230 | reclaim_stat->recent_rotated[0] /= 2; |
@@ -2098,7 +2245,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, | |||
2098 | 2245 | ||
2099 | fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); | 2246 | fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); |
2100 | fp /= reclaim_stat->recent_rotated[1] + 1; | 2247 | fp /= reclaim_stat->recent_rotated[1] + 1; |
2101 | spin_unlock_irq(&zone->lru_lock); | 2248 | spin_unlock_irq(&pgdat->lru_lock); |
2102 | 2249 | ||
2103 | fraction[0] = ap; | 2250 | fraction[0] = ap; |
2104 | fraction[1] = fp; | 2251 | fraction[1] = fp; |
@@ -2174,12 +2321,12 @@ static inline void init_tlb_ubc(void) | |||
2174 | #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ | 2321 | #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ |
2175 | 2322 | ||
2176 | /* | 2323 | /* |
2177 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 2324 | * This is a basic per-node page freer. Used by both kswapd and direct reclaim. |
2178 | */ | 2325 | */ |
2179 | static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, | 2326 | static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, |
2180 | struct scan_control *sc, unsigned long *lru_pages) | 2327 | struct scan_control *sc, unsigned long *lru_pages) |
2181 | { | 2328 | { |
2182 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2329 | struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); |
2183 | unsigned long nr[NR_LRU_LISTS]; | 2330 | unsigned long nr[NR_LRU_LISTS]; |
2184 | unsigned long targets[NR_LRU_LISTS]; | 2331 | unsigned long targets[NR_LRU_LISTS]; |
2185 | unsigned long nr_to_scan; | 2332 | unsigned long nr_to_scan; |
@@ -2287,7 +2434,7 @@ static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, | |||
2287 | * Even if we did not try to evict anon pages at all, we want to | 2434 | * Even if we did not try to evict anon pages at all, we want to |
2288 | * rebalance the anon lru active/inactive ratio. | 2435 | * rebalance the anon lru active/inactive ratio. |
2289 | */ | 2436 | */ |
2290 | if (inactive_list_is_low(lruvec, false)) | 2437 | if (inactive_list_is_low(lruvec, false, sc)) |
2291 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, | 2438 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, |
2292 | sc, LRU_ACTIVE_ANON); | 2439 | sc, LRU_ACTIVE_ANON); |
2293 | 2440 | ||
@@ -2312,13 +2459,14 @@ static bool in_reclaim_compaction(struct scan_control *sc) | |||
2312 | * calls try_to_compact_zone() that it will have enough free pages to succeed. | 2459 | * calls try_to_compact_zone() that it will have enough free pages to succeed. |
2313 | * It will give up earlier than that if there is difficulty reclaiming pages. | 2460 | * It will give up earlier than that if there is difficulty reclaiming pages. |
2314 | */ | 2461 | */ |
2315 | static inline bool should_continue_reclaim(struct zone *zone, | 2462 | static inline bool should_continue_reclaim(struct pglist_data *pgdat, |
2316 | unsigned long nr_reclaimed, | 2463 | unsigned long nr_reclaimed, |
2317 | unsigned long nr_scanned, | 2464 | unsigned long nr_scanned, |
2318 | struct scan_control *sc) | 2465 | struct scan_control *sc) |
2319 | { | 2466 | { |
2320 | unsigned long pages_for_compaction; | 2467 | unsigned long pages_for_compaction; |
2321 | unsigned long inactive_lru_pages; | 2468 | unsigned long inactive_lru_pages; |
2469 | int z; | ||
2322 | 2470 | ||
2323 | /* If not in reclaim/compaction mode, stop */ | 2471 | /* If not in reclaim/compaction mode, stop */ |
2324 | if (!in_reclaim_compaction(sc)) | 2472 | if (!in_reclaim_compaction(sc)) |
@@ -2352,25 +2500,32 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
2352 | * inactive lists are large enough, continue reclaiming | 2500 | * inactive lists are large enough, continue reclaiming |
2353 | */ | 2501 | */ |
2354 | pages_for_compaction = (2UL << sc->order); | 2502 | pages_for_compaction = (2UL << sc->order); |
2355 | inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); | 2503 | inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); |
2356 | if (get_nr_swap_pages() > 0) | 2504 | if (get_nr_swap_pages() > 0) |
2357 | inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); | 2505 | inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); |
2358 | if (sc->nr_reclaimed < pages_for_compaction && | 2506 | if (sc->nr_reclaimed < pages_for_compaction && |
2359 | inactive_lru_pages > pages_for_compaction) | 2507 | inactive_lru_pages > pages_for_compaction) |
2360 | return true; | 2508 | return true; |
2361 | 2509 | ||
2362 | /* If compaction would go ahead or the allocation would succeed, stop */ | 2510 | /* If compaction would go ahead or the allocation would succeed, stop */ |
2363 | switch (compaction_suitable(zone, sc->order, 0, 0)) { | 2511 | for (z = 0; z <= sc->reclaim_idx; z++) { |
2364 | case COMPACT_PARTIAL: | 2512 | struct zone *zone = &pgdat->node_zones[z]; |
2365 | case COMPACT_CONTINUE: | 2513 | if (!populated_zone(zone)) |
2366 | return false; | 2514 | continue; |
2367 | default: | 2515 | |
2368 | return true; | 2516 | switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { |
2517 | case COMPACT_PARTIAL: | ||
2518 | case COMPACT_CONTINUE: | ||
2519 | return false; | ||
2520 | default: | ||
2521 | /* check next zone */ | ||
2522 | ; | ||
2523 | } | ||
2369 | } | 2524 | } |
2525 | return true; | ||
2370 | } | 2526 | } |
2371 | 2527 | ||
2372 | static bool shrink_zone(struct zone *zone, struct scan_control *sc, | 2528 | static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) |
2373 | bool is_classzone) | ||
2374 | { | 2529 | { |
2375 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2530 | struct reclaim_state *reclaim_state = current->reclaim_state; |
2376 | unsigned long nr_reclaimed, nr_scanned; | 2531 | unsigned long nr_reclaimed, nr_scanned; |
@@ -2379,10 +2534,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
2379 | do { | 2534 | do { |
2380 | struct mem_cgroup *root = sc->target_mem_cgroup; | 2535 | struct mem_cgroup *root = sc->target_mem_cgroup; |
2381 | struct mem_cgroup_reclaim_cookie reclaim = { | 2536 | struct mem_cgroup_reclaim_cookie reclaim = { |
2382 | .zone = zone, | 2537 | .pgdat = pgdat, |
2383 | .priority = sc->priority, | 2538 | .priority = sc->priority, |
2384 | }; | 2539 | }; |
2385 | unsigned long zone_lru_pages = 0; | 2540 | unsigned long node_lru_pages = 0; |
2386 | struct mem_cgroup *memcg; | 2541 | struct mem_cgroup *memcg; |
2387 | 2542 | ||
2388 | nr_reclaimed = sc->nr_reclaimed; | 2543 | nr_reclaimed = sc->nr_reclaimed; |
@@ -2403,11 +2558,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
2403 | reclaimed = sc->nr_reclaimed; | 2558 | reclaimed = sc->nr_reclaimed; |
2404 | scanned = sc->nr_scanned; | 2559 | scanned = sc->nr_scanned; |
2405 | 2560 | ||
2406 | shrink_zone_memcg(zone, memcg, sc, &lru_pages); | 2561 | shrink_node_memcg(pgdat, memcg, sc, &lru_pages); |
2407 | zone_lru_pages += lru_pages; | 2562 | node_lru_pages += lru_pages; |
2408 | 2563 | ||
2409 | if (memcg && is_classzone) | 2564 | if (!global_reclaim(sc)) |
2410 | shrink_slab(sc->gfp_mask, zone_to_nid(zone), | 2565 | shrink_slab(sc->gfp_mask, pgdat->node_id, |
2411 | memcg, sc->nr_scanned - scanned, | 2566 | memcg, sc->nr_scanned - scanned, |
2412 | lru_pages); | 2567 | lru_pages); |
2413 | 2568 | ||
@@ -2419,7 +2574,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
2419 | /* | 2574 | /* |
2420 | * Direct reclaim and kswapd have to scan all memory | 2575 | * Direct reclaim and kswapd have to scan all memory |
2421 | * cgroups to fulfill the overall scan target for the | 2576 | * cgroups to fulfill the overall scan target for the |
2422 | * zone. | 2577 | * node. |
2423 | * | 2578 | * |
2424 | * Limit reclaim, on the other hand, only cares about | 2579 | * Limit reclaim, on the other hand, only cares about |
2425 | * nr_to_reclaim pages to be reclaimed and it will | 2580 | * nr_to_reclaim pages to be reclaimed and it will |
@@ -2437,10 +2592,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
2437 | * Shrink the slab caches in the same proportion that | 2592 | * Shrink the slab caches in the same proportion that |
2438 | * the eligible LRU pages were scanned. | 2593 | * the eligible LRU pages were scanned. |
2439 | */ | 2594 | */ |
2440 | if (global_reclaim(sc) && is_classzone) | 2595 | if (global_reclaim(sc)) |
2441 | shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, | 2596 | shrink_slab(sc->gfp_mask, pgdat->node_id, NULL, |
2442 | sc->nr_scanned - nr_scanned, | 2597 | sc->nr_scanned - nr_scanned, |
2443 | zone_lru_pages); | 2598 | node_lru_pages); |
2444 | 2599 | ||
2445 | if (reclaim_state) { | 2600 | if (reclaim_state) { |
2446 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | 2601 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; |
@@ -2455,7 +2610,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
2455 | if (sc->nr_reclaimed - nr_reclaimed) | 2610 | if (sc->nr_reclaimed - nr_reclaimed) |
2456 | reclaimable = true; | 2611 | reclaimable = true; |
2457 | 2612 | ||
2458 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, | 2613 | } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, |
2459 | sc->nr_scanned - nr_scanned, sc)); | 2614 | sc->nr_scanned - nr_scanned, sc)); |
2460 | 2615 | ||
2461 | return reclaimable; | 2616 | return reclaimable; |
@@ -2465,9 +2620,9 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
2465 | * Returns true if compaction should go ahead for a high-order request, or | 2620 | * Returns true if compaction should go ahead for a high-order request, or |
2466 | * the high-order allocation would succeed without compaction. | 2621 | * the high-order allocation would succeed without compaction. |
2467 | */ | 2622 | */ |
2468 | static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx) | 2623 | static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) |
2469 | { | 2624 | { |
2470 | unsigned long balance_gap, watermark; | 2625 | unsigned long watermark; |
2471 | bool watermark_ok; | 2626 | bool watermark_ok; |
2472 | 2627 | ||
2473 | /* | 2628 | /* |
@@ -2476,23 +2631,21 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_ | |||
2476 | * there is a buffer of free pages available to give compaction | 2631 | * there is a buffer of free pages available to give compaction |
2477 | * a reasonable chance of completing and allocating the page | 2632 | * a reasonable chance of completing and allocating the page |
2478 | */ | 2633 | */ |
2479 | balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( | 2634 | watermark = high_wmark_pages(zone) + (2UL << sc->order); |
2480 | zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); | 2635 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); |
2481 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); | ||
2482 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx); | ||
2483 | 2636 | ||
2484 | /* | 2637 | /* |
2485 | * If compaction is deferred, reclaim up to a point where | 2638 | * If compaction is deferred, reclaim up to a point where |
2486 | * compaction will have a chance of success when re-enabled | 2639 | * compaction will have a chance of success when re-enabled |
2487 | */ | 2640 | */ |
2488 | if (compaction_deferred(zone, order)) | 2641 | if (compaction_deferred(zone, sc->order)) |
2489 | return watermark_ok; | 2642 | return watermark_ok; |
2490 | 2643 | ||
2491 | /* | 2644 | /* |
2492 | * If compaction is not ready to start and allocation is not likely | 2645 | * If compaction is not ready to start and allocation is not likely |
2493 | * to succeed without it, then keep reclaiming. | 2646 | * to succeed without it, then keep reclaiming. |
2494 | */ | 2647 | */ |
2495 | if (compaction_suitable(zone, order, 0, classzone_idx) == COMPACT_SKIPPED) | 2648 | if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED) |
2496 | return false; | 2649 | return false; |
2497 | 2650 | ||
2498 | return watermark_ok; | 2651 | return watermark_ok; |
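With the balance gap gone, compaction_ready() now aims for the plain high watermark plus twice the allocation size. A rough worked example of that buffer, using a hypothetical watermark and assuming 4 KiB pages:

    #include <stdio.h>

    int main(void)
    {
        unsigned long high_wmark = 12288;     /* hypothetical high watermark, in pages */
        unsigned int order = 9;               /* e.g. a 2 MiB THP made of 4 KiB pages */
        unsigned long buffer = 2UL << order;  /* 2UL << 9 = 1024 pages, i.e. 4 MiB */
        unsigned long watermark = high_wmark + buffer;

        printf("keep reclaiming until free pages >= %lu (%lu + %lu)\n",
               watermark, high_wmark, buffer);
        return 0;
    }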
@@ -2503,14 +2656,6 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_ | |||
2503 | * try to reclaim pages from zones which will satisfy the caller's allocation | 2656 | * try to reclaim pages from zones which will satisfy the caller's allocation |
2504 | * request. | 2657 | * request. |
2505 | * | 2658 | * |
2506 | * We reclaim from a zone even if that zone is over high_wmark_pages(zone). | ||
2507 | * Because: | ||
2508 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order | ||
2509 | * allocation or | ||
2510 | * b) The target zone may be at high_wmark_pages(zone) but the lower zones | ||
2511 | * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' | ||
2512 | * zone defense algorithm. | ||
2513 | * | ||
2514 | * If a zone is deemed to be full of pinned pages then just give it a light | 2659 | * If a zone is deemed to be full of pinned pages then just give it a light |
2515 | * scan then give up on it. | 2660 | * scan then give up on it. |
2516 | */ | 2661 | */ |
@@ -2521,7 +2666,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2521 | unsigned long nr_soft_reclaimed; | 2666 | unsigned long nr_soft_reclaimed; |
2522 | unsigned long nr_soft_scanned; | 2667 | unsigned long nr_soft_scanned; |
2523 | gfp_t orig_mask; | 2668 | gfp_t orig_mask; |
2524 | enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); | 2669 | pg_data_t *last_pgdat = NULL; |
2525 | 2670 | ||
2526 | /* | 2671 | /* |
2527 | * If the number of buffer_heads in the machine exceeds the maximum | 2672 | * If the number of buffer_heads in the machine exceeds the maximum |
@@ -2529,21 +2674,13 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2529 | * highmem pages could be pinning lowmem pages storing buffer_heads | 2674 | * highmem pages could be pinning lowmem pages storing buffer_heads |
2530 | */ | 2675 | */ |
2531 | orig_mask = sc->gfp_mask; | 2676 | orig_mask = sc->gfp_mask; |
2532 | if (buffer_heads_over_limit) | 2677 | if (buffer_heads_over_limit) { |
2533 | sc->gfp_mask |= __GFP_HIGHMEM; | 2678 | sc->gfp_mask |= __GFP_HIGHMEM; |
2679 | sc->reclaim_idx = gfp_zone(sc->gfp_mask); | ||
2680 | } | ||
2534 | 2681 | ||
2535 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2682 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2536 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2683 | sc->reclaim_idx, sc->nodemask) { |
2537 | enum zone_type classzone_idx; | ||
2538 | |||
2539 | if (!populated_zone(zone)) | ||
2540 | continue; | ||
2541 | |||
2542 | classzone_idx = requested_highidx; | ||
2543 | while (!populated_zone(zone->zone_pgdat->node_zones + | ||
2544 | classzone_idx)) | ||
2545 | classzone_idx--; | ||
2546 | |||
2547 | /* | 2684 | /* |
2548 | * Take care memory controller reclaiming has small influence | 2685 | * Take care memory controller reclaiming has small influence |
2549 | * to global LRU. | 2686 | * to global LRU. |
@@ -2554,7 +2691,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2554 | continue; | 2691 | continue; |
2555 | 2692 | ||
2556 | if (sc->priority != DEF_PRIORITY && | 2693 | if (sc->priority != DEF_PRIORITY && |
2557 | !zone_reclaimable(zone)) | 2694 | !pgdat_reclaimable(zone->zone_pgdat)) |
2558 | continue; /* Let kswapd poll it */ | 2695 | continue; /* Let kswapd poll it */ |
2559 | 2696 | ||
2560 | /* | 2697 | /* |
@@ -2568,20 +2705,28 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2568 | */ | 2705 | */ |
2569 | if (IS_ENABLED(CONFIG_COMPACTION) && | 2706 | if (IS_ENABLED(CONFIG_COMPACTION) && |
2570 | sc->order > PAGE_ALLOC_COSTLY_ORDER && | 2707 | sc->order > PAGE_ALLOC_COSTLY_ORDER && |
2571 | zonelist_zone_idx(z) <= requested_highidx && | 2708 | compaction_ready(zone, sc)) { |
2572 | compaction_ready(zone, sc->order, requested_highidx)) { | ||
2573 | sc->compaction_ready = true; | 2709 | sc->compaction_ready = true; |
2574 | continue; | 2710 | continue; |
2575 | } | 2711 | } |
2576 | 2712 | ||
2577 | /* | 2713 | /* |
2714 | * Shrink each node in the zonelist once. If the | ||
2715 | * zonelist is ordered by zone (not the default) then a | ||
2716 | * node may be shrunk multiple times but in that case | ||
2717 | * the user prefers lower zones being preserved. | ||
2718 | */ | ||
2719 | if (zone->zone_pgdat == last_pgdat) | ||
2720 | continue; | ||
2721 | |||
2722 | /* | ||
2578 | * This steals pages from memory cgroups over softlimit | 2723 | * This steals pages from memory cgroups over softlimit |
2579 | * and returns the number of reclaimed pages and | 2724 | * and returns the number of reclaimed pages and |
2580 | * scanned pages. This works for global memory pressure | 2725 | * scanned pages. This works for global memory pressure |
2581 | * and balancing, not for a memcg's limit. | 2726 | * and balancing, not for a memcg's limit. |
2582 | */ | 2727 | */ |
2583 | nr_soft_scanned = 0; | 2728 | nr_soft_scanned = 0; |
2584 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | 2729 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, |
2585 | sc->order, sc->gfp_mask, | 2730 | sc->order, sc->gfp_mask, |
2586 | &nr_soft_scanned); | 2731 | &nr_soft_scanned); |
2587 | sc->nr_reclaimed += nr_soft_reclaimed; | 2732 | sc->nr_reclaimed += nr_soft_reclaimed; |
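The comment and check above, completed by the last_pgdat assignment in the next hunk, make shrink_zones() reclaim from each node only once per zonelist walk, however many of its zones appear in the list. A toy version of that deduplication with invented node and zone arrays:

    #include <stdio.h>

    struct model_node { int id; };
    struct model_zone { struct model_node *node; };

    int main(void)
    {
        struct model_node n0 = { 0 }, n1 = { 1 };
        /* A zonelist usually lists several zones of the same node back to back. */
        struct model_zone zonelist[] = { { &n0 }, { &n0 }, { &n1 }, { &n1 } };
        struct model_node *last_node = NULL;

        for (unsigned int i = 0; i < sizeof(zonelist) / sizeof(zonelist[0]); i++) {
            if (zonelist[i].node == last_node)
                continue;               /* this node was already shrunk */
            last_node = zonelist[i].node;
            printf("shrink node %d\n", last_node->id);
        }
        return 0;
    }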
@@ -2589,7 +2734,11 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2589 | /* need some check to avoid more shrink_zone() */ | 2734 ||
2590 | } | 2735 | } |
2591 | 2736 | ||
2592 | shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); | 2737 | /* See comment about same check for global reclaim above */ |
2738 | if (zone->zone_pgdat == last_pgdat) | ||
2739 | continue; | ||
2740 | last_pgdat = zone->zone_pgdat; | ||
2741 | shrink_node(zone->zone_pgdat, sc); | ||
2593 | } | 2742 | } |
2594 | 2743 | ||
2595 | /* | 2744 | /* |
@@ -2625,7 +2774,7 @@ retry: | |||
2625 | delayacct_freepages_start(); | 2774 | delayacct_freepages_start(); |
2626 | 2775 | ||
2627 | if (global_reclaim(sc)) | 2776 | if (global_reclaim(sc)) |
2628 | count_vm_event(ALLOCSTALL); | 2777 | __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); |
2629 | 2778 | ||
2630 | do { | 2779 | do { |
2631 | vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, | 2780 | vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, |
@@ -2692,7 +2841,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | |||
2692 | for (i = 0; i <= ZONE_NORMAL; i++) { | 2841 | for (i = 0; i <= ZONE_NORMAL; i++) { |
2693 | zone = &pgdat->node_zones[i]; | 2842 | zone = &pgdat->node_zones[i]; |
2694 | if (!populated_zone(zone) || | 2843 | if (!populated_zone(zone) || |
2695 | zone_reclaimable_pages(zone) == 0) | 2844 | pgdat_reclaimable_pages(pgdat) == 0) |
2696 | continue; | 2845 | continue; |
2697 | 2846 | ||
2698 | pfmemalloc_reserve += min_wmark_pages(zone); | 2847 | pfmemalloc_reserve += min_wmark_pages(zone); |
@@ -2707,7 +2856,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | |||
2707 | 2856 | ||
2708 | /* kswapd must be awake if processes are being throttled */ | 2857 | /* kswapd must be awake if processes are being throttled */ |
2709 | if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { | 2858 | if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { |
2710 | pgdat->classzone_idx = min(pgdat->classzone_idx, | 2859 | pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx, |
2711 | (enum zone_type)ZONE_NORMAL); | 2860 | (enum zone_type)ZONE_NORMAL); |
2712 | wake_up_interruptible(&pgdat->kswapd_wait); | 2861 | wake_up_interruptible(&pgdat->kswapd_wait); |
2713 | } | 2862 | } |
@@ -2815,6 +2964,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2815 | struct scan_control sc = { | 2964 | struct scan_control sc = { |
2816 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2965 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2817 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), | 2966 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), |
2967 | .reclaim_idx = gfp_zone(gfp_mask), | ||
2818 | .order = order, | 2968 | .order = order, |
2819 | .nodemask = nodemask, | 2969 | .nodemask = nodemask, |
2820 | .priority = DEF_PRIORITY, | 2970 | .priority = DEF_PRIORITY, |
@@ -2833,7 +2983,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2833 | 2983 | ||
2834 | trace_mm_vmscan_direct_reclaim_begin(order, | 2984 | trace_mm_vmscan_direct_reclaim_begin(order, |
2835 | sc.may_writepage, | 2985 | sc.may_writepage, |
2836 | gfp_mask); | 2986 | gfp_mask, |
2987 | sc.reclaim_idx); | ||
2837 | 2988 | ||
2838 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | 2989 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); |
2839 | 2990 | ||
@@ -2844,9 +2995,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2844 | 2995 | ||
2845 | #ifdef CONFIG_MEMCG | 2996 | #ifdef CONFIG_MEMCG |
2846 | 2997 | ||
2847 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | 2998 | unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, |
2848 | gfp_t gfp_mask, bool noswap, | 2999 | gfp_t gfp_mask, bool noswap, |
2849 | struct zone *zone, | 3000 | pg_data_t *pgdat, |
2850 | unsigned long *nr_scanned) | 3001 | unsigned long *nr_scanned) |
2851 | { | 3002 | { |
2852 | struct scan_control sc = { | 3003 | struct scan_control sc = { |
@@ -2854,6 +3005,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
2854 | .target_mem_cgroup = memcg, | 3005 | .target_mem_cgroup = memcg, |
2855 | .may_writepage = !laptop_mode, | 3006 | .may_writepage = !laptop_mode, |
2856 | .may_unmap = 1, | 3007 | .may_unmap = 1, |
3008 | .reclaim_idx = MAX_NR_ZONES - 1, | ||
2857 | .may_swap = !noswap, | 3009 | .may_swap = !noswap, |
2858 | }; | 3010 | }; |
2859 | unsigned long lru_pages; | 3011 | unsigned long lru_pages; |
@@ -2863,16 +3015,17 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
2863 | 3015 | ||
2864 | trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, | 3016 | trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, |
2865 | sc.may_writepage, | 3017 | sc.may_writepage, |
2866 | sc.gfp_mask); | 3018 | sc.gfp_mask, |
3019 | sc.reclaim_idx); | ||
2867 | 3020 | ||
2868 | /* | 3021 | /* |
2869 | * NOTE: Although we can get the priority field, using it | 3022 | * NOTE: Although we can get the priority field, using it |
2870 | * here is not a good idea, since it limits the pages we can scan. | 3023 | * here is not a good idea, since it limits the pages we can scan. |
2871 | * if we don't reclaim here, the shrink_zone from balance_pgdat | 3024 | * if we don't reclaim here, the shrink_node from balance_pgdat |
2872 | * will pick up pages from other mem cgroup's as well. We hack | 3025 | * will pick up pages from other mem cgroup's as well. We hack |
2873 | * the priority and make it zero. | 3026 | * the priority and make it zero. |
2874 | */ | 3027 | */ |
2875 | shrink_zone_memcg(zone, memcg, &sc, &lru_pages); | 3028 | shrink_node_memcg(pgdat, memcg, &sc, &lru_pages); |
2876 | 3029 | ||
2877 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); | 3030 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); |
2878 | 3031 | ||
@@ -2892,6 +3045,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
2892 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), | 3045 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), |
2893 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 3046 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2894 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), | 3047 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), |
3048 | .reclaim_idx = MAX_NR_ZONES - 1, | ||
2895 | .target_mem_cgroup = memcg, | 3049 | .target_mem_cgroup = memcg, |
2896 | .priority = DEF_PRIORITY, | 3050 | .priority = DEF_PRIORITY, |
2897 | .may_writepage = !laptop_mode, | 3051 | .may_writepage = !laptop_mode, |
@@ -2910,7 +3064,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
2910 | 3064 | ||
2911 | trace_mm_vmscan_memcg_reclaim_begin(0, | 3065 | trace_mm_vmscan_memcg_reclaim_begin(0, |
2912 | sc.may_writepage, | 3066 | sc.may_writepage, |
2913 | sc.gfp_mask); | 3067 | sc.gfp_mask, |
3068 | sc.reclaim_idx); | ||
2914 | 3069 | ||
2915 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | 3070 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); |
2916 | 3071 | ||
@@ -2920,7 +3075,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
2920 | } | 3075 | } |
2921 | #endif | 3076 | #endif |
2922 | 3077 | ||
2923 | static void age_active_anon(struct zone *zone, struct scan_control *sc) | 3078 | static void age_active_anon(struct pglist_data *pgdat, |
3079 | struct scan_control *sc) | ||
2924 | { | 3080 | { |
2925 | struct mem_cgroup *memcg; | 3081 | struct mem_cgroup *memcg; |
2926 | 3082 | ||
@@ -2929,9 +3085,9 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) | |||
2929 | 3085 | ||
2930 | memcg = mem_cgroup_iter(NULL, NULL, NULL); | 3086 | memcg = mem_cgroup_iter(NULL, NULL, NULL); |
2931 | do { | 3087 | do { |
2932 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 3088 | struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); |
2933 | 3089 | ||
2934 | if (inactive_list_is_low(lruvec, false)) | 3090 | if (inactive_list_is_low(lruvec, false, sc)) |
2935 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, | 3091 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, |
2936 | sc, LRU_ACTIVE_ANON); | 3092 | sc, LRU_ACTIVE_ANON); |
2937 | 3093 | ||
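age_active_anon() now walks the memcg hierarchy for the whole node and rotates anon pages wherever the inactive list looks too small next to the active one. The sketch below uses a bare inactive < active comparison as a stand-in; the kernel's inactive_list_is_low() heuristic is more involved than this:

    #include <stdbool.h>
    #include <stdio.h>

    struct model_lruvec {
        const char *memcg_name;
        unsigned long active_anon;
        unsigned long inactive_anon;
    };

    /* Simplified: the inactive list is "low" when smaller than the active list. */
    static bool inactive_is_low(const struct model_lruvec *lruvec)
    {
        return lruvec->inactive_anon < lruvec->active_anon;
    }

    int main(void)
    {
        struct model_lruvec cgroups[] = {
            { "root",     4000, 1000 },
            { "workload",  500, 2000 },
        };

        for (unsigned int i = 0; i < sizeof(cgroups) / sizeof(cgroups[0]); i++)
            if (inactive_is_low(&cgroups[i]))
                printf("age the anon LRU of %s\n", cgroups[i].memcg_name);
        return 0;
    }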
@@ -2939,82 +3095,21 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) | |||
2939 | } while (memcg); | 3095 | } while (memcg); |
2940 | } | 3096 | } |
2941 | 3097 | ||
2942 | static bool zone_balanced(struct zone *zone, int order, bool highorder, | 3098 | static bool zone_balanced(struct zone *zone, int order, int classzone_idx) |
2943 | unsigned long balance_gap, int classzone_idx) | ||
2944 | { | 3099 | { |
2945 | unsigned long mark = high_wmark_pages(zone) + balance_gap; | 3100 | unsigned long mark = high_wmark_pages(zone); |
3101 | |||
3102 | if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx)) | ||
3103 | return false; | ||
2946 | 3104 | ||
2947 | /* | 3105 | /* |
2948 | * When checking from pgdat_balanced(), kswapd should stop and sleep | 3106 | * If any eligible zone is balanced then the node is not considered |
2949 | * when it reaches the high order-0 watermark and let kcompactd take | 3107 | * to be congested or dirty |
2950 | * over. Other callers such as wakeup_kswapd() want to determine the | ||
2951 | * true high-order watermark. | ||
2952 | */ | 3108 | */ |
2953 | if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) { | 3109 | clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); |
2954 | mark += (1UL << order); | 3110 | clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); |
2955 | order = 0; | ||
2956 | } | ||
2957 | |||
2958 | return zone_watermark_ok_safe(zone, order, mark, classzone_idx); | ||
2959 | } | ||
2960 | |||
2961 | /* | ||
2962 | * pgdat_balanced() is used when checking if a node is balanced. | ||
2963 | * | ||
2964 | * For order-0, all zones must be balanced! | ||
2965 | * | ||
2966 | * For high-order allocations only zones that meet watermarks and are in a | ||
2967 | * zone allowed by the callers classzone_idx are added to balanced_pages. The | ||
2968 | * total of balanced pages must be at least 25% of the zones allowed by | ||
2969 | * classzone_idx for the node to be considered balanced. Forcing all zones to | ||
2970 | * be balanced for high orders can cause excessive reclaim when there are | ||
2971 | * imbalanced zones. | ||
2972 | * The choice of 25% is due to | ||
2973 | * o a 16M DMA zone that is balanced will not balance a zone on any | ||
2974 | * reasonable sized machine | ||
2975 | * o On all other machines, the top zone must be at least a reasonable | ||
2976 | * percentage of the middle zones. For example, on 32-bit x86, highmem | ||
2977 | * would need to be at least 256M for it to be balance a whole node. | ||
2978 | * Similarly, on x86-64 the Normal zone would need to be at least 1G | ||
2979 | * to balance a node on its own. These seemed like reasonable ratios. | ||
2980 | */ | ||
2981 | static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) | ||
2982 | { | ||
2983 | unsigned long managed_pages = 0; | ||
2984 | unsigned long balanced_pages = 0; | ||
2985 | int i; | ||
2986 | |||
2987 | /* Check the watermark levels */ | ||
2988 | for (i = 0; i <= classzone_idx; i++) { | ||
2989 | struct zone *zone = pgdat->node_zones + i; | ||
2990 | |||
2991 | if (!populated_zone(zone)) | ||
2992 | continue; | ||
2993 | |||
2994 | managed_pages += zone->managed_pages; | ||
2995 | |||
2996 | /* | ||
2997 | * A special case here: | ||
2998 | * | ||
2999 | * balance_pgdat() skips over all_unreclaimable after | ||
3000 | * DEF_PRIORITY. Effectively, it considers them balanced so | ||
3001 | * they must be considered balanced here as well! | ||
3002 | */ | ||
3003 | if (!zone_reclaimable(zone)) { | ||
3004 | balanced_pages += zone->managed_pages; | ||
3005 | continue; | ||
3006 | } | ||
3007 | 3111 | ||
3008 | if (zone_balanced(zone, order, false, 0, i)) | 3112 | return true; |
3009 | balanced_pages += zone->managed_pages; | ||
3010 | else if (!order) | ||
3011 | return false; | ||
3012 | } | ||
3013 | |||
3014 | if (order) | ||
3015 | return balanced_pages >= (managed_pages >> 2); | ||
3016 | else | ||
3017 | return true; | ||
3018 | } | 3113 | } |
3019 | 3114 | ||
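zone_balanced() above reduces to a plain high-watermark check and, when it passes, clears the node-wide congested and dirty bits that replace the old per-zone flags. A compact model of that behaviour, with invented flag values and watermark numbers:

    #include <stdbool.h>
    #include <stdio.h>

    #define MODEL_PGDAT_CONGESTED (1u << 0)
    #define MODEL_PGDAT_DIRTY     (1u << 1)

    struct model_node { unsigned int flags; };

    struct model_zone {
        struct model_node *node;
        unsigned long free_pages;
        unsigned long high_wmark;
    };

    /* Balanced means the plain high watermark is met; no balance gap any more. */
    static bool zone_balanced(struct model_zone *zone)
    {
        if (zone->free_pages < zone->high_wmark)
            return false;

        /* One balanced eligible zone clears the node's congested/dirty state. */
        zone->node->flags &= ~(MODEL_PGDAT_CONGESTED | MODEL_PGDAT_DIRTY);
        return true;
    }

    int main(void)
    {
        struct model_node node = { MODEL_PGDAT_CONGESTED | MODEL_PGDAT_DIRTY };
        struct model_zone normal = { &node, 90000, 60000 };

        printf("balanced=%d node flags=%#x\n", zone_balanced(&normal), node.flags);
        return 0;
    }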
3020 | /* | 3115 | /* |
@@ -3023,12 +3118,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) | |||
3023 | * | 3118 | * |
3024 | * Returns true if kswapd is ready to sleep | 3119 | * Returns true if kswapd is ready to sleep |
3025 | */ | 3120 | */ |
3026 | static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | 3121 | static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) |
3027 | int classzone_idx) | ||
3028 | { | 3122 | { |
3029 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | 3123 | int i; |
3030 | if (remaining) | ||
3031 | return false; | ||
3032 | 3124 | ||
3033 | /* | 3125 | /* |
3034 | * The throttled processes are normally woken up in balance_pgdat() as | 3126 | * The throttled processes are normally woken up in balance_pgdat() as |
@@ -3046,91 +3138,81 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |||
3046 | if (waitqueue_active(&pgdat->pfmemalloc_wait)) | 3138 | if (waitqueue_active(&pgdat->pfmemalloc_wait)) |
3047 | wake_up_all(&pgdat->pfmemalloc_wait); | 3139 | wake_up_all(&pgdat->pfmemalloc_wait); |
3048 | 3140 | ||
3049 | return pgdat_balanced(pgdat, order, classzone_idx); | 3141 | for (i = 0; i <= classzone_idx; i++) { |
3142 | struct zone *zone = pgdat->node_zones + i; | ||
3143 | |||
3144 | if (!populated_zone(zone)) | ||
3145 | continue; | ||
3146 | |||
3147 | if (!zone_balanced(zone, order, classzone_idx)) | ||
3148 | return false; | ||
3149 | } | ||
3150 | |||
3151 | return true; | ||
3050 | } | 3152 | } |
3051 | 3153 | ||
3052 | /* | 3154 | /* |
3053 | * kswapd shrinks the zone by the number of pages required to reach | 3155 | * kswapd shrinks a node of pages that are at or below the highest usable |
3054 | * the high watermark. | 3156 | * zone that is currently unbalanced. |
3055 | * | 3157 | * |
3056 | * Returns true if kswapd scanned at least the requested number of pages to | 3158 | * Returns true if kswapd scanned at least the requested number of pages to |
3057 | * reclaim or if the lack of progress was due to pages under writeback. | 3159 | * reclaim or if the lack of progress was due to pages under writeback. |
3058 | * This is used to determine if the scanning priority needs to be raised. | 3160 | * This is used to determine if the scanning priority needs to be raised. |
3059 | */ | 3161 | */ |
3060 | static bool kswapd_shrink_zone(struct zone *zone, | 3162 | static bool kswapd_shrink_node(pg_data_t *pgdat, |
3061 | int classzone_idx, | ||
3062 | struct scan_control *sc) | 3163 | struct scan_control *sc) |
3063 | { | 3164 | { |
3064 | unsigned long balance_gap; | 3165 | struct zone *zone; |
3065 | bool lowmem_pressure; | 3166 | int z; |
3066 | 3167 | ||
3067 | /* Reclaim above the high watermark. */ | 3168 | /* Reclaim a number of pages proportional to the number of zones */ |
3068 | sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); | 3169 | sc->nr_to_reclaim = 0; |
3170 | for (z = 0; z <= sc->reclaim_idx; z++) { | ||
3171 | zone = pgdat->node_zones + z; | ||
3172 | if (!populated_zone(zone)) | ||
3173 | continue; | ||
3069 | 3174 | ||
3070 | /* | 3175 | sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); |
3071 | * We put equal pressure on every zone, unless one zone has way too | 3176 | } |
3072 | * many pages free already. The "too many pages" is defined as the | ||
3073 | * high wmark plus a "gap" where the gap is either the low | ||
3074 | * watermark or 1% of the zone, whichever is smaller. | ||
3075 | */ | ||
3076 | balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( | ||
3077 | zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); | ||
3078 | 3177 | ||
3079 | /* | 3178 | /* |
3080 | * If there is no low memory pressure or the zone is balanced then no | 3179 | * Historically care was taken to put equal pressure on all zones but |
3081 | * reclaim is necessary | 3180 | * now pressure is applied based on node LRU order. |
3082 | */ | 3181 | */ |
3083 | lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); | 3182 | shrink_node(pgdat, sc); |
3084 | if (!lowmem_pressure && zone_balanced(zone, sc->order, false, | ||
3085 | balance_gap, classzone_idx)) | ||
3086 | return true; | ||
3087 | |||
3088 | shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); | ||
3089 | |||
3090 | clear_bit(ZONE_WRITEBACK, &zone->flags); | ||
3091 | 3183 | ||
3092 | /* | 3184 | /* |
3093 | * If a zone reaches its high watermark, consider it to be no longer | 3185 | * Fragmentation may mean that the system cannot be rebalanced for |
3094 | * congested. It's possible there are dirty pages backed by congested | 3186 | * high-order allocations. If twice the allocation size has been |
3095 | * BDIs but as pressure is relieved, speculatively avoid congestion | 3187 | * reclaimed then recheck watermarks only at order-0 to prevent |
3096 | * waits. | 3188 | * excessive reclaim. Assume that a process requesting a high-order |
3189 | * allocation can direct reclaim/compact. ||
3097 | */ | 3190 | */ |
3098 | if (zone_reclaimable(zone) && | 3191 | if (sc->order && sc->nr_reclaimed >= 2UL << sc->order) |
3099 | zone_balanced(zone, sc->order, false, 0, classzone_idx)) { | 3192 | sc->order = 0; |
3100 | clear_bit(ZONE_CONGESTED, &zone->flags); | ||
3101 | clear_bit(ZONE_DIRTY, &zone->flags); | ||
3102 | } | ||
3103 | 3193 | ||
3104 | return sc->nr_scanned >= sc->nr_to_reclaim; | 3194 | return sc->nr_scanned >= sc->nr_to_reclaim; |
3105 | } | 3195 | } |
3106 | 3196 | ||
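kswapd_shrink_node() above sets one node-wide target by summing the eligible zones' high watermarks (never less than SWAP_CLUSTER_MAX, which is 32 pages in kernels of this era) and drops back to order-0 once roughly twice the requested allocation size has been reclaimed. A small sketch of both calculations with invented zone sizes:

    #include <stdio.h>

    #define MODEL_SWAP_CLUSTER_MAX 32UL

    struct model_zone { int populated; unsigned long high_wmark; };

    int main(void)
    {
        struct model_zone zones[] = { { 1, 256 }, { 1, 12000 }, { 0, 0 } };
        int reclaim_idx = 2;
        unsigned long nr_to_reclaim = 0;

        /* Target scales with the watermarks of every eligible, populated zone. */
        for (int z = 0; z <= reclaim_idx; z++) {
            unsigned long want;

            if (!zones[z].populated)
                continue;
            want = zones[z].high_wmark;
            if (want < MODEL_SWAP_CLUSTER_MAX)
                want = MODEL_SWAP_CLUSTER_MAX;
            nr_to_reclaim += want;
        }
        printf("nr_to_reclaim = %lu\n", nr_to_reclaim);

        /* High-order fallback: after 2 << order pages, recheck at order-0 only. */
        unsigned int order = 3;
        unsigned long nr_reclaimed = 20;

        if (order && nr_reclaimed >= (2UL << order))
            order = 0;
        printf("order after fallback check = %u\n", order);
        return 0;
    }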
3107 | /* | 3197 | /* |
3108 | * For kswapd, balance_pgdat() will work across all this node's zones until | 3198 | * For kswapd, balance_pgdat() will reclaim pages across a node from zones |
3109 | * they are all at high_wmark_pages(zone). | 3199 | * that are eligible for use by the caller until at least one zone is |
3200 | * balanced. | ||
3110 | * | 3201 | * |
3111 | * Returns the highest zone idx kswapd was reclaiming at | 3202 | * Returns the order kswapd finished reclaiming at. |
3112 | * | ||
3113 | * There is special handling here for zones which are full of pinned pages. | ||
3114 | * This can happen if the pages are all mlocked, or if they are all used by | ||
3115 | * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. | ||
3116 | * What we do is to detect the case where all pages in the zone have been | ||
3117 | * scanned twice and there has been zero successful reclaim. Mark the zone as | ||
3118 | * dead and from now on, only perform a short scan. Basically we're polling | ||
3119 | * the zone for when the problem goes away. | ||
3120 | * | 3203 | * |
3121 | * kswapd scans the zones in the highmem->normal->dma direction. It skips | 3204 | * kswapd scans the zones in the highmem->normal->dma direction. It skips |
3122 | * zones which have free_pages > high_wmark_pages(zone), but once a zone is | 3205 | * zones which have free_pages > high_wmark_pages(zone), but once a zone is |
3123 | * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the | 3206 | * found to have free_pages <= high_wmark_pages(zone), any page in that zone |
3124 | * lower zones regardless of the number of free pages in the lower zones. This | 3207 | * or lower is eligible for reclaim until at least one usable zone is |
3125 | * interoperates with the page allocator fallback scheme to ensure that aging | 3208 | * balanced. |
3126 | * of pages is balanced across the zones. | ||
3127 | */ | 3209 | */ |
3128 | static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | 3210 | static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) |
3129 | { | 3211 | { |
3130 | int i; | 3212 | int i; |
3131 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | ||
3132 | unsigned long nr_soft_reclaimed; | 3213 | unsigned long nr_soft_reclaimed; |
3133 | unsigned long nr_soft_scanned; | 3214 | unsigned long nr_soft_scanned; |
3215 | struct zone *zone; | ||
3134 | struct scan_control sc = { | 3216 | struct scan_control sc = { |
3135 | .gfp_mask = GFP_KERNEL, | 3217 | .gfp_mask = GFP_KERNEL, |
3136 | .order = order, | 3218 | .order = order, |
@@ -3145,100 +3227,77 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3145 | bool raise_priority = true; | 3227 | bool raise_priority = true; |
3146 | 3228 | ||
3147 | sc.nr_reclaimed = 0; | 3229 | sc.nr_reclaimed = 0; |
3230 | sc.reclaim_idx = classzone_idx; | ||
3148 | 3231 | ||
3149 | /* | 3232 | /* |
3150 | * Scan in the highmem->dma direction for the highest | 3233 | * If the number of buffer_heads exceeds the maximum allowed |
3151 | * zone which needs scanning | 3234 | * then consider reclaiming from all zones. This has a dual |
3235 | * purpose -- on 64-bit systems it is expected that | ||
3236 | * buffer_heads are stripped during active rotation. On 32-bit | ||
3237 | * systems, highmem pages can pin lowmem memory and shrinking | ||
3238 | * buffers can relieve lowmem pressure. Reclaim may still not | ||
3239 | * go ahead if all eligible zones for the original allocation | ||
3240 | * request are balanced to avoid excessive reclaim from kswapd. | ||
3152 | */ | 3241 | */ |
3153 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { | 3242 | if (buffer_heads_over_limit) { |
3154 | struct zone *zone = pgdat->node_zones + i; | 3243 | for (i = MAX_NR_ZONES - 1; i >= 0; i--) { |
3155 | 3244 | zone = pgdat->node_zones + i; | |
3156 | if (!populated_zone(zone)) | 3245 | if (!populated_zone(zone)) |
3157 | continue; | 3246 | continue; |
3158 | |||
3159 | if (sc.priority != DEF_PRIORITY && | ||
3160 | !zone_reclaimable(zone)) | ||
3161 | continue; | ||
3162 | |||
3163 | /* | ||
3164 | * Do some background aging of the anon list, to give | ||
3165 | * pages a chance to be referenced before reclaiming. | ||
3166 | */ | ||
3167 | age_active_anon(zone, &sc); | ||
3168 | 3247 | ||
3169 | /* | 3248 | sc.reclaim_idx = i; |
3170 | * If the number of buffer_heads in the machine | ||
3171 | * exceeds the maximum allowed level and this node | ||
3172 | * has a highmem zone, force kswapd to reclaim from | ||
3173 | * it to relieve lowmem pressure. | ||
3174 | */ | ||
3175 | if (buffer_heads_over_limit && is_highmem_idx(i)) { | ||
3176 | end_zone = i; | ||
3177 | break; | 3249 | break; |
3178 | } | 3250 | } |
3251 | } | ||
3179 | 3252 | ||
3180 | if (!zone_balanced(zone, order, false, 0, 0)) { | 3253 | /* |
3181 | end_zone = i; | 3254 | * Only reclaim if there are no eligible zones. Check from |
3182 | break; | 3255 | * high to low zone as allocations prefer higher zones. |
3183 | } else { | 3256 | * Scanning from low to high zone would allow congestion to be |
3184 | /* | 3257 | * cleared during a very small window when a small low |
3185 | * If balanced, clear the dirty and congested | 3258 | * zone was balanced even under extreme pressure when the |
3186 | * flags | 3259 | * overall node may be congested. Note that sc.reclaim_idx |
3187 | */ | 3260 | * is not used as buffer_heads_over_limit may have adjusted |
3188 | clear_bit(ZONE_CONGESTED, &zone->flags); | 3261 | * it. |
3189 | clear_bit(ZONE_DIRTY, &zone->flags); | 3262 | */ |
3190 | } | 3263 | for (i = classzone_idx; i >= 0; i--) { |
3264 | zone = pgdat->node_zones + i; | ||
3265 | if (!populated_zone(zone)) | ||
3266 | continue; | ||
3267 | |||
3268 | if (zone_balanced(zone, sc.order, classzone_idx)) | ||
3269 | goto out; | ||
3191 | } | 3270 | } |
3192 | 3271 | ||
3193 | if (i < 0) | 3272 | /* |
3194 | goto out; | 3273 | * Do some background aging of the anon list, to give |
3274 | * pages a chance to be referenced before reclaiming. All | ||
3275 | * pages are rotated regardless of classzone as this is | ||
3276 | * about consistent aging. | ||
3277 | */ | ||
3278 | age_active_anon(pgdat, &sc); | ||
3195 | 3279 | ||
3196 | /* | 3280 | /* |
3197 | * If we're getting trouble reclaiming, start doing writepage | 3281 | * If we're getting trouble reclaiming, start doing writepage |
3198 | * even in laptop mode. | 3282 | * even in laptop mode. |
3199 | */ | 3283 | */ |
3200 | if (sc.priority < DEF_PRIORITY - 2) | 3284 | if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat)) |
3201 | sc.may_writepage = 1; | 3285 | sc.may_writepage = 1; |
3202 | 3286 | ||
3287 | /* Call soft limit reclaim before calling shrink_node. */ | ||
3288 | sc.nr_scanned = 0; | ||
3289 | nr_soft_scanned = 0; | ||
3290 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order, | ||
3291 | sc.gfp_mask, &nr_soft_scanned); | ||
3292 | sc.nr_reclaimed += nr_soft_reclaimed; | ||
3293 | |||
3203 | /* | 3294 | /* |
3204 | * Now scan the zone in the dma->highmem direction, stopping | 3295 | * There should be no need to raise the scanning priority if |
3205 | * at the last zone which needs scanning. | 3296 | * enough pages are already being scanned that that high |
3206 | * | 3297 | * watermark would be met at 100% efficiency. |
3207 | * We do this because the page allocator works in the opposite | ||
3208 | * direction. This prevents the page allocator from allocating | ||
3209 | * pages behind kswapd's direction of progress, which would | ||
3210 | * cause too much scanning of the lower zones. | ||
3211 | */ | 3298 | */ |
3212 | for (i = 0; i <= end_zone; i++) { | 3299 | if (kswapd_shrink_node(pgdat, &sc)) |
3213 | struct zone *zone = pgdat->node_zones + i; | 3300 | raise_priority = false; |
3214 | |||
3215 | if (!populated_zone(zone)) | ||
3216 | continue; | ||
3217 | |||
3218 | if (sc.priority != DEF_PRIORITY && | ||
3219 | !zone_reclaimable(zone)) | ||
3220 | continue; | ||
3221 | |||
3222 | sc.nr_scanned = 0; | ||
3223 | |||
3224 | nr_soft_scanned = 0; | ||
3225 | /* | ||
3226 | * Call soft limit reclaim before calling shrink_zone. | ||
3227 | */ | ||
3228 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
3229 | order, sc.gfp_mask, | ||
3230 | &nr_soft_scanned); | ||
3231 | sc.nr_reclaimed += nr_soft_reclaimed; | ||
3232 | |||
3233 | /* | ||
3234 | * There should be no need to raise the scanning | ||
3235 | * priority if enough pages are already being scanned | ||
3236 | * that that high watermark would be met at 100% | ||
3237 | * efficiency. | ||
3238 | */ | ||
3239 | if (kswapd_shrink_zone(zone, end_zone, &sc)) | ||
3240 | raise_priority = false; | ||
3241 | } | ||
3242 | 3301 | ||
3243 | /* | 3302 | /* |
3244 | * If the low watermark is met there is no need for processes | 3303 | * If the low watermark is met there is no need for processes |
@@ -3259,19 +3318,20 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3259 | */ | 3318 | */ |
3260 | if (raise_priority || !sc.nr_reclaimed) | 3319 | if (raise_priority || !sc.nr_reclaimed) |
3261 | sc.priority--; | 3320 | sc.priority--; |
3262 | } while (sc.priority >= 1 && | 3321 | } while (sc.priority >= 1); |
3263 | !pgdat_balanced(pgdat, order, classzone_idx)); | ||
3264 | 3322 | ||
3265 | out: | 3323 | out: |
3266 | /* | 3324 | /* |
3267 | * Return the highest zone idx we were reclaiming at so | 3325 | * Return the order kswapd stopped reclaiming at as |
3268 | * prepare_kswapd_sleep() makes the same decisions as here. | 3326 | * prepare_kswapd_sleep() takes it into account. If another caller |
3327 | * entered the allocator slow path while kswapd was awake, order will | ||
3328 | * remain at the higher level. | ||
3269 | */ | 3329 | */ |
3270 | return end_zone; | 3330 | return sc.order; |
3271 | } | 3331 | } |
3272 | 3332 | ||
3273 | static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, | 3333 | static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, |
3274 | int classzone_idx, int balanced_classzone_idx) | 3334 | unsigned int classzone_idx) |
3275 | { | 3335 | { |
3276 | long remaining = 0; | 3336 | long remaining = 0; |
3277 | DEFINE_WAIT(wait); | 3337 | DEFINE_WAIT(wait); |
@@ -3282,8 +3342,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, | |||
3282 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 3342 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
3283 | 3343 | ||
3284 | /* Try to sleep for a short interval */ | 3344 | /* Try to sleep for a short interval */ |
3285 | if (prepare_kswapd_sleep(pgdat, order, remaining, | 3345 | if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { |
3286 | balanced_classzone_idx)) { | ||
3287 | /* | 3346 | /* |
3288 | * Compaction records what page blocks it recently failed to | 3347 | * Compaction records what page blocks it recently failed to |
3289 | * isolate pages from and skips them in the future scanning. | 3348 | * isolate pages from and skips them in the future scanning. |
@@ -3296,9 +3355,20 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, | |||
3296 | * We have freed the memory, now we should compact it to make | 3355 | * We have freed the memory, now we should compact it to make |
3297 | * allocation of the requested order possible. | 3356 | * allocation of the requested order possible. |
3298 | */ | 3357 | */ |
3299 | wakeup_kcompactd(pgdat, order, classzone_idx); | 3358 | wakeup_kcompactd(pgdat, alloc_order, classzone_idx); |
3300 | 3359 | ||
3301 | remaining = schedule_timeout(HZ/10); | 3360 | remaining = schedule_timeout(HZ/10); |
3361 | |||
3362 | /* | ||
3363 | * If woken prematurely then reset kswapd_classzone_idx and | ||
3364 | * order. The values will either be from a wakeup request or | ||
3365 | * the previous request that slept prematurely. | ||
3366 | */ | ||
3367 | if (remaining) { | ||
3368 | pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); | ||
3369 | pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); | ||
3370 | } | ||
3371 | |||
3302 | finish_wait(&pgdat->kswapd_wait, &wait); | 3372 | finish_wait(&pgdat->kswapd_wait, &wait); |
3303 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 3373 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
3304 | } | 3374 | } |
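When the short sleep above is cut short, the pending request is merged back with max() so kswapd keeps serving the largest order and the highest classzone index it has been asked for. Roughly, with a stand-in struct mirroring the two fields named in the diff:

    #include <stdio.h>

    struct model_pgdat {
        unsigned int kswapd_order;
        int kswapd_classzone_idx;
    };

    static unsigned int umax(unsigned int a, unsigned int b) { return a > b ? a : b; }
    static int imax(int a, int b) { return a > b ? a : b; }

    int main(void)
    {
        struct model_pgdat pgdat = { .kswapd_order = 4, .kswapd_classzone_idx = 1 };
        unsigned int reclaim_order = 2;  /* what this pass was reclaiming for */
        int classzone_idx = 2;           /* zone ceiling of this pass */

        /* Premature wakeup: keep whichever request is more demanding. */
        pgdat.kswapd_classzone_idx = imax(pgdat.kswapd_classzone_idx, classzone_idx);
        pgdat.kswapd_order = umax(pgdat.kswapd_order, reclaim_order);

        printf("pending order=%u classzone_idx=%d\n",
               pgdat.kswapd_order, pgdat.kswapd_classzone_idx);
        return 0;
    }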
@@ -3307,8 +3377,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, | |||
3307 | * After a short sleep, check if it was a premature sleep. If not, then | 3377 | * After a short sleep, check if it was a premature sleep. If not, then |
3308 | * go fully to sleep until explicitly woken up. | 3378 | * go fully to sleep until explicitly woken up. |
3309 | */ | 3379 | */ |
3310 | if (prepare_kswapd_sleep(pgdat, order, remaining, | 3380 | if (!remaining && |
3311 | balanced_classzone_idx)) { | 3381 | prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { |
3312 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | 3382 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); |
3313 | 3383 | ||
3314 | /* | 3384 | /* |
@@ -3349,9 +3419,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, | |||
3349 | */ | 3419 | */ |
3350 | static int kswapd(void *p) | 3420 | static int kswapd(void *p) |
3351 | { | 3421 | { |
3352 | unsigned long order, new_order; | 3422 | unsigned int alloc_order, reclaim_order, classzone_idx; |
3353 | int classzone_idx, new_classzone_idx; | ||
3354 | int balanced_classzone_idx; | ||
3355 | pg_data_t *pgdat = (pg_data_t*)p; | 3423 | pg_data_t *pgdat = (pg_data_t*)p; |
3356 | struct task_struct *tsk = current; | 3424 | struct task_struct *tsk = current; |
3357 | 3425 | ||
@@ -3381,38 +3449,20 @@ static int kswapd(void *p) | |||
3381 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; | 3449 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; |
3382 | set_freezable(); | 3450 | set_freezable(); |
3383 | 3451 | ||
3384 | order = new_order = 0; | 3452 | pgdat->kswapd_order = alloc_order = reclaim_order = 0; |
3385 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; | 3453 | pgdat->kswapd_classzone_idx = classzone_idx = 0; |
3386 | balanced_classzone_idx = classzone_idx; | ||
3387 | for ( ; ; ) { | 3454 | for ( ; ; ) { |
3388 | bool ret; | 3455 | bool ret; |
3389 | 3456 | ||
3390 | /* | 3457 | kswapd_try_sleep: |
3391 | * While we were reclaiming, there might have been another | 3458 | kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, |
3392 | * wakeup, so check the values. | 3459 | classzone_idx); |
3393 | */ | ||
3394 | new_order = pgdat->kswapd_max_order; | ||
3395 | new_classzone_idx = pgdat->classzone_idx; | ||
3396 | pgdat->kswapd_max_order = 0; | ||
3397 | pgdat->classzone_idx = pgdat->nr_zones - 1; | ||
3398 | 3460 | ||
3399 | if (order < new_order || classzone_idx > new_classzone_idx) { | 3461 | /* Read the new order and classzone_idx */ |
3400 | /* | 3462 | alloc_order = reclaim_order = pgdat->kswapd_order; |
3401 | * Don't sleep if someone wants a larger 'order' | 3463 | classzone_idx = pgdat->kswapd_classzone_idx; |
3402 | * allocation or has tighter zone constraints | ||
3403 | */ | 3465 | pgdat->kswapd_classzone_idx = 0; |
3404 | order = new_order; | ||
3405 | classzone_idx = new_classzone_idx; | ||
3406 | } else { | ||
3407 | kswapd_try_to_sleep(pgdat, order, classzone_idx, | ||
3408 | balanced_classzone_idx); | ||
3409 | order = pgdat->kswapd_max_order; | ||
3410 | classzone_idx = pgdat->classzone_idx; | ||
3411 | new_order = order; | ||
3412 | new_classzone_idx = classzone_idx; | ||
3413 | pgdat->kswapd_max_order = 0; | ||
3414 | pgdat->classzone_idx = pgdat->nr_zones - 1; | ||
3415 | } | ||
3416 | 3466 | ||
3417 | ret = try_to_freeze(); | 3467 | ret = try_to_freeze(); |
3418 | if (kthread_should_stop()) | 3468 | if (kthread_should_stop()) |
@@ -3422,11 +3472,25 @@ static int kswapd(void *p) | |||
3422 | * We can speed up thawing tasks if we don't call balance_pgdat | 3472 | * We can speed up thawing tasks if we don't call balance_pgdat |
3423 | * after returning from the refrigerator | 3473 | * after returning from the refrigerator |
3424 | */ | 3474 | */ |
3425 | if (!ret) { | 3475 | if (ret) |
3426 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); | 3476 | continue; |
3427 | balanced_classzone_idx = balance_pgdat(pgdat, order, | 3477 | |
3428 | classzone_idx); | 3478 | /* |
3429 | } | 3479 | * Reclaim begins at the requested order but if a high-order |
3480 | * reclaim fails then kswapd falls back to reclaiming for | ||
3481 | * order-0. If that happens, kswapd will consider sleeping | ||
3482 | * for the order it finished reclaiming at (reclaim_order) | ||
3483 | * but kcompactd is woken to compact for the original | ||
3484 | * request (alloc_order). | ||
3485 | */ | ||
3486 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, | ||
3487 | alloc_order); | ||
3488 | reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); | ||
3489 | if (reclaim_order < alloc_order) | ||
3490 | goto kswapd_try_sleep; | ||
3491 | |||
3492 | alloc_order = reclaim_order = pgdat->kswapd_order; | ||
3493 | classzone_idx = pgdat->kswapd_classzone_idx; | ||
3430 | } | 3494 | } |
3431 | 3495 | ||
3432 | tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); | 3496 | tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); |
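The rewritten main loop reads one (order, classzone_idx) request per wakeup; if the high-order attempt fell back to order-0, it jumps straight back to the sleep check for the order it actually achieved, while kcompactd is still woken for the original request. A condensed sketch of that decision, with the balancing step faked:

    #include <stdio.h>

    int main(void)
    {
        unsigned int alloc_order = 4;  /* what the waker asked for */
        unsigned int reclaim_order;

        /* Pretend balance_pgdat() gave up on the high order and finished at 0. */
        reclaim_order = 0;

        if (reclaim_order < alloc_order)
            printf("retry sleep check at order %u; kcompactd woken for order %u\n",
                   reclaim_order, alloc_order);
        else
            printf("read the next (order, classzone_idx) request\n");
        return 0;
    }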
@@ -3442,6 +3506,7 @@ static int kswapd(void *p) | |||
3442 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) | 3506 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) |
3443 | { | 3507 | { |
3444 | pg_data_t *pgdat; | 3508 | pg_data_t *pgdat; |
3509 | int z; | ||
3445 | 3510 | ||
3446 | if (!populated_zone(zone)) | 3511 | if (!populated_zone(zone)) |
3447 | return; | 3512 | return; |
@@ -3449,14 +3514,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) | |||
3449 | if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) | 3514 | if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) |
3450 | return; | 3515 | return; |
3451 | pgdat = zone->zone_pgdat; | 3516 | pgdat = zone->zone_pgdat; |
3452 | if (pgdat->kswapd_max_order < order) { | 3517 | pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); |
3453 | pgdat->kswapd_max_order = order; | 3518 | pgdat->kswapd_order = max(pgdat->kswapd_order, order); |
3454 | pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); | ||
3455 | } | ||
3456 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 3519 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
3457 | return; | 3520 | return; |
3458 | if (zone_balanced(zone, order, true, 0, 0)) | 3521 | |
3459 | return; | 3522 | /* Only wake kswapd if all zones are unbalanced */ |
3523 | for (z = 0; z <= classzone_idx; z++) { | ||
3524 | zone = pgdat->node_zones + z; | ||
3525 | if (!populated_zone(zone)) | ||
3526 | continue; | ||
3527 | |||
3528 | if (zone_balanced(zone, order, classzone_idx)) | ||
3529 | return; | ||
3530 | } | ||
3460 | 3531 | ||
3461 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | 3532 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); |
3462 | wake_up_interruptible(&pgdat->kswapd_wait); | 3533 | wake_up_interruptible(&pgdat->kswapd_wait); |
@@ -3477,6 +3548,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
3477 | struct scan_control sc = { | 3548 | struct scan_control sc = { |
3478 | .nr_to_reclaim = nr_to_reclaim, | 3549 | .nr_to_reclaim = nr_to_reclaim, |
3479 | .gfp_mask = GFP_HIGHUSER_MOVABLE, | 3550 | .gfp_mask = GFP_HIGHUSER_MOVABLE, |
3551 | .reclaim_idx = MAX_NR_ZONES - 1, | ||
3480 | .priority = DEF_PRIORITY, | 3552 | .priority = DEF_PRIORITY, |
3481 | .may_writepage = 1, | 3553 | .may_writepage = 1, |
3482 | .may_unmap = 1, | 3554 | .may_unmap = 1, |
@@ -3578,12 +3650,12 @@ module_init(kswapd_init) | |||
3578 | 3650 | ||
3579 | #ifdef CONFIG_NUMA | 3651 | #ifdef CONFIG_NUMA |
3580 | /* | 3652 | /* |
3581 | * Zone reclaim mode | 3653 | * Node reclaim mode |
3582 | * | 3654 | * |
3583 | * If non-zero call zone_reclaim when the number of free pages falls below | 3655 | * If non-zero call node_reclaim when the number of free pages falls below |
3584 | * the watermarks. | 3656 | * the watermarks. |
3585 | */ | 3657 | */ |
3586 | int zone_reclaim_mode __read_mostly; | 3658 | int node_reclaim_mode __read_mostly; |
3587 | 3659 | ||
3588 | #define RECLAIM_OFF 0 | 3660 | #define RECLAIM_OFF 0 |
3589 | #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ | 3661 | #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ |
@@ -3591,14 +3663,14 @@ int zone_reclaim_mode __read_mostly; | |||
3591 | #define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ | 3663 | #define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ |
3592 | 3664 | ||
3593 | /* | 3665 | /* |
3594 | * Priority for ZONE_RECLAIM. This determines the fraction of pages | 3666 | * Priority for NODE_RECLAIM. This determines the fraction of pages |
3595 | * of a node considered for each zone_reclaim. 4 scans 1/16th of | 3667 | * of a node considered for each node_reclaim. 4 scans 1/16th of |
3596 | * a zone. | 3668 | * a zone. |
3597 | */ | 3669 | */ |
3598 | #define ZONE_RECLAIM_PRIORITY 4 | 3670 | #define NODE_RECLAIM_PRIORITY 4 |
3599 | 3671 | ||
3600 | /* | 3672 | /* |
3601 | * Percentage of pages in a zone that must be unmapped for zone_reclaim to | 3673 | * Percentage of pages in a zone that must be unmapped for node_reclaim to |
3602 | * occur. | 3674 | * occur. |
3603 | */ | 3675 | */ |
3604 | int sysctl_min_unmapped_ratio = 1; | 3676 | int sysctl_min_unmapped_ratio = 1; |
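NODE_RECLAIM_PRIORITY feeds the usual scan-fraction rule, where a pass at priority p considers roughly lru_size >> p pages, so a value of 4 looks at about 1/16th of the node's LRU, as the comment above says. A quick worked example with a hypothetical LRU size:

    #include <stdio.h>

    int main(void)
    {
        unsigned long lru_size = 1UL << 20;  /* hypothetical: 1M pages on the LRU */
        int priority = 4;                    /* NODE_RECLAIM_PRIORITY */

        /* The scan window halves with every priority level: size >> priority. */
        printf("pages considered per pass: %lu (1/%lu of the LRU)\n",
               lru_size >> priority, 1UL << priority);
        return 0;
    }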
@@ -3609,11 +3681,11 @@ int sysctl_min_unmapped_ratio = 1; | |||
3609 | */ | 3681 | */ |
3610 | int sysctl_min_slab_ratio = 5; | 3682 | int sysctl_min_slab_ratio = 5; |
3611 | 3683 | ||
3612 | static inline unsigned long zone_unmapped_file_pages(struct zone *zone) | 3684 | static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat) |
3613 | { | 3685 | { |
3614 | unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); | 3686 | unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED); |
3615 | unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + | 3687 | unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) + |
3616 | zone_page_state(zone, NR_ACTIVE_FILE); | 3688 | node_page_state(pgdat, NR_ACTIVE_FILE); |
3617 | 3689 | ||
3618 | /* | 3690 | /* |
3619 | * It's possible for there to be more file mapped pages than | 3691 | * It's possible for there to be more file mapped pages than |
@@ -3624,7 +3696,7 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone) | |||
3624 | } | 3696 | } |
3625 | 3697 | ||
3626 | /* Work out how many page cache pages we can reclaim in this reclaim_mode */ | 3698 | /* Work out how many page cache pages we can reclaim in this reclaim_mode */ |
3627 | static unsigned long zone_pagecache_reclaimable(struct zone *zone) | 3699 | static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) |
3628 | { | 3700 | { |
3629 | unsigned long nr_pagecache_reclaimable; | 3701 | unsigned long nr_pagecache_reclaimable; |
3630 | unsigned long delta = 0; | 3702 | unsigned long delta = 0; |
@@ -3632,17 +3704,17 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone) | |||
3632 | /* | 3704 | /* |
3633 | * If RECLAIM_UNMAP is set, then all file pages are considered | 3705 | * If RECLAIM_UNMAP is set, then all file pages are considered |
3634 | * potentially reclaimable. Otherwise, we have to worry about | 3706 | * potentially reclaimable. Otherwise, we have to worry about |
3635 | * pages like swapcache and zone_unmapped_file_pages() provides | 3707 | * pages like swapcache and node_unmapped_file_pages() provides |
3636 | * a better estimate | 3708 | * a better estimate |
3637 | */ | 3709 | */ |
3638 | if (zone_reclaim_mode & RECLAIM_UNMAP) | 3710 | if (node_reclaim_mode & RECLAIM_UNMAP) |
3639 | nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); | 3711 | nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES); |
3640 | else | 3712 | else |
3641 | nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); | 3713 | nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); |
3642 | 3714 | ||
3643 | /* If we can't clean pages, remove dirty pages from consideration */ | 3715 | /* If we can't clean pages, remove dirty pages from consideration */ |
3644 | if (!(zone_reclaim_mode & RECLAIM_WRITE)) | 3716 | if (!(node_reclaim_mode & RECLAIM_WRITE)) |
3645 | delta += zone_page_state(zone, NR_FILE_DIRTY); | 3717 | delta += node_page_state(pgdat, NR_FILE_DIRTY); |
3646 | 3718 | ||
3647 | /* Watch for any possible underflows due to delta */ | 3719 | /* Watch for any possible underflows due to delta */ |
3648 | if (unlikely(delta > nr_pagecache_reclaimable)) | 3720 | if (unlikely(delta > nr_pagecache_reclaimable)) |
@@ -3652,22 +3724,24 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone) | |||
3652 | } | 3724 | } |
3653 | 3725 | ||
3654 | /* | 3726 | /* |
3655 | * Try to free up some pages from this zone through reclaim. | 3727 | * Try to free up some pages from this node through reclaim. |
3656 | */ | 3728 | */ |
3657 | static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 3729 | static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) |
3658 | { | 3730 | { |
3659 | /* Minimum pages needed in order to stay on node */ | 3731 | /* Minimum pages needed in order to stay on node */ |
3660 | const unsigned long nr_pages = 1 << order; | 3732 | const unsigned long nr_pages = 1 << order; |
3661 | struct task_struct *p = current; | 3733 | struct task_struct *p = current; |
3662 | struct reclaim_state reclaim_state; | 3734 | struct reclaim_state reclaim_state; |
3735 | int classzone_idx = gfp_zone(gfp_mask); | ||
3663 | struct scan_control sc = { | 3736 | struct scan_control sc = { |
3664 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), | 3737 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), |
3665 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), | 3738 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), |
3666 | .order = order, | 3739 | .order = order, |
3667 | .priority = ZONE_RECLAIM_PRIORITY, | 3740 | .priority = NODE_RECLAIM_PRIORITY, |
3668 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 3741 | .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), |
3669 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP), | 3742 | .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), |
3670 | .may_swap = 1, | 3743 | .may_swap = 1, |
3744 | .reclaim_idx = classzone_idx, | ||
3671 | }; | 3745 | }; |
3672 | 3746 | ||
3673 | cond_resched(); | 3747 | cond_resched(); |
@@ -3681,13 +3755,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3681 | reclaim_state.reclaimed_slab = 0; | 3755 | reclaim_state.reclaimed_slab = 0; |
3682 | p->reclaim_state = &reclaim_state; | 3756 | p->reclaim_state = &reclaim_state; |
3683 | 3757 | ||
3684 | if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { | 3758 | if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { |
3685 | /* | 3759 | /* |
3686 | * Free memory by calling shrink zone with increasing | 3760 | * Free memory by calling shrink zone with increasing |
3687 | * priorities until we have enough memory freed. | 3761 | * priorities until we have enough memory freed. |
3688 | */ | 3762 | */ |
3689 | do { | 3763 | do { |
3690 | shrink_zone(zone, &sc, true); | 3764 | shrink_node(pgdat, &sc); |
3691 | } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); | 3765 | } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); |
3692 | } | 3766 | } |
3693 | 3767 | ||
@@ -3697,49 +3771,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3697 | return sc.nr_reclaimed >= nr_pages; | 3771 | return sc.nr_reclaimed >= nr_pages; |
3698 | } | 3772 | } |
3699 | 3773 | ||
3700 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 3774 | int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) |
3701 | { | 3775 | { |
3702 | int node_id; | ||
3703 | int ret; | 3776 | int ret; |
3704 | 3777 | ||
3705 | /* | 3778 | /* |
3706 | * Zone reclaim reclaims unmapped file backed pages and | 3779 | * Node reclaim reclaims unmapped file backed pages and |
3707 | * slab pages if we are over the defined limits. | 3780 | * slab pages if we are over the defined limits. |
3708 | * | 3781 | * |
3709 | * A small portion of unmapped file backed pages is needed for | 3782 | * A small portion of unmapped file backed pages is needed for |
3710 | * file I/O otherwise pages read by file I/O will be immediately | 3783 | * file I/O otherwise pages read by file I/O will be immediately |
3711 | * thrown out if the zone is overallocated. So we do not reclaim | 3784 | * thrown out if the node is overallocated. So we do not reclaim |
3712 | * if less than a specified percentage of the zone is used by | 3785 | * if less than a specified percentage of the node is used by |
3713 | * unmapped file backed pages. | 3786 | * unmapped file backed pages. |
3714 | */ | 3787 | */ |
3715 | if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && | 3788 | if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && |
3716 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) | 3789 | sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) |
3717 | return ZONE_RECLAIM_FULL; | 3790 | return NODE_RECLAIM_FULL; |
3718 | 3791 | ||
3719 | if (!zone_reclaimable(zone)) | 3792 | if (!pgdat_reclaimable(pgdat)) |
3720 | return ZONE_RECLAIM_FULL; | 3793 | return NODE_RECLAIM_FULL; |
3721 | 3794 | ||
3722 | /* | 3795 | /* |
3723 | * Do not scan if the allocation should not be delayed. | 3796 | * Do not scan if the allocation should not be delayed. |
3724 | */ | 3797 | */ |
3725 | if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) | 3798 | if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) |
3726 | return ZONE_RECLAIM_NOSCAN; | 3799 | return NODE_RECLAIM_NOSCAN; |
3727 | 3800 | ||
3728 | /* | 3801 | /* |
3729 | * Only run zone reclaim on the local zone or on zones that do not | 3802 | * Only run node reclaim on the local node or on nodes that do not |
3730 | * have associated processors. This will favor the local processor | 3803 | * have associated processors. This will favor the local processor |
3731 | * over remote processors and spread off node memory allocations | 3804 | * over remote processors and spread off node memory allocations |
3732 | * as wide as possible. | 3805 | * as wide as possible. |
3733 | */ | 3806 | */ |
3734 | node_id = zone_to_nid(zone); | 3807 | if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) |
3735 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) | 3808 | return NODE_RECLAIM_NOSCAN; |
3736 | return ZONE_RECLAIM_NOSCAN; | ||
3737 | 3809 | ||
3738 | if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags)) | 3810 | if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) |
3739 | return ZONE_RECLAIM_NOSCAN; | 3811 | return NODE_RECLAIM_NOSCAN; |
3740 | 3812 | ||
3741 | ret = __zone_reclaim(zone, gfp_mask, order); | 3813 | ret = __node_reclaim(pgdat, gfp_mask, order); |
3742 | clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags); | 3814 | clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); |
3743 | 3815 | ||
3744 | if (!ret) | 3816 | if (!ret) |
3745 | count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); | 3817 | count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); |
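The node_reclaim() entry point above keeps the old serialization scheme, just moved to the pgdat: one flag bit taken with test_and_set_bit() guarantees at most one reclaimer per node, and callers that lose the race get NODE_RECLAIM_NOSCAN. A minimal userspace sketch of that gate, using C11 atomics and made-up names rather than the kernel's bitops (initialise reclaim_locked with ATOMIC_FLAG_INIT):

        #include <stdatomic.h>
        #include <stdbool.h>

        struct node_sketch {
                atomic_flag reclaim_locked;     /* stands in for PGDAT_RECLAIM_LOCKED */
        };

        /* Returns false if another caller is already reclaiming this node. */
        static bool node_reclaim_sketch(struct node_sketch *node)
        {
                if (atomic_flag_test_and_set(&node->reclaim_locked))
                        return false;           /* lost the race: the "NOSCAN" case */

                /* ... shrink the node here, as __node_reclaim() does ... */

                atomic_flag_clear(&node->reclaim_locked);
                return true;
        }

The kernel variant additionally bails out early on the unmapped-page and slab thresholds and on !gfpflags_allow_blocking(), so most allocations never even reach the flag.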
@@ -3778,24 +3850,23 @@ int page_evictable(struct page *page) | |||
3778 | void check_move_unevictable_pages(struct page **pages, int nr_pages) | 3850 | void check_move_unevictable_pages(struct page **pages, int nr_pages) |
3779 | { | 3851 | { |
3780 | struct lruvec *lruvec; | 3852 | struct lruvec *lruvec; |
3781 | struct zone *zone = NULL; | 3853 | struct pglist_data *pgdat = NULL; |
3782 | int pgscanned = 0; | 3854 | int pgscanned = 0; |
3783 | int pgrescued = 0; | 3855 | int pgrescued = 0; |
3784 | int i; | 3856 | int i; |
3785 | 3857 | ||
3786 | for (i = 0; i < nr_pages; i++) { | 3858 | for (i = 0; i < nr_pages; i++) { |
3787 | struct page *page = pages[i]; | 3859 | struct page *page = pages[i]; |
3788 | struct zone *pagezone; | 3860 | struct pglist_data *pagepgdat = page_pgdat(page); |
3789 | 3861 | ||
3790 | pgscanned++; | 3862 | pgscanned++; |
3791 | pagezone = page_zone(page); | 3863 | if (pagepgdat != pgdat) { |
3792 | if (pagezone != zone) { | 3864 | if (pgdat) |
3793 | if (zone) | 3865 | spin_unlock_irq(&pgdat->lru_lock); |
3794 | spin_unlock_irq(&zone->lru_lock); | 3866 | pgdat = pagepgdat; |
3795 | zone = pagezone; | 3867 | spin_lock_irq(&pgdat->lru_lock); |
3796 | spin_lock_irq(&zone->lru_lock); | ||
3797 | } | 3868 | } |
3798 | lruvec = mem_cgroup_page_lruvec(page, zone); | 3869 | lruvec = mem_cgroup_page_lruvec(page, pgdat); |
3799 | 3870 | ||
3800 | if (!PageLRU(page) || !PageUnevictable(page)) | 3871 | if (!PageLRU(page) || !PageUnevictable(page)) |
3801 | continue; | 3872 | continue; |
@@ -3811,10 +3882,10 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) | |||
3811 | } | 3882 | } |
3812 | } | 3883 | } |
3813 | 3884 | ||
3814 | if (zone) { | 3885 | if (pgdat) { |
3815 | __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); | 3886 | __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); |
3816 | __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); | 3887 | __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); |
3817 | spin_unlock_irq(&zone->lru_lock); | 3888 | spin_unlock_irq(&pgdat->lru_lock); |
3818 | } | 3889 | } |
3819 | } | 3890 | } |
3820 | #endif /* CONFIG_SHMEM */ | 3891 | #endif /* CONFIG_SHMEM */ |
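check_move_unevictable_pages() above also shows the lock-batching idiom that the node conversion preserves: pgdat->lru_lock is only dropped and re-taken when consecutive pages belong to different nodes, so a run of same-node pages pays for a single lock round trip. A rough userspace equivalent, with a pthread mutex and hypothetical types standing in for the LRU lock:

        #include <pthread.h>
        #include <stddef.h>

        struct node_lru {
                pthread_mutex_t lock;           /* stands in for pgdat->lru_lock */
        };

        void walk_pages_batched(void **pages, int nr,
                                struct node_lru *(*node_of)(void *page))
        {
                struct node_lru *locked = NULL;

                for (int i = 0; i < nr; i++) {
                        struct node_lru *node = node_of(pages[i]);

                        if (node != locked) {   /* switch locks only on a node change */
                                if (locked)
                                        pthread_mutex_unlock(&locked->lock);
                                locked = node;
                                pthread_mutex_lock(&locked->lock);
                        }
                        /* ... inspect/move pages[i] while holding locked->lock ... */
                }
                if (locked)
                        pthread_mutex_unlock(&locked->lock);
        }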
diff --git a/mm/vmstat.c b/mm/vmstat.c index 7997f52935c9..89cec42d19ff 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -86,8 +86,10 @@ void vm_events_fold_cpu(int cpu) | |||
86 | * | 86 | * |
87 | * vm_stat contains the global counters | 87 | * vm_stat contains the global counters |
88 | */ | 88 | */ |
89 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; | 89 | atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; |
90 | EXPORT_SYMBOL(vm_stat); | 90 | atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp; |
91 | EXPORT_SYMBOL(vm_zone_stat); | ||
92 | EXPORT_SYMBOL(vm_node_stat); | ||
91 | 93 | ||
92 | #ifdef CONFIG_SMP | 94 | #ifdef CONFIG_SMP |
93 | 95 | ||
@@ -167,19 +169,36 @@ int calculate_normal_threshold(struct zone *zone) | |||
167 | */ | 169 | */ |
168 | void refresh_zone_stat_thresholds(void) | 170 | void refresh_zone_stat_thresholds(void) |
169 | { | 171 | { |
172 | struct pglist_data *pgdat; | ||
170 | struct zone *zone; | 173 | struct zone *zone; |
171 | int cpu; | 174 | int cpu; |
172 | int threshold; | 175 | int threshold; |
173 | 176 | ||
177 | /* Zero current pgdat thresholds */ | ||
178 | for_each_online_pgdat(pgdat) { | ||
179 | for_each_online_cpu(cpu) { | ||
180 | per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0; | ||
181 | } | ||
182 | } | ||
183 | |||
174 | for_each_populated_zone(zone) { | 184 | for_each_populated_zone(zone) { |
185 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
175 | unsigned long max_drift, tolerate_drift; | 186 | unsigned long max_drift, tolerate_drift; |
176 | 187 | ||
177 | threshold = calculate_normal_threshold(zone); | 188 | threshold = calculate_normal_threshold(zone); |
178 | 189 | ||
179 | for_each_online_cpu(cpu) | 190 | for_each_online_cpu(cpu) { |
191 | int pgdat_threshold; | ||
192 | |||
180 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold | 193 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold |
181 | = threshold; | 194 | = threshold; |
182 | 195 | ||
196 | /* Base nodestat threshold on the largest populated zone. */ | ||
197 | pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold; | ||
198 | per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold | ||
199 | = max(threshold, pgdat_threshold); | ||
200 | } | ||
201 | |||
183 | /* | 202 | /* |
184 | * Only set percpu_drift_mark if there is a danger that | 203 | * Only set percpu_drift_mark if there is a danger that |
185 | * NR_FREE_PAGES reports the low watermark is ok when in fact | 204 | * NR_FREE_PAGES reports the low watermark is ok when in fact |
@@ -238,6 +257,26 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | |||
238 | } | 257 | } |
239 | EXPORT_SYMBOL(__mod_zone_page_state); | 258 | EXPORT_SYMBOL(__mod_zone_page_state); |
240 | 259 | ||
260 | void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, | ||
261 | long delta) | ||
262 | { | ||
263 | struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; | ||
264 | s8 __percpu *p = pcp->vm_node_stat_diff + item; | ||
265 | long x; | ||
266 | long t; | ||
267 | |||
268 | x = delta + __this_cpu_read(*p); | ||
269 | |||
270 | t = __this_cpu_read(pcp->stat_threshold); | ||
271 | |||
272 | if (unlikely(x > t || x < -t)) { | ||
273 | node_page_state_add(x, pgdat, item); | ||
274 | x = 0; | ||
275 | } | ||
276 | __this_cpu_write(*p, x); | ||
277 | } | ||
278 | EXPORT_SYMBOL(__mod_node_page_state); | ||
279 | |||
241 | /* | 280 | /* |
242 | * Optimized increment and decrement functions. | 281 | * Optimized increment and decrement functions. |
243 | * | 282 | * |
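The new __mod_node_page_state() above mirrors the long-standing per-zone scheme: each CPU accumulates a small signed diff and only folds it into the shared atomic once the diff crosses a per-CPU threshold, keeping the global cache line cold. A minimal single-threaded sketch of the idea in plain C11, with illustrative types in place of the kernel's this_cpu machinery:

        #include <stdatomic.h>

        struct pcpu_stat {
                long diff;                      /* per-CPU delta, not yet published */
                long threshold;                 /* fold into the global beyond this */
        };

        static void mod_stat(atomic_long *global, struct pcpu_stat *pcpu, long delta)
        {
                long x = pcpu->diff + delta;

                if (x > pcpu->threshold || x < -pcpu->threshold) {
                        atomic_fetch_add(global, x);    /* publish the whole delta */
                        x = 0;
                }
                pcpu->diff = x;                 /* otherwise keep it CPU-local */
        }

The __inc_node_state()/__dec_node_state() variants added in the following hunks layer the "overstep" trick on top, overshooting by half a threshold so back-to-back updates in one direction do not fold on every call.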
@@ -277,12 +316,34 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) | |||
277 | } | 316 | } |
278 | } | 317 | } |
279 | 318 | ||
319 | void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) | ||
320 | { | ||
321 | struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; | ||
322 | s8 __percpu *p = pcp->vm_node_stat_diff + item; | ||
323 | s8 v, t; | ||
324 | |||
325 | v = __this_cpu_inc_return(*p); | ||
326 | t = __this_cpu_read(pcp->stat_threshold); | ||
327 | if (unlikely(v > t)) { | ||
328 | s8 overstep = t >> 1; | ||
329 | |||
330 | node_page_state_add(v + overstep, pgdat, item); | ||
331 | __this_cpu_write(*p, -overstep); | ||
332 | } | ||
333 | } | ||
334 | |||
280 | void __inc_zone_page_state(struct page *page, enum zone_stat_item item) | 335 | void __inc_zone_page_state(struct page *page, enum zone_stat_item item) |
281 | { | 336 | { |
282 | __inc_zone_state(page_zone(page), item); | 337 | __inc_zone_state(page_zone(page), item); |
283 | } | 338 | } |
284 | EXPORT_SYMBOL(__inc_zone_page_state); | 339 | EXPORT_SYMBOL(__inc_zone_page_state); |
285 | 340 | ||
341 | void __inc_node_page_state(struct page *page, enum node_stat_item item) | ||
342 | { | ||
343 | __inc_node_state(page_pgdat(page), item); | ||
344 | } | ||
345 | EXPORT_SYMBOL(__inc_node_page_state); | ||
346 | |||
286 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) | 347 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) |
287 | { | 348 | { |
288 | struct per_cpu_pageset __percpu *pcp = zone->pageset; | 349 | struct per_cpu_pageset __percpu *pcp = zone->pageset; |
@@ -299,12 +360,34 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) | |||
299 | } | 360 | } |
300 | } | 361 | } |
301 | 362 | ||
363 | void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) | ||
364 | { | ||
365 | struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; | ||
366 | s8 __percpu *p = pcp->vm_node_stat_diff + item; | ||
367 | s8 v, t; | ||
368 | |||
369 | v = __this_cpu_dec_return(*p); | ||
370 | t = __this_cpu_read(pcp->stat_threshold); | ||
371 | if (unlikely(v < - t)) { | ||
372 | s8 overstep = t >> 1; | ||
373 | |||
374 | node_page_state_add(v - overstep, pgdat, item); | ||
375 | __this_cpu_write(*p, overstep); | ||
376 | } | ||
377 | } | ||
378 | |||
302 | void __dec_zone_page_state(struct page *page, enum zone_stat_item item) | 379 | void __dec_zone_page_state(struct page *page, enum zone_stat_item item) |
303 | { | 380 | { |
304 | __dec_zone_state(page_zone(page), item); | 381 | __dec_zone_state(page_zone(page), item); |
305 | } | 382 | } |
306 | EXPORT_SYMBOL(__dec_zone_page_state); | 383 | EXPORT_SYMBOL(__dec_zone_page_state); |
307 | 384 | ||
385 | void __dec_node_page_state(struct page *page, enum node_stat_item item) | ||
386 | { | ||
387 | __dec_node_state(page_pgdat(page), item); | ||
388 | } | ||
389 | EXPORT_SYMBOL(__dec_node_page_state); | ||
390 | |||
308 | #ifdef CONFIG_HAVE_CMPXCHG_LOCAL | 391 | #ifdef CONFIG_HAVE_CMPXCHG_LOCAL |
309 | /* | 392 | /* |
310 | * If we have cmpxchg_local support then we do not need to incur the overhead | 393 | * If we have cmpxchg_local support then we do not need to incur the overhead |
@@ -318,8 +401,8 @@ EXPORT_SYMBOL(__dec_zone_page_state); | |||
318 | * 1 Overstepping half of threshold | 401 | * 1 Overstepping half of threshold |
319 | * -1 Overstepping minus half of threshold | 402 | * -1 Overstepping minus half of threshold |
320 | */ | 403 | */ |
321 | static inline void mod_state(struct zone *zone, enum zone_stat_item item, | 404 | static inline void mod_zone_state(struct zone *zone, |
322 | long delta, int overstep_mode) | 405 | enum zone_stat_item item, long delta, int overstep_mode) |
323 | { | 406 | { |
324 | struct per_cpu_pageset __percpu *pcp = zone->pageset; | 407 | struct per_cpu_pageset __percpu *pcp = zone->pageset; |
325 | s8 __percpu *p = pcp->vm_stat_diff + item; | 408 | s8 __percpu *p = pcp->vm_stat_diff + item; |
@@ -359,26 +442,83 @@ static inline void mod_state(struct zone *zone, enum zone_stat_item item, | |||
359 | void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | 442 | void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
360 | long delta) | 443 | long delta) |
361 | { | 444 | { |
362 | mod_state(zone, item, delta, 0); | 445 | mod_zone_state(zone, item, delta, 0); |
363 | } | 446 | } |
364 | EXPORT_SYMBOL(mod_zone_page_state); | 447 | EXPORT_SYMBOL(mod_zone_page_state); |
365 | 448 | ||
366 | void inc_zone_state(struct zone *zone, enum zone_stat_item item) | ||
367 | { | ||
368 | mod_state(zone, item, 1, 1); | ||
369 | } | ||
370 | |||
371 | void inc_zone_page_state(struct page *page, enum zone_stat_item item) | 449 | void inc_zone_page_state(struct page *page, enum zone_stat_item item) |
372 | { | 450 | { |
373 | mod_state(page_zone(page), item, 1, 1); | 451 | mod_zone_state(page_zone(page), item, 1, 1); |
374 | } | 452 | } |
375 | EXPORT_SYMBOL(inc_zone_page_state); | 453 | EXPORT_SYMBOL(inc_zone_page_state); |
376 | 454 | ||
377 | void dec_zone_page_state(struct page *page, enum zone_stat_item item) | 455 | void dec_zone_page_state(struct page *page, enum zone_stat_item item) |
378 | { | 456 | { |
379 | mod_state(page_zone(page), item, -1, -1); | 457 | mod_zone_state(page_zone(page), item, -1, -1); |
380 | } | 458 | } |
381 | EXPORT_SYMBOL(dec_zone_page_state); | 459 | EXPORT_SYMBOL(dec_zone_page_state); |
460 | |||
461 | static inline void mod_node_state(struct pglist_data *pgdat, | ||
462 | enum node_stat_item item, int delta, int overstep_mode) | ||
463 | { | ||
464 | struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; | ||
465 | s8 __percpu *p = pcp->vm_node_stat_diff + item; | ||
466 | long o, n, t, z; | ||
467 | |||
468 | do { | ||
469 | z = 0; /* overflow to node counters */ | ||
470 | |||
471 | /* | ||
472 | * The fetching of the stat_threshold is racy. We may apply | ||
473 | * a counter threshold to the wrong the cpu if we get | ||
474 | * rescheduled while executing here. However, the next | ||
475 | * counter update will apply the threshold again and | ||
476 | * therefore bring the counter under the threshold again. | ||
477 | * | ||
478 | * Most of the time the thresholds are the same anyways | ||
479 | * for all cpus in a node. | ||
480 | */ | ||
481 | t = this_cpu_read(pcp->stat_threshold); | ||
482 | |||
483 | o = this_cpu_read(*p); | ||
484 | n = delta + o; | ||
485 | |||
486 | if (n > t || n < -t) { | ||
487 | int os = overstep_mode * (t >> 1) ; | ||
488 | |||
489 | /* Overflow must be added to node counters */ | ||
490 | z = n + os; | ||
491 | n = -os; | ||
492 | } | ||
493 | } while (this_cpu_cmpxchg(*p, o, n) != o); | ||
494 | |||
495 | if (z) | ||
496 | node_page_state_add(z, pgdat, item); | ||
497 | } | ||
498 | |||
499 | void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, | ||
500 | long delta) | ||
501 | { | ||
502 | mod_node_state(pgdat, item, delta, 0); | ||
503 | } | ||
504 | EXPORT_SYMBOL(mod_node_page_state); | ||
505 | |||
506 | void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) | ||
507 | { | ||
508 | mod_node_state(pgdat, item, 1, 1); | ||
509 | } | ||
510 | |||
511 | void inc_node_page_state(struct page *page, enum node_stat_item item) | ||
512 | { | ||
513 | mod_node_state(page_pgdat(page), item, 1, 1); | ||
514 | } | ||
515 | EXPORT_SYMBOL(inc_node_page_state); | ||
516 | |||
517 | void dec_node_page_state(struct page *page, enum node_stat_item item) | ||
518 | { | ||
519 | mod_node_state(page_pgdat(page), item, -1, -1); | ||
520 | } | ||
521 | EXPORT_SYMBOL(dec_node_page_state); | ||
382 | #else | 522 | #else |
383 | /* | 523 | /* |
384 | * Use interrupt disable to serialize counter updates | 524 | * Use interrupt disable to serialize counter updates |
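mod_node_state() above is the lockless (CONFIG_HAVE_CMPXCHG_LOCAL) flavour: instead of disabling interrupts it retries a compare-and-swap on the per-CPU diff, and when the threshold is crossed it leaves -overstep behind so the next few updates stay under it. The shape of that loop, sketched with C11 atomics and hypothetical parameters rather than this_cpu_cmpxchg():

        #include <stdatomic.h>

        static void mod_stat_cmpxchg(atomic_long *global, _Atomic long *pcpu_diff,
                                     long threshold, long delta, int overstep_mode)
        {
                long o, n, z;

                do {
                        z = 0;                          /* amount to push to the global */
                        o = atomic_load(pcpu_diff);
                        n = delta + o;

                        if (n > threshold || n < -threshold) {
                                long os = overstep_mode * (threshold >> 1);

                                z = n + os;             /* overflow goes global */
                                n = -os;                /* leave headroom locally */
                        }
                } while (!atomic_compare_exchange_weak(pcpu_diff, &o, n));

                if (z)
                        atomic_fetch_add(global, z);
        }

overstep_mode follows the same 0/1/-1 convention documented above mod_zone_state() earlier in this file.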
@@ -394,15 +534,6 @@ void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | |||
394 | } | 534 | } |
395 | EXPORT_SYMBOL(mod_zone_page_state); | 535 | EXPORT_SYMBOL(mod_zone_page_state); |
396 | 536 | ||
397 | void inc_zone_state(struct zone *zone, enum zone_stat_item item) | ||
398 | { | ||
399 | unsigned long flags; | ||
400 | |||
401 | local_irq_save(flags); | ||
402 | __inc_zone_state(zone, item); | ||
403 | local_irq_restore(flags); | ||
404 | } | ||
405 | |||
406 | void inc_zone_page_state(struct page *page, enum zone_stat_item item) | 537 | void inc_zone_page_state(struct page *page, enum zone_stat_item item) |
407 | { | 538 | { |
408 | unsigned long flags; | 539 | unsigned long flags; |
@@ -424,21 +555,69 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) | |||
424 | local_irq_restore(flags); | 555 | local_irq_restore(flags); |
425 | } | 556 | } |
426 | EXPORT_SYMBOL(dec_zone_page_state); | 557 | EXPORT_SYMBOL(dec_zone_page_state); |
427 | #endif | ||
428 | 558 | ||
559 | void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) | ||
560 | { | ||
561 | unsigned long flags; | ||
562 | |||
563 | local_irq_save(flags); | ||
564 | __inc_node_state(pgdat, item); | ||
565 | local_irq_restore(flags); | ||
566 | } | ||
567 | EXPORT_SYMBOL(inc_node_state); | ||
568 | |||
569 | void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, | ||
570 | long delta) | ||
571 | { | ||
572 | unsigned long flags; | ||
573 | |||
574 | local_irq_save(flags); | ||
575 | __mod_node_page_state(pgdat, item, delta); | ||
576 | local_irq_restore(flags); | ||
577 | } | ||
578 | EXPORT_SYMBOL(mod_node_page_state); | ||
579 | |||
580 | void inc_node_page_state(struct page *page, enum node_stat_item item) | ||
581 | { | ||
582 | unsigned long flags; | ||
583 | struct pglist_data *pgdat; | ||
584 | |||
585 | pgdat = page_pgdat(page); | ||
586 | local_irq_save(flags); | ||
587 | __inc_node_state(pgdat, item); | ||
588 | local_irq_restore(flags); | ||
589 | } | ||
590 | EXPORT_SYMBOL(inc_node_page_state); | ||
591 | |||
592 | void dec_node_page_state(struct page *page, enum node_stat_item item) | ||
593 | { | ||
594 | unsigned long flags; | ||
595 | |||
596 | local_irq_save(flags); | ||
597 | __dec_node_page_state(page, item); | ||
598 | local_irq_restore(flags); | ||
599 | } | ||
600 | EXPORT_SYMBOL(dec_node_page_state); | ||
601 | #endif | ||
429 | 602 | ||
430 | /* | 603 | /* |
431 | * Fold a differential into the global counters. | 604 | * Fold a differential into the global counters. |
432 | * Returns the number of counters updated. | 605 | * Returns the number of counters updated. |
433 | */ | 606 | */ |
434 | static int fold_diff(int *diff) | 607 | static int fold_diff(int *zone_diff, int *node_diff) |
435 | { | 608 | { |
436 | int i; | 609 | int i; |
437 | int changes = 0; | 610 | int changes = 0; |
438 | 611 | ||
439 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 612 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
440 | if (diff[i]) { | 613 | if (zone_diff[i]) { |
441 | atomic_long_add(diff[i], &vm_stat[i]); | 614 | atomic_long_add(zone_diff[i], &vm_zone_stat[i]); |
615 | changes++; | ||
616 | } | ||
617 | |||
618 | for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) | ||
619 | if (node_diff[i]) { | ||
620 | atomic_long_add(node_diff[i], &vm_node_stat[i]); | ||
442 | changes++; | 621 | changes++; |
443 | } | 622 | } |
444 | return changes; | 623 | return changes; |
@@ -462,9 +641,11 @@ static int fold_diff(int *diff) | |||
462 | */ | 641 | */ |
463 | static int refresh_cpu_vm_stats(bool do_pagesets) | 642 | static int refresh_cpu_vm_stats(bool do_pagesets) |
464 | { | 643 | { |
644 | struct pglist_data *pgdat; | ||
465 | struct zone *zone; | 645 | struct zone *zone; |
466 | int i; | 646 | int i; |
467 | int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; | 647 | int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; |
648 | int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; | ||
468 | int changes = 0; | 649 | int changes = 0; |
469 | 650 | ||
470 | for_each_populated_zone(zone) { | 651 | for_each_populated_zone(zone) { |
@@ -477,7 +658,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets) | |||
477 | if (v) { | 658 | if (v) { |
478 | 659 | ||
479 | atomic_long_add(v, &zone->vm_stat[i]); | 660 | atomic_long_add(v, &zone->vm_stat[i]); |
480 | global_diff[i] += v; | 661 | global_zone_diff[i] += v; |
481 | #ifdef CONFIG_NUMA | 662 | #ifdef CONFIG_NUMA |
482 | /* 3 seconds idle till flush */ | 663 | /* 3 seconds idle till flush */ |
483 | __this_cpu_write(p->expire, 3); | 664 | __this_cpu_write(p->expire, 3); |
@@ -516,7 +697,22 @@ static int refresh_cpu_vm_stats(bool do_pagesets) | |||
516 | } | 697 | } |
517 | #endif | 698 | #endif |
518 | } | 699 | } |
519 | changes += fold_diff(global_diff); | 700 | |
701 | for_each_online_pgdat(pgdat) { | ||
702 | struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats; | ||
703 | |||
704 | for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { | ||
705 | int v; | ||
706 | |||
707 | v = this_cpu_xchg(p->vm_node_stat_diff[i], 0); | ||
708 | if (v) { | ||
709 | atomic_long_add(v, &pgdat->vm_stat[i]); | ||
710 | global_node_diff[i] += v; | ||
711 | } | ||
712 | } | ||
713 | } | ||
714 | |||
715 | changes += fold_diff(global_zone_diff, global_node_diff); | ||
520 | return changes; | 716 | return changes; |
521 | } | 717 | } |
522 | 718 | ||
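refresh_cpu_vm_stats() drains the new per-node diffs the same way it drains the zone ones: this_cpu_xchg() atomically grabs the diff and leaves zero behind, so a concurrent update on that CPU is never lost, and the grabbed value is added to the pgdat counter and accumulated for the global fold. The core of that drain as a sketch, collapsing the two-level fold into one and using a plain C11 atomic exchange in place of this_cpu_xchg():

        #include <stdatomic.h>

        /* Returns non-zero if anything was folded, mirroring the "changes"
         * bookkeeping in refresh_cpu_vm_stats(). */
        static long drain_one_diff(atomic_long *global, _Atomic long *pcpu_diff)
        {
                long v = atomic_exchange(pcpu_diff, 0); /* take the diff, leave 0 */

                if (v)
                        atomic_fetch_add(global, v);
                return v;
        }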
@@ -527,9 +723,11 @@ static int refresh_cpu_vm_stats(bool do_pagesets) | |||
527 | */ | 723 | */ |
528 | void cpu_vm_stats_fold(int cpu) | 724 | void cpu_vm_stats_fold(int cpu) |
529 | { | 725 | { |
726 | struct pglist_data *pgdat; | ||
530 | struct zone *zone; | 727 | struct zone *zone; |
531 | int i; | 728 | int i; |
532 | int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; | 729 | int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; |
730 | int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; | ||
533 | 731 | ||
534 | for_each_populated_zone(zone) { | 732 | for_each_populated_zone(zone) { |
535 | struct per_cpu_pageset *p; | 733 | struct per_cpu_pageset *p; |
@@ -543,11 +741,27 @@ void cpu_vm_stats_fold(int cpu) | |||
543 | v = p->vm_stat_diff[i]; | 741 | v = p->vm_stat_diff[i]; |
544 | p->vm_stat_diff[i] = 0; | 742 | p->vm_stat_diff[i] = 0; |
545 | atomic_long_add(v, &zone->vm_stat[i]); | 743 | atomic_long_add(v, &zone->vm_stat[i]); |
546 | global_diff[i] += v; | 744 | global_zone_diff[i] += v; |
547 | } | 745 | } |
548 | } | 746 | } |
549 | 747 | ||
550 | fold_diff(global_diff); | 748 | for_each_online_pgdat(pgdat) { |
749 | struct per_cpu_nodestat *p; | ||
750 | |||
751 | p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); | ||
752 | |||
753 | for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) | ||
754 | if (p->vm_node_stat_diff[i]) { | ||
755 | int v; | ||
756 | |||
757 | v = p->vm_node_stat_diff[i]; | ||
758 | p->vm_node_stat_diff[i] = 0; | ||
759 | atomic_long_add(v, &pgdat->vm_stat[i]); | ||
760 | global_node_diff[i] += v; | ||
761 | } | ||
762 | } | ||
763 | |||
764 | fold_diff(global_zone_diff, global_node_diff); | ||
551 | } | 765 | } |
552 | 766 | ||
553 | /* | 767 | /* |
@@ -563,16 +777,19 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) | |||
563 | int v = pset->vm_stat_diff[i]; | 777 | int v = pset->vm_stat_diff[i]; |
564 | pset->vm_stat_diff[i] = 0; | 778 | pset->vm_stat_diff[i] = 0; |
565 | atomic_long_add(v, &zone->vm_stat[i]); | 779 | atomic_long_add(v, &zone->vm_stat[i]); |
566 | atomic_long_add(v, &vm_stat[i]); | 780 | atomic_long_add(v, &vm_zone_stat[i]); |
567 | } | 781 | } |
568 | } | 782 | } |
569 | #endif | 783 | #endif |
570 | 784 | ||
571 | #ifdef CONFIG_NUMA | 785 | #ifdef CONFIG_NUMA |
572 | /* | 786 | /* |
573 | * Determine the per node value of a stat item. | 787 | * Determine the per node value of a stat item. This function |
788 | * is called frequently in a NUMA machine, so try to be as | ||
789 | * frugal as possible. | ||
574 | */ | 790 | */ |
575 | unsigned long node_page_state(int node, enum zone_stat_item item) | 791 | unsigned long sum_zone_node_page_state(int node, |
792 | enum zone_stat_item item) | ||
576 | { | 793 | { |
577 | struct zone *zones = NODE_DATA(node)->node_zones; | 794 | struct zone *zones = NODE_DATA(node)->node_zones; |
578 | int i; | 795 | int i; |
@@ -584,6 +801,19 @@ unsigned long node_page_state(int node, enum zone_stat_item item) | |||
584 | return count; | 801 | return count; |
585 | } | 802 | } |
586 | 803 | ||
804 | /* | ||
805 | * Determine the per node value of a stat item. | ||
806 | */ | ||
807 | unsigned long node_page_state(struct pglist_data *pgdat, | ||
808 | enum node_stat_item item) | ||
809 | { | ||
810 | long x = atomic_long_read(&pgdat->vm_stat[item]); | ||
811 | #ifdef CONFIG_SMP | ||
812 | if (x < 0) | ||
813 | x = 0; | ||
814 | #endif | ||
815 | return x; | ||
816 | } | ||
587 | #endif | 817 | #endif |
588 | 818 | ||
589 | #ifdef CONFIG_COMPACTION | 819 | #ifdef CONFIG_COMPACTION |
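The new node_page_state() reader above clamps negative values to zero on SMP for the same reason the zone readers do: per-CPU diffs that have not been folded yet can leave the global atomic transiently below zero, and reporting zero is the cheap, safe answer. As a one-function sketch with an illustrative signature:

        #include <stdatomic.h>

        static unsigned long read_stat(atomic_long *counter)
        {
                long x = atomic_load(counter);

                return x < 0 ? 0 : (unsigned long)x;    /* hide transient negatives */
        }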
@@ -691,33 +921,18 @@ int fragmentation_index(struct zone *zone, unsigned int order) | |||
691 | const char * const vmstat_text[] = { | 921 | const char * const vmstat_text[] = { |
692 | /* enum zone_stat_item countes */ | 922 | /* enum zone_stat_item countes */ |
693 | "nr_free_pages", | 923 | "nr_free_pages", |
694 | "nr_alloc_batch", | 924 | "nr_zone_inactive_anon", |
695 | "nr_inactive_anon", | 925 | "nr_zone_active_anon", |
696 | "nr_active_anon", | 926 | "nr_zone_inactive_file", |
697 | "nr_inactive_file", | 927 | "nr_zone_active_file", |
698 | "nr_active_file", | 928 | "nr_zone_unevictable", |
699 | "nr_unevictable", | 929 | "nr_zone_write_pending", |
700 | "nr_mlock", | 930 | "nr_mlock", |
701 | "nr_anon_pages", | ||
702 | "nr_mapped", | ||
703 | "nr_file_pages", | ||
704 | "nr_dirty", | ||
705 | "nr_writeback", | ||
706 | "nr_slab_reclaimable", | 931 | "nr_slab_reclaimable", |
707 | "nr_slab_unreclaimable", | 932 | "nr_slab_unreclaimable", |
708 | "nr_page_table_pages", | 933 | "nr_page_table_pages", |
709 | "nr_kernel_stack", | 934 | "nr_kernel_stack", |
710 | "nr_unstable", | ||
711 | "nr_bounce", | 935 | "nr_bounce", |
712 | "nr_vmscan_write", | ||
713 | "nr_vmscan_immediate_reclaim", | ||
714 | "nr_writeback_temp", | ||
715 | "nr_isolated_anon", | ||
716 | "nr_isolated_file", | ||
717 | "nr_shmem", | ||
718 | "nr_dirtied", | ||
719 | "nr_written", | ||
720 | "nr_pages_scanned", | ||
721 | #if IS_ENABLED(CONFIG_ZSMALLOC) | 936 | #if IS_ENABLED(CONFIG_ZSMALLOC) |
722 | "nr_zspages", | 937 | "nr_zspages", |
723 | #endif | 938 | #endif |
@@ -729,13 +944,35 @@ const char * const vmstat_text[] = { | |||
729 | "numa_local", | 944 | "numa_local", |
730 | "numa_other", | 945 | "numa_other", |
731 | #endif | 946 | #endif |
947 | "nr_free_cma", | ||
948 | |||
949 | /* Node-based counters */ | ||
950 | "nr_inactive_anon", | ||
951 | "nr_active_anon", | ||
952 | "nr_inactive_file", | ||
953 | "nr_active_file", | ||
954 | "nr_unevictable", | ||
955 | "nr_isolated_anon", | ||
956 | "nr_isolated_file", | ||
957 | "nr_pages_scanned", | ||
732 | "workingset_refault", | 958 | "workingset_refault", |
733 | "workingset_activate", | 959 | "workingset_activate", |
734 | "workingset_nodereclaim", | 960 | "workingset_nodereclaim", |
735 | "nr_anon_transparent_hugepages", | 961 | "nr_anon_pages", |
962 | "nr_mapped", | ||
963 | "nr_file_pages", | ||
964 | "nr_dirty", | ||
965 | "nr_writeback", | ||
966 | "nr_writeback_temp", | ||
967 | "nr_shmem", | ||
736 | "nr_shmem_hugepages", | 968 | "nr_shmem_hugepages", |
737 | "nr_shmem_pmdmapped", | 969 | "nr_shmem_pmdmapped", |
738 | "nr_free_cma", | 970 | "nr_anon_transparent_hugepages", |
971 | "nr_unstable", | ||
972 | "nr_vmscan_write", | ||
973 | "nr_vmscan_immediate_reclaim", | ||
974 | "nr_dirtied", | ||
975 | "nr_written", | ||
739 | 976 | ||
740 | /* enum writeback_stat_item counters */ | 977 | /* enum writeback_stat_item counters */ |
741 | "nr_dirty_threshold", | 978 | "nr_dirty_threshold", |
@@ -749,6 +986,8 @@ const char * const vmstat_text[] = { | |||
749 | "pswpout", | 986 | "pswpout", |
750 | 987 | ||
751 | TEXTS_FOR_ZONES("pgalloc") | 988 | TEXTS_FOR_ZONES("pgalloc") |
989 | TEXTS_FOR_ZONES("allocstall") | ||
990 | TEXTS_FOR_ZONES("pgskip") | ||
752 | 991 | ||
753 | "pgfree", | 992 | "pgfree", |
754 | "pgactivate", | 993 | "pgactivate", |
@@ -758,11 +997,11 @@ const char * const vmstat_text[] = { | |||
758 | "pgmajfault", | 997 | "pgmajfault", |
759 | "pglazyfreed", | 998 | "pglazyfreed", |
760 | 999 | ||
761 | TEXTS_FOR_ZONES("pgrefill") | 1000 | "pgrefill", |
762 | TEXTS_FOR_ZONES("pgsteal_kswapd") | 1001 | "pgsteal_kswapd", |
763 | TEXTS_FOR_ZONES("pgsteal_direct") | 1002 | "pgsteal_direct", |
764 | TEXTS_FOR_ZONES("pgscan_kswapd") | 1003 | "pgscan_kswapd", |
765 | TEXTS_FOR_ZONES("pgscan_direct") | 1004 | "pgscan_direct", |
766 | "pgscan_direct_throttle", | 1005 | "pgscan_direct_throttle", |
767 | 1006 | ||
768 | #ifdef CONFIG_NUMA | 1007 | #ifdef CONFIG_NUMA |
@@ -774,7 +1013,6 @@ const char * const vmstat_text[] = { | |||
774 | "kswapd_low_wmark_hit_quickly", | 1013 | "kswapd_low_wmark_hit_quickly", |
775 | "kswapd_high_wmark_hit_quickly", | 1014 | "kswapd_high_wmark_hit_quickly", |
776 | "pageoutrun", | 1015 | "pageoutrun", |
777 | "allocstall", | ||
778 | 1016 | ||
779 | "pgrotated", | 1017 | "pgrotated", |
780 | 1018 | ||
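The large vmstat_text[] reshuffle above is not cosmetic: the array is indexed positionally, zone_stat_item entries first and node_stat_item entries after them (zoneinfo_show_print() below reads vmstat_text[i + NR_VM_ZONE_STAT_ITEMS] for the per-node block), so the strings must stay in exact enum order. A compile-time sketch of that contract, with made-up item names standing in for the real enums:

        enum zone_items { NR_FREE, NR_MLOCK, NR_ZONE_ITEMS };
        enum node_items { NR_INACTIVE_ANON, NR_ACTIVE_ANON, NR_NODE_ITEMS };

        static const char *const stat_text[] = {
                /* zone items, in enum zone_items order */
                "nr_free_pages",
                "nr_mlock",
                /* node items follow, in enum node_items order */
                "nr_inactive_anon",
                "nr_active_anon",
        };

        _Static_assert(sizeof(stat_text) / sizeof(stat_text[0]) ==
                       NR_ZONE_ITEMS + NR_NODE_ITEMS,
                       "stat_text must track both enums");

        /* A node item's name is then stat_text[NR_ZONE_ITEMS + item]. */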
@@ -1180,17 +1418,41 @@ static const struct file_operations pagetypeinfo_file_ops = { | |||
1180 | .release = seq_release, | 1418 | .release = seq_release, |
1181 | }; | 1419 | }; |
1182 | 1420 | ||
1421 | static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone) | ||
1422 | { | ||
1423 | int zid; | ||
1424 | |||
1425 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | ||
1426 | struct zone *compare = &pgdat->node_zones[zid]; | ||
1427 | |||
1428 | if (populated_zone(compare)) | ||
1429 | return zone == compare; | ||
1430 | } | ||
1431 | |||
1432 | /* The zone must be somewhere! */ | ||
1433 | WARN_ON_ONCE(1); | ||
1434 | return false; | ||
1435 | } | ||
1436 | |||
1183 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | 1437 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, |
1184 | struct zone *zone) | 1438 | struct zone *zone) |
1185 | { | 1439 | { |
1186 | int i; | 1440 | int i; |
1187 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); | 1441 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); |
1442 | if (is_zone_first_populated(pgdat, zone)) { | ||
1443 | seq_printf(m, "\n per-node stats"); | ||
1444 | for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { | ||
1445 | seq_printf(m, "\n %-12s %lu", | ||
1446 | vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], | ||
1447 | node_page_state(pgdat, i)); | ||
1448 | } | ||
1449 | } | ||
1188 | seq_printf(m, | 1450 | seq_printf(m, |
1189 | "\n pages free %lu" | 1451 | "\n pages free %lu" |
1190 | "\n min %lu" | 1452 | "\n min %lu" |
1191 | "\n low %lu" | 1453 | "\n low %lu" |
1192 | "\n high %lu" | 1454 | "\n high %lu" |
1193 | "\n scanned %lu" | 1455 | "\n node_scanned %lu" |
1194 | "\n spanned %lu" | 1456 | "\n spanned %lu" |
1195 | "\n present %lu" | 1457 | "\n present %lu" |
1196 | "\n managed %lu", | 1458 | "\n managed %lu", |
@@ -1198,13 +1460,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
1198 | min_wmark_pages(zone), | 1460 | min_wmark_pages(zone), |
1199 | low_wmark_pages(zone), | 1461 | low_wmark_pages(zone), |
1200 | high_wmark_pages(zone), | 1462 | high_wmark_pages(zone), |
1201 | zone_page_state(zone, NR_PAGES_SCANNED), | 1463 | node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED), |
1202 | zone->spanned_pages, | 1464 | zone->spanned_pages, |
1203 | zone->present_pages, | 1465 | zone->present_pages, |
1204 | zone->managed_pages); | 1466 | zone->managed_pages); |
1205 | 1467 | ||
1206 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 1468 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
1207 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], | 1469 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], |
1208 | zone_page_state(zone, i)); | 1470 | zone_page_state(zone, i)); |
1209 | 1471 | ||
1210 | seq_printf(m, | 1472 | seq_printf(m, |
@@ -1234,12 +1496,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
1234 | #endif | 1496 | #endif |
1235 | } | 1497 | } |
1236 | seq_printf(m, | 1498 | seq_printf(m, |
1237 | "\n all_unreclaimable: %u" | 1499 | "\n node_unreclaimable: %u" |
1238 | "\n start_pfn: %lu" | 1500 | "\n start_pfn: %lu" |
1239 | "\n inactive_ratio: %u", | 1501 | "\n node_inactive_ratio: %u", |
1240 | !zone_reclaimable(zone), | 1502 | !pgdat_reclaimable(zone->zone_pgdat), |
1241 | zone->zone_start_pfn, | 1503 | zone->zone_start_pfn, |
1242 | zone->inactive_ratio); | 1504 | zone->zone_pgdat->inactive_ratio); |
1243 | seq_putc(m, '\n'); | 1505 | seq_putc(m, '\n'); |
1244 | } | 1506 | } |
1245 | 1507 | ||
@@ -1287,6 +1549,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) | |||
1287 | if (*pos >= ARRAY_SIZE(vmstat_text)) | 1549 | if (*pos >= ARRAY_SIZE(vmstat_text)) |
1288 | return NULL; | 1550 | return NULL; |
1289 | stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + | 1551 | stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + |
1552 | NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) + | ||
1290 | NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); | 1553 | NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); |
1291 | 1554 | ||
1292 | #ifdef CONFIG_VM_EVENT_COUNTERS | 1555 | #ifdef CONFIG_VM_EVENT_COUNTERS |
@@ -1301,6 +1564,10 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) | |||
1301 | v[i] = global_page_state(i); | 1564 | v[i] = global_page_state(i); |
1302 | v += NR_VM_ZONE_STAT_ITEMS; | 1565 | v += NR_VM_ZONE_STAT_ITEMS; |
1303 | 1566 | ||
1567 | for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) | ||
1568 | v[i] = global_node_page_state(i); | ||
1569 | v += NR_VM_NODE_STAT_ITEMS; | ||
1570 | |||
1304 | global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, | 1571 | global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, |
1305 | v + NR_DIRTY_THRESHOLD); | 1572 | v + NR_DIRTY_THRESHOLD); |
1306 | v += NR_VM_WRITEBACK_STAT_ITEMS; | 1573 | v += NR_VM_WRITEBACK_STAT_ITEMS; |
@@ -1325,7 +1592,6 @@ static int vmstat_show(struct seq_file *m, void *arg) | |||
1325 | { | 1592 | { |
1326 | unsigned long *l = arg; | 1593 | unsigned long *l = arg; |
1327 | unsigned long off = l - (unsigned long *)m->private; | 1594 | unsigned long off = l - (unsigned long *)m->private; |
1328 | |||
1329 | seq_printf(m, "%s %lu\n", vmstat_text[off], *l); | 1595 | seq_printf(m, "%s %lu\n", vmstat_text[off], *l); |
1330 | return 0; | 1596 | return 0; |
1331 | } | 1597 | } |
@@ -1390,13 +1656,12 @@ int vmstat_refresh(struct ctl_table *table, int write, | |||
1390 | if (err) | 1656 | if (err) |
1391 | return err; | 1657 | return err; |
1392 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { | 1658 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { |
1393 | val = atomic_long_read(&vm_stat[i]); | 1659 | val = atomic_long_read(&vm_zone_stat[i]); |
1394 | if (val < 0) { | 1660 | if (val < 0) { |
1395 | switch (i) { | 1661 | switch (i) { |
1396 | case NR_ALLOC_BATCH: | ||
1397 | case NR_PAGES_SCANNED: | 1662 | case NR_PAGES_SCANNED: |
1398 | /* | 1663 | /* |
1399 | * These are often seen to go negative in | 1664 | * This is often seen to go negative in |
1400 | * recent kernels, but not to go permanently | 1665 | * recent kernels, but not to go permanently |
1401 | * negative. Whilst it would be nicer not to | 1666 | * negative. Whilst it would be nicer not to |
1402 | * have exceptions, rooting them out would be | 1667 | * have exceptions, rooting them out would be |
diff --git a/mm/workingset.c b/mm/workingset.c index 577277546d98..69551cfae97b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c | |||
@@ -16,7 +16,7 @@ | |||
16 | /* | 16 | /* |
17 | * Double CLOCK lists | 17 | * Double CLOCK lists |
18 | * | 18 | * |
19 | * Per zone, two clock lists are maintained for file pages: the | 19 | * Per node, two clock lists are maintained for file pages: the |
20 | * inactive and the active list. Freshly faulted pages start out at | 20 | * inactive and the active list. Freshly faulted pages start out at |
21 | * the head of the inactive list and page reclaim scans pages from the | 21 | * the head of the inactive list and page reclaim scans pages from the |
22 | * tail. Pages that are accessed multiple times on the inactive list | 22 | * tail. Pages that are accessed multiple times on the inactive list |
@@ -141,11 +141,11 @@ | |||
141 | * | 141 | * |
142 | * Implementation | 142 | * Implementation |
143 | * | 143 | * |
144 | * For each zone's file LRU lists, a counter for inactive evictions | 144 | * For each node's file LRU lists, a counter for inactive evictions |
145 | * and activations is maintained (zone->inactive_age). | 145 | * and activations is maintained (node->inactive_age). |
146 | * | 146 | * |
147 | * On eviction, a snapshot of this counter (along with some bits to | 147 | * On eviction, a snapshot of this counter (along with some bits to |
148 | * identify the zone) is stored in the now empty page cache radix tree | 148 | * identify the node) is stored in the now empty page cache radix tree |
149 | * slot of the evicted page. This is called a shadow entry. | 149 | * slot of the evicted page. This is called a shadow entry. |
150 | * | 150 | * |
151 | * On cache misses for which there are shadow entries, an eligible | 151 | * On cache misses for which there are shadow entries, an eligible |
@@ -153,7 +153,7 @@ | |||
153 | */ | 153 | */ |
154 | 154 | ||
155 | #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ | 155 | #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ |
156 | ZONES_SHIFT + NODES_SHIFT + \ | 156 | NODES_SHIFT + \ |
157 | MEM_CGROUP_ID_SHIFT) | 157 | MEM_CGROUP_ID_SHIFT) |
158 | #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) | 158 | #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) |
159 | 159 | ||
@@ -167,33 +167,30 @@ | |||
167 | */ | 167 | */ |
168 | static unsigned int bucket_order __read_mostly; | 168 | static unsigned int bucket_order __read_mostly; |
169 | 169 | ||
170 | static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction) | 170 | static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) |
171 | { | 171 | { |
172 | eviction >>= bucket_order; | 172 | eviction >>= bucket_order; |
173 | eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; | 173 | eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; |
174 | eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); | 174 | eviction = (eviction << NODES_SHIFT) | pgdat->node_id; |
175 | eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); | ||
176 | eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); | 175 | eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); |
177 | 176 | ||
178 | return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); | 177 | return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); |
179 | } | 178 | } |
180 | 179 | ||
181 | static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep, | 180 | static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, |
182 | unsigned long *evictionp) | 181 | unsigned long *evictionp) |
183 | { | 182 | { |
184 | unsigned long entry = (unsigned long)shadow; | 183 | unsigned long entry = (unsigned long)shadow; |
185 | int memcgid, nid, zid; | 184 | int memcgid, nid; |
186 | 185 | ||
187 | entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; | 186 | entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; |
188 | zid = entry & ((1UL << ZONES_SHIFT) - 1); | ||
189 | entry >>= ZONES_SHIFT; | ||
190 | nid = entry & ((1UL << NODES_SHIFT) - 1); | 187 | nid = entry & ((1UL << NODES_SHIFT) - 1); |
191 | entry >>= NODES_SHIFT; | 188 | entry >>= NODES_SHIFT; |
192 | memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); | 189 | memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); |
193 | entry >>= MEM_CGROUP_ID_SHIFT; | 190 | entry >>= MEM_CGROUP_ID_SHIFT; |
194 | 191 | ||
195 | *memcgidp = memcgid; | 192 | *memcgidp = memcgid; |
196 | *zonep = NODE_DATA(nid)->node_zones + zid; | 193 | *pgdat = NODE_DATA(nid); |
197 | *evictionp = entry << bucket_order; | 194 | *evictionp = entry << bucket_order; |
198 | } | 195 | } |
199 | 196 | ||
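With zones gone from the shadow entry, pack_shadow()/unpack_shadow() above only fold the memcg id, the node id and the bucketed eviction counter into the single word stored in the radix tree slot. The bit-shifting round trip, sketched with illustrative field widths (the kernel derives the real ones from MEM_CGROUP_ID_SHIFT, NODES_SHIFT and the radix-tree entry tag):

        #include <stdint.h>

        #define MEMCG_BITS  16          /* illustrative widths, not the kernel's */
        #define NODE_BITS    6
        #define TAG_BITS     2          /* stands in for the exceptional-entry tag */

        static uintptr_t pack_shadow_sketch(unsigned memcgid, unsigned nid,
                                            uintptr_t eviction)
        {
                uintptr_t e = eviction;

                e = (e << MEMCG_BITS) | memcgid;
                e = (e << NODE_BITS)  | nid;
                e = (e << TAG_BITS)   | 1;      /* low tag marks "not a page pointer" */
                return e;
        }

        static void unpack_shadow_sketch(uintptr_t e, unsigned *memcgid,
                                         unsigned *nid, uintptr_t *eviction)
        {
                e >>= TAG_BITS;
                *nid = e & ((1u << NODE_BITS) - 1);
                e >>= NODE_BITS;
                *memcgid = e & ((1u << MEMCG_BITS) - 1);
                e >>= MEMCG_BITS;
                *eviction = e;
        }

Round-tripping recovers the ids as long as each fits its field; dropping ZONES_SHIFT from the entry is what frees up bits, which is also why EVICTION_SHIFT shrinks in the hunk above.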
@@ -208,7 +205,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep, | |||
208 | void *workingset_eviction(struct address_space *mapping, struct page *page) | 205 | void *workingset_eviction(struct address_space *mapping, struct page *page) |
209 | { | 206 | { |
210 | struct mem_cgroup *memcg = page_memcg(page); | 207 | struct mem_cgroup *memcg = page_memcg(page); |
211 | struct zone *zone = page_zone(page); | 208 | struct pglist_data *pgdat = page_pgdat(page); |
212 | int memcgid = mem_cgroup_id(memcg); | 209 | int memcgid = mem_cgroup_id(memcg); |
213 | unsigned long eviction; | 210 | unsigned long eviction; |
214 | struct lruvec *lruvec; | 211 | struct lruvec *lruvec; |
@@ -218,9 +215,9 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) | |||
218 | VM_BUG_ON_PAGE(page_count(page), page); | 215 | VM_BUG_ON_PAGE(page_count(page), page); |
219 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 216 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
220 | 217 | ||
221 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 218 | lruvec = mem_cgroup_lruvec(pgdat, memcg); |
222 | eviction = atomic_long_inc_return(&lruvec->inactive_age); | 219 | eviction = atomic_long_inc_return(&lruvec->inactive_age); |
223 | return pack_shadow(memcgid, zone, eviction); | 220 | return pack_shadow(memcgid, pgdat, eviction); |
224 | } | 221 | } |
225 | 222 | ||
226 | /** | 223 | /** |
@@ -228,7 +225,7 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) | |||
228 | * @shadow: shadow entry of the evicted page | 225 | * @shadow: shadow entry of the evicted page |
229 | * | 226 | * |
230 | * Calculates and evaluates the refault distance of the previously | 227 | * Calculates and evaluates the refault distance of the previously |
231 | * evicted page in the context of the zone it was allocated in. | 228 | * evicted page in the context of the node it was allocated in. |
232 | * | 229 | * |
233 | * Returns %true if the page should be activated, %false otherwise. | 230 | * Returns %true if the page should be activated, %false otherwise. |
234 | */ | 231 | */ |
@@ -240,10 +237,10 @@ bool workingset_refault(void *shadow) | |||
240 | unsigned long eviction; | 237 | unsigned long eviction; |
241 | struct lruvec *lruvec; | 238 | struct lruvec *lruvec; |
242 | unsigned long refault; | 239 | unsigned long refault; |
243 | struct zone *zone; | 240 | struct pglist_data *pgdat; |
244 | int memcgid; | 241 | int memcgid; |
245 | 242 | ||
246 | unpack_shadow(shadow, &memcgid, &zone, &eviction); | 243 | unpack_shadow(shadow, &memcgid, &pgdat, &eviction); |
247 | 244 | ||
248 | rcu_read_lock(); | 245 | rcu_read_lock(); |
249 | /* | 246 | /* |
@@ -267,7 +264,7 @@ bool workingset_refault(void *shadow) | |||
267 | rcu_read_unlock(); | 264 | rcu_read_unlock(); |
268 | return false; | 265 | return false; |
269 | } | 266 | } |
270 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 267 | lruvec = mem_cgroup_lruvec(pgdat, memcg); |
271 | refault = atomic_long_read(&lruvec->inactive_age); | 268 | refault = atomic_long_read(&lruvec->inactive_age); |
272 | active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); | 269 | active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); |
273 | rcu_read_unlock(); | 270 | rcu_read_unlock(); |
@@ -290,10 +287,10 @@ bool workingset_refault(void *shadow) | |||
290 | */ | 287 | */ |
291 | refault_distance = (refault - eviction) & EVICTION_MASK; | 288 | refault_distance = (refault - eviction) & EVICTION_MASK; |
292 | 289 | ||
293 | inc_zone_state(zone, WORKINGSET_REFAULT); | 290 | inc_node_state(pgdat, WORKINGSET_REFAULT); |
294 | 291 | ||
295 | if (refault_distance <= active_file) { | 292 | if (refault_distance <= active_file) { |
296 | inc_zone_state(zone, WORKINGSET_ACTIVATE); | 293 | inc_node_state(pgdat, WORKINGSET_ACTIVATE); |
297 | return true; | 294 | return true; |
298 | } | 295 | } |
299 | return false; | 296 | return false; |
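workingset_refault() above now does its refault-distance test per node rather than per zone, but the arithmetic is unchanged: the distance is the number of aging events since eviction, taken modulo the usable counter bits so wraparound is harmless, and the page is activated when that distance fits within the active file list. In isolation, with illustrative parameter names:

        #include <stdbool.h>

        static bool should_activate(unsigned long refault, unsigned long eviction,
                                    unsigned long active_file, unsigned long eviction_mask)
        {
                /* The counter may wrap; the mask keeps the subtraction well-defined. */
                unsigned long refault_distance = (refault - eviction) & eviction_mask;

                return refault_distance <= active_file;
        }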
@@ -305,9 +302,10 @@ bool workingset_refault(void *shadow) | |||
305 | */ | 302 | */ |
306 | void workingset_activation(struct page *page) | 303 | void workingset_activation(struct page *page) |
307 | { | 304 | { |
305 | struct mem_cgroup *memcg; | ||
308 | struct lruvec *lruvec; | 306 | struct lruvec *lruvec; |
309 | 307 | ||
310 | lock_page_memcg(page); | 308 | rcu_read_lock(); |
311 | /* | 309 | /* |
312 | * Filter non-memcg pages here, e.g. unmap can call | 310 | * Filter non-memcg pages here, e.g. unmap can call |
313 | * mark_page_accessed() on VDSO pages. | 311 | * mark_page_accessed() on VDSO pages. |
@@ -315,12 +313,13 @@ void workingset_activation(struct page *page) | |||
315 | * XXX: See workingset_refault() - this should return | 313 | * XXX: See workingset_refault() - this should return |
316 | * root_mem_cgroup even for !CONFIG_MEMCG. | 314 | * root_mem_cgroup even for !CONFIG_MEMCG. |
317 | */ | 315 | */ |
318 | if (!mem_cgroup_disabled() && !page_memcg(page)) | 316 | memcg = page_memcg_rcu(page); |
317 | if (!mem_cgroup_disabled() && !memcg) | ||
319 | goto out; | 318 | goto out; |
320 | lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page)); | 319 | lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); |
321 | atomic_long_inc(&lruvec->inactive_age); | 320 | atomic_long_inc(&lruvec->inactive_age); |
322 | out: | 321 | out: |
323 | unlock_page_memcg(page); | 322 | rcu_read_unlock(); |
324 | } | 323 | } |
325 | 324 | ||
326 | /* | 325 | /* |
@@ -349,12 +348,13 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, | |||
349 | shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); | 348 | shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); |
350 | local_irq_enable(); | 349 | local_irq_enable(); |
351 | 350 | ||
352 | if (memcg_kmem_enabled()) | 351 | if (memcg_kmem_enabled()) { |
353 | pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, | 352 | pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, |
354 | LRU_ALL_FILE); | 353 | LRU_ALL_FILE); |
355 | else | 354 | } else { |
356 | pages = node_page_state(sc->nid, NR_ACTIVE_FILE) + | 355 | pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + |
357 | node_page_state(sc->nid, NR_INACTIVE_FILE); | 356 | node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); |
357 | } | ||
358 | 358 | ||
359 | /* | 359 | /* |
360 | * Active cache pages are limited to 50% of memory, and shadow | 360 | * Active cache pages are limited to 50% of memory, and shadow |
@@ -433,7 +433,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, | |||
433 | } | 433 | } |
434 | } | 434 | } |
435 | BUG_ON(node->count); | 435 | BUG_ON(node->count); |
436 | inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM); | 436 | inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); |
437 | if (!__radix_tree_delete_node(&mapping->page_tree, node)) | 437 | if (!__radix_tree_delete_node(&mapping->page_tree, node)) |
438 | BUG(); | 438 | BUG(); |
439 | 439 | ||
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 04176de6df70..b0bc023d25c5 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -20,6 +20,7 @@ | |||
20 | * page->freelist(index): links together all component pages of a zspage | 20 | * page->freelist(index): links together all component pages of a zspage |
21 | * For the huge page, this is always 0, so we use this field | 21 | * For the huge page, this is always 0, so we use this field |
22 | * to store handle. | 22 | * to store handle. |
23 | * page->units: first object offset in a subpage of zspage | ||
23 | * | 24 | * |
24 | * Usage of struct page flags: | 25 | * Usage of struct page flags: |
25 | * PG_private: identifies the first component page | 26 | * PG_private: identifies the first component page |
@@ -137,9 +138,6 @@ | |||
137 | */ | 138 | */ |
138 | #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS) | 139 | #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS) |
139 | 140 | ||
140 | /* | ||
141 | * We do not maintain any list for completely empty or full pages | ||
142 | */ | ||
143 | enum fullness_group { | 141 | enum fullness_group { |
144 | ZS_EMPTY, | 142 | ZS_EMPTY, |
145 | ZS_ALMOST_EMPTY, | 143 | ZS_ALMOST_EMPTY, |
@@ -467,11 +465,6 @@ static struct zpool_driver zs_zpool_driver = { | |||
467 | MODULE_ALIAS("zpool-zsmalloc"); | 465 | MODULE_ALIAS("zpool-zsmalloc"); |
468 | #endif /* CONFIG_ZPOOL */ | 466 | #endif /* CONFIG_ZPOOL */ |
469 | 467 | ||
470 | static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) | ||
471 | { | ||
472 | return pages_per_zspage * PAGE_SIZE / size; | ||
473 | } | ||
474 | |||
475 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ | 468 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ |
476 | static DEFINE_PER_CPU(struct mapping_area, zs_map_area); | 469 | static DEFINE_PER_CPU(struct mapping_area, zs_map_area); |
477 | 470 | ||
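get_maxobj_per_zspage() is removed above because the value it computed never changes for a given size class; the later hunks read a cached class->objs_per_zspage instead. The arithmetic itself is the same, e.g. a hypothetical 208-byte class spanning two 4 KiB pages holds 2 * 4096 / 208 = 39 objects:

        enum { PAGE_SIZE_SKETCH = 4096 };       /* illustrative, not the kernel macro */

        static unsigned int objs_per_zspage(int size, int pages_per_zspage)
        {
                return pages_per_zspage * PAGE_SIZE_SKETCH / size;      /* 208 -> 39 */
        }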
@@ -635,8 +628,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v) | |||
635 | freeable = zs_can_compact(class); | 628 | freeable = zs_can_compact(class); |
636 | spin_unlock(&class->lock); | 629 | spin_unlock(&class->lock); |
637 | 630 | ||
638 | objs_per_zspage = get_maxobj_per_zspage(class->size, | 631 | objs_per_zspage = class->objs_per_zspage; |
639 | class->pages_per_zspage); | ||
640 | pages_used = obj_allocated / objs_per_zspage * | 632 | pages_used = obj_allocated / objs_per_zspage * |
641 | class->pages_per_zspage; | 633 | class->pages_per_zspage; |
642 | 634 | ||
@@ -945,8 +937,8 @@ static void unpin_tag(unsigned long handle) | |||
945 | static void reset_page(struct page *page) | 937 | static void reset_page(struct page *page) |
946 | { | 938 | { |
947 | __ClearPageMovable(page); | 939 | __ClearPageMovable(page); |
948 | clear_bit(PG_private, &page->flags); | 940 | ClearPagePrivate(page); |
949 | clear_bit(PG_private_2, &page->flags); | 941 | ClearPagePrivate2(page); |
950 | set_page_private(page, 0); | 942 | set_page_private(page, 0); |
951 | page_mapcount_reset(page); | 943 | page_mapcount_reset(page); |
952 | ClearPageHugeObject(page); | 944 | ClearPageHugeObject(page); |
@@ -1014,8 +1006,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, | |||
1014 | 1006 | ||
1015 | cache_free_zspage(pool, zspage); | 1007 | cache_free_zspage(pool, zspage); |
1016 | 1008 | ||
1017 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | 1009 | zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); |
1018 | class->size, class->pages_per_zspage)); | ||
1019 | atomic_long_sub(class->pages_per_zspage, | 1010 | atomic_long_sub(class->pages_per_zspage, |
1020 | &pool->pages_allocated); | 1011 | &pool->pages_allocated); |
1021 | } | 1012 | } |
@@ -1350,7 +1341,7 @@ static void zs_unregister_cpu_notifier(void) | |||
1350 | cpu_notifier_register_done(); | 1341 | cpu_notifier_register_done(); |
1351 | } | 1342 | } |
1352 | 1343 | ||
1353 | static void init_zs_size_classes(void) | 1344 | static void __init init_zs_size_classes(void) |
1354 | { | 1345 | { |
1355 | int nr; | 1346 | int nr; |
1356 | 1347 | ||
@@ -1361,16 +1352,14 @@ static void init_zs_size_classes(void) | |||
1361 | zs_size_classes = nr; | 1352 | zs_size_classes = nr; |
1362 | } | 1353 | } |
1363 | 1354 | ||
1364 | static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) | 1355 | static bool can_merge(struct size_class *prev, int pages_per_zspage, |
1356 | int objs_per_zspage) | ||
1365 | { | 1357 | { |
1366 | if (prev->pages_per_zspage != pages_per_zspage) | 1358 | if (prev->pages_per_zspage == pages_per_zspage && |
1367 | return false; | 1359 | prev->objs_per_zspage == objs_per_zspage) |
1360 | return true; | ||
1368 | 1361 | ||
1369 | if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage) | 1362 | return false; |
1370 | != get_maxobj_per_zspage(size, pages_per_zspage)) | ||
1371 | return false; | ||
1372 | |||
1373 | return true; | ||
1374 | } | 1363 | } |
1375 | 1364 | ||
1376 | static bool zspage_full(struct size_class *class, struct zspage *zspage) | 1365 | static bool zspage_full(struct size_class *class, struct zspage *zspage) |
@@ -1541,6 +1530,7 @@ static unsigned long obj_malloc(struct size_class *class, | |||
1541 | * zs_malloc - Allocate block of given size from pool. | 1530 | * zs_malloc - Allocate block of given size from pool. |
1542 | * @pool: pool to allocate from | 1531 | * @pool: pool to allocate from |
1543 | * @size: size of block to allocate | 1532 | * @size: size of block to allocate |
1533 | * @gfp: gfp flags when allocating object | ||
1544 | * | 1534 | * |
1545 | * On success, handle to the allocated object is returned, | 1535 | * On success, handle to the allocated object is returned, |
1546 | * otherwise 0. | 1536 | * otherwise 0. |
@@ -1592,8 +1582,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) | |||
1592 | record_obj(handle, obj); | 1582 | record_obj(handle, obj); |
1593 | atomic_long_add(class->pages_per_zspage, | 1583 | atomic_long_add(class->pages_per_zspage, |
1594 | &pool->pages_allocated); | 1584 | &pool->pages_allocated); |
1595 | zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | 1585 | zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); |
1596 | class->size, class->pages_per_zspage)); | ||
1597 | 1586 | ||
1598 | /* We completely set up zspage so mark them as movable */ | 1587 | /* We completely set up zspage so mark them as movable */ |
1599 | SetZsPageMovable(pool, zspage); | 1588 | SetZsPageMovable(pool, zspage); |
@@ -1741,10 +1730,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, | |||
1741 | * return handle. | 1730 | * return handle. |
1742 | */ | 1731 | */ |
1743 | static unsigned long find_alloced_obj(struct size_class *class, | 1732 | static unsigned long find_alloced_obj(struct size_class *class, |
1744 | struct page *page, int index) | 1733 | struct page *page, int *obj_idx) |
1745 | { | 1734 | { |
1746 | unsigned long head; | 1735 | unsigned long head; |
1747 | int offset = 0; | 1736 | int offset = 0; |
1737 | int index = *obj_idx; | ||
1748 | unsigned long handle = 0; | 1738 | unsigned long handle = 0; |
1749 | void *addr = kmap_atomic(page); | 1739 | void *addr = kmap_atomic(page); |
1750 | 1740 | ||
@@ -1765,6 +1755,9 @@ static unsigned long find_alloced_obj(struct size_class *class, | |||
1765 | } | 1755 | } |
1766 | 1756 | ||
1767 | kunmap_atomic(addr); | 1757 | kunmap_atomic(addr); |
1758 | |||
1759 | *obj_idx = index; | ||
1760 | |||
1768 | return handle; | 1761 | return handle; |
1769 | } | 1762 | } |
1770 | 1763 | ||
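find_alloced_obj() now takes the object index by pointer: it starts scanning at *obj_idx and writes back the position where it stopped, so the caller's cursor survives across calls. A sketch of the resulting calling convention (variable names are illustrative):

    int obj_idx = 0;    /* start of the scan window within the page */
    unsigned long handle;

    handle = find_alloced_obj(class, page, &obj_idx);
    /* on return, obj_idx roughly points at the object that produced handle,
     * or past the last object in the page if nothing allocated was found */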
@@ -1776,7 +1769,7 @@ struct zs_compact_control { | |||
1776 | struct page *d_page; | 1769 | struct page *d_page; |
1777 | /* Starting object index within @s_page which used for live object | 1770 | /* Starting object index within @s_page which used for live object |
1778 | * in the subpage. */ | 1771 | * in the subpage. */ |
1779 | int index; | 1772 | int obj_idx; |
1780 | }; | 1773 | }; |
1781 | 1774 | ||
1782 | static int migrate_zspage(struct zs_pool *pool, struct size_class *class, | 1775 | static int migrate_zspage(struct zs_pool *pool, struct size_class *class, |
@@ -1786,16 +1779,16 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, | |||
1786 | unsigned long handle; | 1779 | unsigned long handle; |
1787 | struct page *s_page = cc->s_page; | 1780 | struct page *s_page = cc->s_page; |
1788 | struct page *d_page = cc->d_page; | 1781 | struct page *d_page = cc->d_page; |
1789 | unsigned long index = cc->index; | 1782 | int obj_idx = cc->obj_idx; |
1790 | int ret = 0; | 1783 | int ret = 0; |
1791 | 1784 | ||
1792 | while (1) { | 1785 | while (1) { |
1793 | handle = find_alloced_obj(class, s_page, index); | 1786 | handle = find_alloced_obj(class, s_page, &obj_idx); |
1794 | if (!handle) { | 1787 | if (!handle) { |
1795 | s_page = get_next_page(s_page); | 1788 | s_page = get_next_page(s_page); |
1796 | if (!s_page) | 1789 | if (!s_page) |
1797 | break; | 1790 | break; |
1798 | index = 0; | 1791 | obj_idx = 0; |
1799 | continue; | 1792 | continue; |
1800 | } | 1793 | } |
1801 | 1794 | ||
@@ -1809,7 +1802,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, | |||
1809 | used_obj = handle_to_obj(handle); | 1802 | used_obj = handle_to_obj(handle); |
1810 | free_obj = obj_malloc(class, get_zspage(d_page), handle); | 1803 | free_obj = obj_malloc(class, get_zspage(d_page), handle); |
1811 | zs_object_copy(class, free_obj, used_obj); | 1804 | zs_object_copy(class, free_obj, used_obj); |
1812 | index++; | 1805 | obj_idx++; |
1813 | /* | 1806 | /* |
1814 | * record_obj updates handle's value to free_obj and it will | 1807 | * record_obj updates handle's value to free_obj and it will |
1815 | * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which | 1808 | * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which |
@@ -1824,7 +1817,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, | |||
1824 | 1817 | ||
1825 | /* Remember last position in this iteration */ | 1818 | /* Remember last position in this iteration */ |
1826 | cc->s_page = s_page; | 1819 | cc->s_page = s_page; |
1827 | cc->index = index; | 1820 | cc->obj_idx = obj_idx; |
1828 | 1821 | ||
1829 | return ret; | 1822 | return ret; |
1830 | } | 1823 | } |
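Taken together, the migrate_zspage() hunks keep a single cursor, cc->obj_idx, across source pages. Condensed skeleton of the loop after the rename (pinning, copying and error handling elided):

    obj_idx = cc->obj_idx;
    while (1) {
            handle = find_alloced_obj(class, s_page, &obj_idx);
            if (!handle) {
                    s_page = get_next_page(s_page);
                    if (!s_page)
                            break;
                    obj_idx = 0;    /* restart the scan on the next page */
                    continue;
            }
            /* ... migrate the object into d_page ... */
            obj_idx++;              /* move past the object just migrated */
    }
    cc->obj_idx = obj_idx;          /* remember where to resume next time */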
@@ -2181,8 +2174,7 @@ static int zs_register_migration(struct zs_pool *pool) | |||
2181 | static void zs_unregister_migration(struct zs_pool *pool) | 2174 | static void zs_unregister_migration(struct zs_pool *pool) |
2182 | { | 2175 | { |
2183 | flush_work(&pool->free_work); | 2176 | flush_work(&pool->free_work); |
2184 | if (pool->inode) | 2177 | iput(pool->inode); |
2185 | iput(pool->inode); | ||
2186 | } | 2178 | } |
2187 | 2179 | ||
2188 | /* | 2180 | /* |
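The zs_unregister_migration() hunk drops the NULL check because iput() itself returns immediately for a NULL inode, so the call is safe even if the pool's inode was never set up:

    flush_work(&pool->free_work);
    iput(pool->inode);      /* iput(NULL) is a no-op, no guard needed */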
@@ -2261,8 +2253,7 @@ static unsigned long zs_can_compact(struct size_class *class) | |||
2261 | return 0; | 2253 | return 0; |
2262 | 2254 | ||
2263 | obj_wasted = obj_allocated - obj_used; | 2255 | obj_wasted = obj_allocated - obj_used; |
2264 | obj_wasted /= get_maxobj_per_zspage(class->size, | 2256 | obj_wasted /= class->objs_per_zspage; |
2265 | class->pages_per_zspage); | ||
2266 | 2257 | ||
2267 | return obj_wasted * class->pages_per_zspage; | 2258 | return obj_wasted * class->pages_per_zspage; |
2268 | } | 2259 | } |
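zs_can_compact() now divides by the cached object count as well. With purely illustrative numbers (objs_per_zspage = 8, pages_per_zspage = 2, 100 objects allocated but only 60 in use), the estimate works out as in this sketch:

    /* Restatement of the estimate above:
     *   obj_wasted = 100 - 60 = 40 dead object slots
     *   40 / 8 = 5 whole zspages of waste
     *   5 * 2  = 10 pages that compaction could free at most */
    static unsigned long pages_reclaimable(unsigned long obj_allocated,
                                           unsigned long obj_used,
                                           int objs_per_zspage,
                                           int pages_per_zspage)
    {
            unsigned long obj_wasted = obj_allocated - obj_used;

            obj_wasted /= objs_per_zspage;
            return obj_wasted * pages_per_zspage;
    }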
@@ -2279,7 +2270,7 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) | |||
2279 | if (!zs_can_compact(class)) | 2270 | if (!zs_can_compact(class)) |
2280 | break; | 2271 | break; |
2281 | 2272 | ||
2282 | cc.index = 0; | 2273 | cc.obj_idx = 0; |
2283 | cc.s_page = get_first_page(src_zspage); | 2274 | cc.s_page = get_first_page(src_zspage); |
2284 | 2275 | ||
2285 | while ((dst_zspage = isolate_zspage(class, false))) { | 2276 | while ((dst_zspage = isolate_zspage(class, false))) { |
@@ -2398,7 +2389,7 @@ static int zs_register_shrinker(struct zs_pool *pool) | |||
2398 | 2389 | ||
2399 | /** | 2390 | /** |
2400 | * zs_create_pool - Creates an allocation pool to work from. | 2391 | * zs_create_pool - Creates an allocation pool to work from. |
2401 | * @flags: allocation flags used to allocate pool metadata | 2392 | * @name: pool name to be created |
2402 | * | 2393 | * |
2403 | * This function must be called before anything when using | 2394 | * This function must be called before anything when using |
2404 | * the zsmalloc allocator. | 2395 | * the zsmalloc allocator. |
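The kernel-doc fixes above describe the exported API as it stands after this series: zs_create_pool() takes only a name and zs_malloc() takes the gfp flags per allocation. A minimal usage sketch, assuming a sleepable caller; the pool name is arbitrary and error handling is trimmed:

    struct zs_pool *pool;
    unsigned long handle;

    pool = zs_create_pool("example");           /* name chosen for illustration */
    if (!pool)
            return -ENOMEM;

    handle = zs_malloc(pool, 128, GFP_KERNEL);  /* gfp now passed per call */
    if (!handle) {
            zs_destroy_pool(pool);
            return -ENOMEM;
    }

    zs_free(pool, handle);
    zs_destroy_pool(pool);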
@@ -2438,6 +2429,7 @@ struct zs_pool *zs_create_pool(const char *name) | |||
2438 | for (i = zs_size_classes - 1; i >= 0; i--) { | 2429 | for (i = zs_size_classes - 1; i >= 0; i--) { |
2439 | int size; | 2430 | int size; |
2440 | int pages_per_zspage; | 2431 | int pages_per_zspage; |
2432 | int objs_per_zspage; | ||
2441 | struct size_class *class; | 2433 | struct size_class *class; |
2442 | int fullness = 0; | 2434 | int fullness = 0; |
2443 | 2435 | ||
@@ -2445,6 +2437,7 @@ struct zs_pool *zs_create_pool(const char *name) | |||
2445 | if (size > ZS_MAX_ALLOC_SIZE) | 2437 | if (size > ZS_MAX_ALLOC_SIZE) |
2446 | size = ZS_MAX_ALLOC_SIZE; | 2438 | size = ZS_MAX_ALLOC_SIZE; |
2447 | pages_per_zspage = get_pages_per_zspage(size); | 2439 | pages_per_zspage = get_pages_per_zspage(size); |
2440 | objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; | ||
2448 | 2441 | ||
2449 | /* | 2442 | /* |
2450 | * size_class is used for normal zsmalloc operation such | 2443 | * size_class is used for normal zsmalloc operation such |
@@ -2456,7 +2449,7 @@ struct zs_pool *zs_create_pool(const char *name) | |||
2456 | * previous size_class if possible. | 2449 | * previous size_class if possible. |
2457 | */ | 2450 | */ |
2458 | if (prev_class) { | 2451 | if (prev_class) { |
2459 | if (can_merge(prev_class, size, pages_per_zspage)) { | 2452 | if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) { |
2460 | pool->size_class[i] = prev_class; | 2453 | pool->size_class[i] = prev_class; |
2461 | continue; | 2454 | continue; |
2462 | } | 2455 | } |
@@ -2469,8 +2462,7 @@ struct zs_pool *zs_create_pool(const char *name) | |||
2469 | class->size = size; | 2462 | class->size = size; |
2470 | class->index = i; | 2463 | class->index = i; |
2471 | class->pages_per_zspage = pages_per_zspage; | 2464 | class->pages_per_zspage = pages_per_zspage; |
2472 | class->objs_per_zspage = class->pages_per_zspage * | 2465 | class->objs_per_zspage = objs_per_zspage; |
2473 | PAGE_SIZE / class->size; | ||
2474 | spin_lock_init(&class->lock); | 2466 | spin_lock_init(&class->lock); |
2475 | pool->size_class[i] = class; | 2467 | pool->size_class[i] = class; |
2476 | for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS; | 2468 | for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS; |
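Putting the zs_create_pool() hunks together: the per-class setup now computes objs_per_zspage exactly once and reuses it both for the merge test and for the cached field. A condensed sketch of that part of the loop (allocation and error paths omitted):

    pages_per_zspage = get_pages_per_zspage(size);
    objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;

    if (prev_class &&
        can_merge(prev_class, pages_per_zspage, objs_per_zspage)) {
            pool->size_class[i] = prev_class;   /* share the previous class */
            continue;
    }

    class->size = size;
    class->pages_per_zspage = pages_per_zspage;
    class->objs_per_zspage = objs_per_zspage;   /* cached for all later call sites */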
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index b1d491c2e704..fdde1bd3e306 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c | |||
@@ -608,6 +608,7 @@ static const struct { | |||
608 | const char *compact; | 608 | const char *compact; |
609 | } gfp_compact_table[] = { | 609 | } gfp_compact_table[] = { |
610 | { "GFP_TRANSHUGE", "THP" }, | 610 | { "GFP_TRANSHUGE", "THP" }, |
611 | { "GFP_TRANSHUGE_LIGHT", "THL" }, | ||
611 | { "GFP_HIGHUSER_MOVABLE", "HUM" }, | 612 | { "GFP_HIGHUSER_MOVABLE", "HUM" }, |
612 | { "GFP_HIGHUSER", "HU" }, | 613 | { "GFP_HIGHUSER", "HU" }, |
613 | { "GFP_USER", "U" }, | 614 | { "GFP_USER", "U" }, |
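The perf-kmem hunk simply teaches the GFP pretty-printer about GFP_TRANSHUGE_LIGHT, abbreviated "THL" next to the existing "THP" entry. A hypothetical lookup over that table might look like the sketch below; the name of the first struct field is assumed, since only the compact member is visible in this hunk:

    /* Assumed field name "original"; only "compact" appears in the hunk above. */
    for (i = 0; i < ARRAY_SIZE(gfp_compact_table); i++) {
            if (!strcmp(flag_name, gfp_compact_table[i].original))
                    return gfp_compact_table[i].compact;
    }
    return flag_name;       /* fall back to the verbose flag name */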