diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 6 | ||||
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/backing-dev.c | 8 | ||||
-rw-r--r-- | mm/bootmem.c | 8 | ||||
-rw-r--r-- | mm/filemap.c | 32 | ||||
-rw-r--r-- | mm/filemap_xip.c | 2 | ||||
-rw-r--r-- | mm/fremap.c | 2 | ||||
-rw-r--r-- | mm/hugetlb.c | 46 | ||||
-rw-r--r-- | mm/internal.h | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 1847 | ||||
-rw-r--r-- | mm/memory.c | 204 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 20 | ||||
-rw-r--r-- | mm/migrate.c | 131 | ||||
-rw-r--r-- | mm/mlock.c | 9 | ||||
-rw-r--r-- | mm/mmap.c | 32 | ||||
-rw-r--r-- | mm/mprotect.c | 6 | ||||
-rw-r--r-- | mm/nommu.c | 1027 | ||||
-rw-r--r-- | mm/oom_kill.c | 119 | ||||
-rw-r--r-- | mm/page-writeback.c | 245 | ||||
-rw-r--r-- | mm/page_alloc.c | 143 | ||||
-rw-r--r-- | mm/page_cgroup.c | 209 | ||||
-rw-r--r-- | mm/page_io.c | 6 | ||||
-rw-r--r-- | mm/rmap.c | 60 | ||||
-rw-r--r-- | mm/shmem.c | 102 | ||||
-rw-r--r-- | mm/slub.c | 2 | ||||
-rw-r--r-- | mm/swap.c | 77 | ||||
-rw-r--r-- | mm/swap_state.c | 35 | ||||
-rw-r--r-- | mm/swapfile.c | 600 | ||||
-rw-r--r-- | mm/tiny-shmem.c | 134 | ||||
-rw-r--r-- | mm/vmalloc.c | 50 | ||||
-rw-r--r-- | mm/vmscan.c | 324 |
31 files changed, 3801 insertions, 1691 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 5b5790f8a816..a5b77811fdf2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -181,12 +181,6 @@ config MIGRATION | |||
181 | example on NUMA systems to put pages nearer to the processors accessing | 181 | example on NUMA systems to put pages nearer to the processors accessing |
182 | the page. | 182 | the page. |
183 | 183 | ||
184 | config RESOURCES_64BIT | ||
185 | bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL) | ||
186 | default 64BIT | ||
187 | help | ||
188 | This option allows memory and IO resources to be 64 bit. | ||
189 | |||
190 | config PHYS_ADDR_T_64BIT | 184 | config PHYS_ADDR_T_64BIT |
191 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT | 185 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT |
192 | 186 | ||
diff --git a/mm/Makefile b/mm/Makefile index 51c27709cc7c..72255be57f89 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | |||
9 | 9 | ||
10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ |
11 | maccess.o page_alloc.o page-writeback.o pdflush.o \ | 11 | maccess.o page_alloc.o page-writeback.o pdflush.o \ |
12 | readahead.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
14 | page_isolation.o mm_init.o $(mmu-y) | 14 | page_isolation.o mm_init.o $(mmu-y) |
15 | 15 | ||
@@ -21,9 +21,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o | |||
21 | obj-$(CONFIG_NUMA) += mempolicy.o | 21 | obj-$(CONFIG_NUMA) += mempolicy.o |
22 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 22 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
23 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | 23 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o |
24 | obj-$(CONFIG_SHMEM) += shmem.o | ||
25 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | 24 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o |
26 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | ||
27 | obj-$(CONFIG_SLOB) += slob.o | 25 | obj-$(CONFIG_SLOB) += slob.o |
28 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | 26 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o |
29 | obj-$(CONFIG_SLAB) += slab.o | 27 | obj-$(CONFIG_SLAB) += slab.o |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 801c08b046e6..8e8587444132 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -24,9 +24,9 @@ static void bdi_debug_init(void) | |||
24 | static int bdi_debug_stats_show(struct seq_file *m, void *v) | 24 | static int bdi_debug_stats_show(struct seq_file *m, void *v) |
25 | { | 25 | { |
26 | struct backing_dev_info *bdi = m->private; | 26 | struct backing_dev_info *bdi = m->private; |
27 | long background_thresh; | 27 | unsigned long background_thresh; |
28 | long dirty_thresh; | 28 | unsigned long dirty_thresh; |
29 | long bdi_thresh; | 29 | unsigned long bdi_thresh; |
30 | 30 | ||
31 | get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); | 31 | get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); |
32 | 32 | ||
@@ -223,7 +223,7 @@ int bdi_init(struct backing_dev_info *bdi) | |||
223 | bdi->max_prop_frac = PROP_FRAC_BASE; | 223 | bdi->max_prop_frac = PROP_FRAC_BASE; |
224 | 224 | ||
225 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { | 225 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { |
226 | err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0); | 226 | err = percpu_counter_init(&bdi->bdi_stat[i], 0); |
227 | if (err) | 227 | if (err) |
228 | goto err; | 228 | goto err; |
229 | } | 229 | } |
diff --git a/mm/bootmem.c b/mm/bootmem.c index ac5a891f142a..51a0ccf61e0e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -435,6 +435,10 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | |||
435 | unsigned long fallback = 0; | 435 | unsigned long fallback = 0; |
436 | unsigned long min, max, start, sidx, midx, step; | 436 | unsigned long min, max, start, sidx, midx, step; |
437 | 437 | ||
438 | bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", | ||
439 | bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, | ||
440 | align, goal, limit); | ||
441 | |||
438 | BUG_ON(!size); | 442 | BUG_ON(!size); |
439 | BUG_ON(align & (align - 1)); | 443 | BUG_ON(align & (align - 1)); |
440 | BUG_ON(limit && goal + size > limit); | 444 | BUG_ON(limit && goal + size > limit); |
@@ -442,10 +446,6 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | |||
442 | if (!bdata->node_bootmem_map) | 446 | if (!bdata->node_bootmem_map) |
443 | return NULL; | 447 | return NULL; |
444 | 448 | ||
445 | bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", | ||
446 | bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, | ||
447 | align, goal, limit); | ||
448 | |||
449 | min = bdata->node_min_pfn; | 449 | min = bdata->node_min_pfn; |
450 | max = bdata->node_low_pfn; | 450 | max = bdata->node_low_pfn; |
451 | 451 | ||
diff --git a/mm/filemap.c b/mm/filemap.c index f5769b4dc075..ceba0bd03662 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | |||
210 | int ret; | 210 | int ret; |
211 | struct writeback_control wbc = { | 211 | struct writeback_control wbc = { |
212 | .sync_mode = sync_mode, | 212 | .sync_mode = sync_mode, |
213 | .nr_to_write = mapping->nrpages * 2, | 213 | .nr_to_write = LONG_MAX, |
214 | .range_start = start, | 214 | .range_start = start, |
215 | .range_end = end, | 215 | .range_end = end, |
216 | }; | 216 | }; |
@@ -460,7 +460,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
460 | VM_BUG_ON(!PageLocked(page)); | 460 | VM_BUG_ON(!PageLocked(page)); |
461 | 461 | ||
462 | error = mem_cgroup_cache_charge(page, current->mm, | 462 | error = mem_cgroup_cache_charge(page, current->mm, |
463 | gfp_mask & ~__GFP_HIGHMEM); | 463 | gfp_mask & GFP_RECLAIM_MASK); |
464 | if (error) | 464 | if (error) |
465 | goto out; | 465 | goto out; |
466 | 466 | ||
@@ -741,7 +741,14 @@ repeat: | |||
741 | page = __page_cache_alloc(gfp_mask); | 741 | page = __page_cache_alloc(gfp_mask); |
742 | if (!page) | 742 | if (!page) |
743 | return NULL; | 743 | return NULL; |
744 | err = add_to_page_cache_lru(page, mapping, index, gfp_mask); | 744 | /* |
745 | * We want a regular kernel memory (not highmem or DMA etc) | ||
746 | * allocation for the radix tree nodes, but we need to honour | ||
747 | * the context-specific requirements the caller has asked for. | ||
748 | * GFP_RECLAIM_MASK collects those requirements. | ||
749 | */ | ||
750 | err = add_to_page_cache_lru(page, mapping, index, | ||
751 | (gfp_mask & GFP_RECLAIM_MASK)); | ||
745 | if (unlikely(err)) { | 752 | if (unlikely(err)) { |
746 | page_cache_release(page); | 753 | page_cache_release(page); |
747 | page = NULL; | 754 | page = NULL; |
@@ -950,7 +957,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) | |||
950 | return NULL; | 957 | return NULL; |
951 | } | 958 | } |
952 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); | 959 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); |
953 | if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { | 960 | if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { |
954 | page_cache_release(page); | 961 | page_cache_release(page); |
955 | page = NULL; | 962 | page = NULL; |
956 | } | 963 | } |
@@ -1317,7 +1324,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1317 | goto out; /* skip atime */ | 1324 | goto out; /* skip atime */ |
1318 | size = i_size_read(inode); | 1325 | size = i_size_read(inode); |
1319 | if (pos < size) { | 1326 | if (pos < size) { |
1320 | retval = filemap_write_and_wait(mapping); | 1327 | retval = filemap_write_and_wait_range(mapping, pos, |
1328 | pos + iov_length(iov, nr_segs) - 1); | ||
1321 | if (!retval) { | 1329 | if (!retval) { |
1322 | retval = mapping->a_ops->direct_IO(READ, iocb, | 1330 | retval = mapping->a_ops->direct_IO(READ, iocb, |
1323 | iov, pos, nr_segs); | 1331 | iov, pos, nr_segs); |
@@ -1530,7 +1538,6 @@ retry_find: | |||
1530 | /* | 1538 | /* |
1531 | * Found the page and have a reference on it. | 1539 | * Found the page and have a reference on it. |
1532 | */ | 1540 | */ |
1533 | mark_page_accessed(page); | ||
1534 | ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; | 1541 | ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; |
1535 | vmf->page = page; | 1542 | vmf->page = page; |
1536 | return ret | VM_FAULT_LOCKED; | 1543 | return ret | VM_FAULT_LOCKED; |
@@ -2060,18 +2067,10 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
2060 | if (count != ocount) | 2067 | if (count != ocount) |
2061 | *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); | 2068 | *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); |
2062 | 2069 | ||
2063 | /* | ||
2064 | * Unmap all mmappings of the file up-front. | ||
2065 | * | ||
2066 | * This will cause any pte dirty bits to be propagated into the | ||
2067 | * pageframes for the subsequent filemap_write_and_wait(). | ||
2068 | */ | ||
2069 | write_len = iov_length(iov, *nr_segs); | 2070 | write_len = iov_length(iov, *nr_segs); |
2070 | end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; | 2071 | end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; |
2071 | if (mapping_mapped(mapping)) | ||
2072 | unmap_mapping_range(mapping, pos, write_len, 0); | ||
2073 | 2072 | ||
2074 | written = filemap_write_and_wait(mapping); | 2073 | written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); |
2075 | if (written) | 2074 | if (written) |
2076 | goto out; | 2075 | goto out; |
2077 | 2076 | ||
@@ -2291,7 +2290,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2291 | * the file data here, to try to honour O_DIRECT expectations. | 2290 | * the file data here, to try to honour O_DIRECT expectations. |
2292 | */ | 2291 | */ |
2293 | if (unlikely(file->f_flags & O_DIRECT) && written) | 2292 | if (unlikely(file->f_flags & O_DIRECT) && written) |
2294 | status = filemap_write_and_wait(mapping); | 2293 | status = filemap_write_and_wait_range(mapping, |
2294 | pos, pos + written - 1); | ||
2295 | 2295 | ||
2296 | return written ? written : status; | 2296 | return written ? written : status; |
2297 | } | 2297 | } |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index b5167dfb2f2d..0c04615651b7 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -193,7 +193,7 @@ retry: | |||
193 | /* Nuke the page table entry. */ | 193 | /* Nuke the page table entry. */ |
194 | flush_cache_page(vma, address, pte_pfn(*pte)); | 194 | flush_cache_page(vma, address, pte_pfn(*pte)); |
195 | pteval = ptep_clear_flush_notify(vma, address, pte); | 195 | pteval = ptep_clear_flush_notify(vma, address, pte); |
196 | page_remove_rmap(page, vma); | 196 | page_remove_rmap(page); |
197 | dec_mm_counter(mm, file_rss); | 197 | dec_mm_counter(mm, file_rss); |
198 | BUG_ON(pte_dirty(pteval)); | 198 | BUG_ON(pte_dirty(pteval)); |
199 | pte_unmap_unlock(pte, ptl); | 199 | pte_unmap_unlock(pte, ptl); |
diff --git a/mm/fremap.c b/mm/fremap.c index 7d12ca70ef7b..62d5bbda921a 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
37 | if (page) { | 37 | if (page) { |
38 | if (pte_dirty(pte)) | 38 | if (pte_dirty(pte)) |
39 | set_page_dirty(page); | 39 | set_page_dirty(page); |
40 | page_remove_rmap(page, vma); | 40 | page_remove_rmap(page); |
41 | page_cache_release(page); | 41 | page_cache_release(page); |
42 | update_hiwater_rss(mm); | 42 | update_hiwater_rss(mm); |
43 | dec_mm_counter(mm, file_rss); | 43 | dec_mm_counter(mm, file_rss); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6058b53dcb89..618e98304080 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -220,6 +220,35 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, | |||
220 | } | 220 | } |
221 | 221 | ||
222 | /* | 222 | /* |
223 | * Return the size of the pages allocated when backing a VMA. In the majority | ||
224 | * cases this will be same size as used by the page table entries. | ||
225 | */ | ||
226 | unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) | ||
227 | { | ||
228 | struct hstate *hstate; | ||
229 | |||
230 | if (!is_vm_hugetlb_page(vma)) | ||
231 | return PAGE_SIZE; | ||
232 | |||
233 | hstate = hstate_vma(vma); | ||
234 | |||
235 | return 1UL << (hstate->order + PAGE_SHIFT); | ||
236 | } | ||
237 | |||
238 | /* | ||
239 | * Return the page size being used by the MMU to back a VMA. In the majority | ||
240 | * of cases, the page size used by the kernel matches the MMU size. On | ||
241 | * architectures where it differs, an architecture-specific version of this | ||
242 | * function is required. | ||
243 | */ | ||
244 | #ifndef vma_mmu_pagesize | ||
245 | unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) | ||
246 | { | ||
247 | return vma_kernel_pagesize(vma); | ||
248 | } | ||
249 | #endif | ||
250 | |||
251 | /* | ||
223 | * Flags for MAP_PRIVATE reservations. These are stored in the bottom | 252 | * Flags for MAP_PRIVATE reservations. These are stored in the bottom |
224 | * bits of the reservation map pointer, which are always clear due to | 253 | * bits of the reservation map pointer, which are always clear due to |
225 | * alignment. | 254 | * alignment. |
@@ -371,8 +400,10 @@ static void clear_huge_page(struct page *page, | |||
371 | { | 400 | { |
372 | int i; | 401 | int i; |
373 | 402 | ||
374 | if (unlikely(sz > MAX_ORDER_NR_PAGES)) | 403 | if (unlikely(sz > MAX_ORDER_NR_PAGES)) { |
375 | return clear_gigantic_page(page, addr, sz); | 404 | clear_gigantic_page(page, addr, sz); |
405 | return; | ||
406 | } | ||
376 | 407 | ||
377 | might_sleep(); | 408 | might_sleep(); |
378 | for (i = 0; i < sz/PAGE_SIZE; i++) { | 409 | for (i = 0; i < sz/PAGE_SIZE; i++) { |
@@ -404,8 +435,10 @@ static void copy_huge_page(struct page *dst, struct page *src, | |||
404 | int i; | 435 | int i; |
405 | struct hstate *h = hstate_vma(vma); | 436 | struct hstate *h = hstate_vma(vma); |
406 | 437 | ||
407 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) | 438 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { |
408 | return copy_gigantic_page(dst, src, addr, vma); | 439 | copy_gigantic_page(dst, src, addr, vma); |
440 | return; | ||
441 | } | ||
409 | 442 | ||
410 | might_sleep(); | 443 | might_sleep(); |
411 | for (i = 0; i < pages_per_huge_page(h); i++) { | 444 | for (i = 0; i < pages_per_huge_page(h); i++) { |
@@ -972,7 +1005,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
972 | return page; | 1005 | return page; |
973 | } | 1006 | } |
974 | 1007 | ||
975 | __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) | 1008 | int __weak alloc_bootmem_huge_page(struct hstate *h) |
976 | { | 1009 | { |
977 | struct huge_bootmem_page *m; | 1010 | struct huge_bootmem_page *m; |
978 | int nr_nodes = nodes_weight(node_online_map); | 1011 | int nr_nodes = nodes_weight(node_online_map); |
@@ -991,8 +1024,7 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) | |||
991 | * puts them into the mem_map). | 1024 | * puts them into the mem_map). |
992 | */ | 1025 | */ |
993 | m = addr; | 1026 | m = addr; |
994 | if (m) | 1027 | goto found; |
995 | goto found; | ||
996 | } | 1028 | } |
997 | hstate_next_node(h); | 1029 | hstate_next_node(h); |
998 | nr_nodes--; | 1030 | nr_nodes--; |
diff --git a/mm/internal.h b/mm/internal.h index 13333bc2eb68..478223b73a2a 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -49,6 +49,7 @@ extern void putback_lru_page(struct page *page); | |||
49 | /* | 49 | /* |
50 | * in mm/page_alloc.c | 50 | * in mm/page_alloc.c |
51 | */ | 51 | */ |
52 | extern unsigned long highest_memmap_pfn; | ||
52 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 53 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
53 | 54 | ||
54 | /* | 55 | /* |
@@ -275,6 +276,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | |||
275 | #define GUP_FLAGS_WRITE 0x1 | 276 | #define GUP_FLAGS_WRITE 0x1 |
276 | #define GUP_FLAGS_FORCE 0x2 | 277 | #define GUP_FLAGS_FORCE 0x2 |
277 | #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 | 278 | #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 |
279 | #define GUP_FLAGS_IGNORE_SIGKILL 0x8 | ||
278 | 280 | ||
279 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 281 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
280 | unsigned long start, int len, int flags, | 282 | unsigned long start, int len, int flags, |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 866dcc7eeb0c..e2996b80601f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -21,11 +21,13 @@ | |||
21 | #include <linux/memcontrol.h> | 21 | #include <linux/memcontrol.h> |
22 | #include <linux/cgroup.h> | 22 | #include <linux/cgroup.h> |
23 | #include <linux/mm.h> | 23 | #include <linux/mm.h> |
24 | #include <linux/pagemap.h> | ||
24 | #include <linux/smp.h> | 25 | #include <linux/smp.h> |
25 | #include <linux/page-flags.h> | 26 | #include <linux/page-flags.h> |
26 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
27 | #include <linux/bit_spinlock.h> | 28 | #include <linux/bit_spinlock.h> |
28 | #include <linux/rcupdate.h> | 29 | #include <linux/rcupdate.h> |
30 | #include <linux/mutex.h> | ||
29 | #include <linux/slab.h> | 31 | #include <linux/slab.h> |
30 | #include <linux/swap.h> | 32 | #include <linux/swap.h> |
31 | #include <linux/spinlock.h> | 33 | #include <linux/spinlock.h> |
@@ -34,12 +36,23 @@ | |||
34 | #include <linux/vmalloc.h> | 36 | #include <linux/vmalloc.h> |
35 | #include <linux/mm_inline.h> | 37 | #include <linux/mm_inline.h> |
36 | #include <linux/page_cgroup.h> | 38 | #include <linux/page_cgroup.h> |
39 | #include "internal.h" | ||
37 | 40 | ||
38 | #include <asm/uaccess.h> | 41 | #include <asm/uaccess.h> |
39 | 42 | ||
40 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 43 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
41 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 44 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
42 | 45 | ||
46 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
47 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */ | ||
48 | int do_swap_account __read_mostly; | ||
49 | static int really_do_swap_account __initdata = 1; /* remembers the boot option */ | ||
50 | #else | ||
51 | #define do_swap_account (0) | ||
52 | #endif | ||
53 | |||
54 | static DEFINE_MUTEX(memcg_tasklist); /* can be held under cgroup_mutex */ | ||
55 | |||
43 | /* | 56 | /* |
44 | * Statistics for memory cgroup. | 57 | * Statistics for memory cgroup. |
45 | */ | 58 | */ |
@@ -60,7 +73,7 @@ struct mem_cgroup_stat_cpu { | |||
60 | } ____cacheline_aligned_in_smp; | 73 | } ____cacheline_aligned_in_smp; |
61 | 74 | ||
62 | struct mem_cgroup_stat { | 75 | struct mem_cgroup_stat { |
63 | struct mem_cgroup_stat_cpu cpustat[NR_CPUS]; | 76 | struct mem_cgroup_stat_cpu cpustat[0]; |
64 | }; | 77 | }; |
65 | 78 | ||
66 | /* | 79 | /* |
@@ -89,9 +102,10 @@ struct mem_cgroup_per_zone { | |||
89 | /* | 102 | /* |
90 | * spin_lock to protect the per cgroup LRU | 103 | * spin_lock to protect the per cgroup LRU |
91 | */ | 104 | */ |
92 | spinlock_t lru_lock; | ||
93 | struct list_head lists[NR_LRU_LISTS]; | 105 | struct list_head lists[NR_LRU_LISTS]; |
94 | unsigned long count[NR_LRU_LISTS]; | 106 | unsigned long count[NR_LRU_LISTS]; |
107 | |||
108 | struct zone_reclaim_stat reclaim_stat; | ||
95 | }; | 109 | }; |
96 | /* Macro for accessing counter */ | 110 | /* Macro for accessing counter */ |
97 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | 111 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) |
@@ -122,44 +136,73 @@ struct mem_cgroup { | |||
122 | */ | 136 | */ |
123 | struct res_counter res; | 137 | struct res_counter res; |
124 | /* | 138 | /* |
139 | * the counter to account for mem+swap usage. | ||
140 | */ | ||
141 | struct res_counter memsw; | ||
142 | /* | ||
125 | * Per cgroup active and inactive list, similar to the | 143 | * Per cgroup active and inactive list, similar to the |
126 | * per zone LRU lists. | 144 | * per zone LRU lists. |
127 | */ | 145 | */ |
128 | struct mem_cgroup_lru_info info; | 146 | struct mem_cgroup_lru_info info; |
129 | 147 | ||
148 | /* | ||
149 | protect against reclaim related member. | ||
150 | */ | ||
151 | spinlock_t reclaim_param_lock; | ||
152 | |||
130 | int prev_priority; /* for recording reclaim priority */ | 153 | int prev_priority; /* for recording reclaim priority */ |
154 | |||
155 | /* | ||
156 | * While reclaiming in a hierarchy, we cache the last child we | ||
157 | * reclaimed from. Protected by hierarchy_mutex | ||
158 | */ | ||
159 | struct mem_cgroup *last_scanned_child; | ||
131 | /* | 160 | /* |
132 | * statistics. | 161 | * Should the accounting and control be hierarchical, per subtree? |
162 | */ | ||
163 | bool use_hierarchy; | ||
164 | unsigned long last_oom_jiffies; | ||
165 | atomic_t refcnt; | ||
166 | |||
167 | unsigned int swappiness; | ||
168 | |||
169 | /* | ||
170 | * statistics. This must be placed at the end of memcg. | ||
133 | */ | 171 | */ |
134 | struct mem_cgroup_stat stat; | 172 | struct mem_cgroup_stat stat; |
135 | }; | 173 | }; |
136 | static struct mem_cgroup init_mem_cgroup; | ||
137 | 174 | ||
138 | enum charge_type { | 175 | enum charge_type { |
139 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 176 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
140 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 177 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
141 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ | 178 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ |
142 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | 179 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ |
180 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ | ||
143 | NR_CHARGE_TYPE, | 181 | NR_CHARGE_TYPE, |
144 | }; | 182 | }; |
145 | 183 | ||
146 | /* only for here (for easy reading.) */ | 184 | /* only for here (for easy reading.) */ |
147 | #define PCGF_CACHE (1UL << PCG_CACHE) | 185 | #define PCGF_CACHE (1UL << PCG_CACHE) |
148 | #define PCGF_USED (1UL << PCG_USED) | 186 | #define PCGF_USED (1UL << PCG_USED) |
149 | #define PCGF_ACTIVE (1UL << PCG_ACTIVE) | ||
150 | #define PCGF_LOCK (1UL << PCG_LOCK) | 187 | #define PCGF_LOCK (1UL << PCG_LOCK) |
151 | #define PCGF_FILE (1UL << PCG_FILE) | ||
152 | static const unsigned long | 188 | static const unsigned long |
153 | pcg_default_flags[NR_CHARGE_TYPE] = { | 189 | pcg_default_flags[NR_CHARGE_TYPE] = { |
154 | PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ | 190 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ |
155 | PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ | 191 | PCGF_USED | PCGF_LOCK, /* Anon */ |
156 | PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ | 192 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ |
157 | 0, /* FORCE */ | 193 | 0, /* FORCE */ |
158 | }; | 194 | }; |
159 | 195 | ||
160 | /* | 196 | /* for encoding cft->private value on file */ |
161 | * Always modified under lru lock. Then, not necessary to preempt_disable() | 197 | #define _MEM (0) |
162 | */ | 198 | #define _MEMSWAP (1) |
199 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | ||
200 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | ||
201 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | ||
202 | |||
203 | static void mem_cgroup_get(struct mem_cgroup *mem); | ||
204 | static void mem_cgroup_put(struct mem_cgroup *mem); | ||
205 | |||
163 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 206 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
164 | struct page_cgroup *pc, | 207 | struct page_cgroup *pc, |
165 | bool charge) | 208 | bool charge) |
@@ -167,10 +210,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
167 | int val = (charge)? 1 : -1; | 210 | int val = (charge)? 1 : -1; |
168 | struct mem_cgroup_stat *stat = &mem->stat; | 211 | struct mem_cgroup_stat *stat = &mem->stat; |
169 | struct mem_cgroup_stat_cpu *cpustat; | 212 | struct mem_cgroup_stat_cpu *cpustat; |
213 | int cpu = get_cpu(); | ||
170 | 214 | ||
171 | VM_BUG_ON(!irqs_disabled()); | 215 | cpustat = &stat->cpustat[cpu]; |
172 | |||
173 | cpustat = &stat->cpustat[smp_processor_id()]; | ||
174 | if (PageCgroupCache(pc)) | 216 | if (PageCgroupCache(pc)) |
175 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); | 217 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); |
176 | else | 218 | else |
@@ -182,6 +224,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
182 | else | 224 | else |
183 | __mem_cgroup_stat_add_safe(cpustat, | 225 | __mem_cgroup_stat_add_safe(cpustat, |
184 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 226 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); |
227 | put_cpu(); | ||
185 | } | 228 | } |
186 | 229 | ||
187 | static struct mem_cgroup_per_zone * | 230 | static struct mem_cgroup_per_zone * |
@@ -197,6 +240,9 @@ page_cgroup_zoneinfo(struct page_cgroup *pc) | |||
197 | int nid = page_cgroup_nid(pc); | 240 | int nid = page_cgroup_nid(pc); |
198 | int zid = page_cgroup_zid(pc); | 241 | int zid = page_cgroup_zid(pc); |
199 | 242 | ||
243 | if (!mem) | ||
244 | return NULL; | ||
245 | |||
200 | return mem_cgroup_zoneinfo(mem, nid, zid); | 246 | return mem_cgroup_zoneinfo(mem, nid, zid); |
201 | } | 247 | } |
202 | 248 | ||
@@ -236,77 +282,152 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
236 | struct mem_cgroup, css); | 282 | struct mem_cgroup, css); |
237 | } | 283 | } |
238 | 284 | ||
239 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, | 285 | static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) |
240 | struct page_cgroup *pc) | ||
241 | { | 286 | { |
242 | int lru = LRU_BASE; | 287 | struct mem_cgroup *mem = NULL; |
288 | /* | ||
289 | * Because we have no locks, mm->owner's may be being moved to other | ||
290 | * cgroup. We use css_tryget() here even if this looks | ||
291 | * pessimistic (rather than adding locks here). | ||
292 | */ | ||
293 | rcu_read_lock(); | ||
294 | do { | ||
295 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | ||
296 | if (unlikely(!mem)) | ||
297 | break; | ||
298 | } while (!css_tryget(&mem->css)); | ||
299 | rcu_read_unlock(); | ||
300 | return mem; | ||
301 | } | ||
243 | 302 | ||
244 | if (PageCgroupUnevictable(pc)) | 303 | static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem) |
245 | lru = LRU_UNEVICTABLE; | 304 | { |
246 | else { | 305 | if (!mem) |
247 | if (PageCgroupActive(pc)) | 306 | return true; |
248 | lru += LRU_ACTIVE; | 307 | return css_is_removed(&mem->css); |
249 | if (PageCgroupFile(pc)) | 308 | } |
250 | lru += LRU_FILE; | ||
251 | } | ||
252 | 309 | ||
253 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | 310 | /* |
311 | * Following LRU functions are allowed to be used without PCG_LOCK. | ||
312 | * Operations are called by routine of global LRU independently from memcg. | ||
313 | * What we have to take care of here is validness of pc->mem_cgroup. | ||
314 | * | ||
315 | * Changes to pc->mem_cgroup happens when | ||
316 | * 1. charge | ||
317 | * 2. moving account | ||
318 | * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. | ||
319 | * It is added to LRU before charge. | ||
320 | * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. | ||
321 | * When moving account, the page is not on LRU. It's isolated. | ||
322 | */ | ||
254 | 323 | ||
255 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); | 324 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) |
256 | list_del(&pc->lru); | 325 | { |
326 | struct page_cgroup *pc; | ||
327 | struct mem_cgroup *mem; | ||
328 | struct mem_cgroup_per_zone *mz; | ||
329 | |||
330 | if (mem_cgroup_disabled()) | ||
331 | return; | ||
332 | pc = lookup_page_cgroup(page); | ||
333 | /* can happen while we handle swapcache. */ | ||
334 | if (list_empty(&pc->lru) || !pc->mem_cgroup) | ||
335 | return; | ||
336 | /* | ||
337 | * We don't check PCG_USED bit. It's cleared when the "page" is finally | ||
338 | * removed from global LRU. | ||
339 | */ | ||
340 | mz = page_cgroup_zoneinfo(pc); | ||
341 | mem = pc->mem_cgroup; | ||
342 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | ||
343 | list_del_init(&pc->lru); | ||
344 | return; | ||
257 | } | 345 | } |
258 | 346 | ||
259 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, | 347 | void mem_cgroup_del_lru(struct page *page) |
260 | struct page_cgroup *pc) | ||
261 | { | 348 | { |
262 | int lru = LRU_BASE; | 349 | mem_cgroup_del_lru_list(page, page_lru(page)); |
350 | } | ||
263 | 351 | ||
264 | if (PageCgroupUnevictable(pc)) | 352 | void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) |
265 | lru = LRU_UNEVICTABLE; | 353 | { |
266 | else { | 354 | struct mem_cgroup_per_zone *mz; |
267 | if (PageCgroupActive(pc)) | 355 | struct page_cgroup *pc; |
268 | lru += LRU_ACTIVE; | ||
269 | if (PageCgroupFile(pc)) | ||
270 | lru += LRU_FILE; | ||
271 | } | ||
272 | 356 | ||
273 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 357 | if (mem_cgroup_disabled()) |
274 | list_add(&pc->lru, &mz->lists[lru]); | 358 | return; |
275 | 359 | ||
276 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); | 360 | pc = lookup_page_cgroup(page); |
361 | smp_rmb(); | ||
362 | /* unused page is not rotated. */ | ||
363 | if (!PageCgroupUsed(pc)) | ||
364 | return; | ||
365 | mz = page_cgroup_zoneinfo(pc); | ||
366 | list_move(&pc->lru, &mz->lists[lru]); | ||
277 | } | 367 | } |
278 | 368 | ||
279 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) | 369 | void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) |
280 | { | 370 | { |
281 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | 371 | struct page_cgroup *pc; |
282 | int active = PageCgroupActive(pc); | 372 | struct mem_cgroup_per_zone *mz; |
283 | int file = PageCgroupFile(pc); | ||
284 | int unevictable = PageCgroupUnevictable(pc); | ||
285 | enum lru_list from = unevictable ? LRU_UNEVICTABLE : | ||
286 | (LRU_FILE * !!file + !!active); | ||
287 | 373 | ||
288 | if (lru == from) | 374 | if (mem_cgroup_disabled()) |
375 | return; | ||
376 | pc = lookup_page_cgroup(page); | ||
377 | /* barrier to sync with "charge" */ | ||
378 | smp_rmb(); | ||
379 | if (!PageCgroupUsed(pc)) | ||
289 | return; | 380 | return; |
290 | 381 | ||
291 | MEM_CGROUP_ZSTAT(mz, from) -= 1; | 382 | mz = page_cgroup_zoneinfo(pc); |
383 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | ||
384 | list_add(&pc->lru, &mz->lists[lru]); | ||
385 | } | ||
386 | |||
387 | /* | ||
388 | * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to | ||
389 | * lru because the page may be reused after it's fully uncharged (because of | ||
390 | * SwapCache behavior). To handle that, unlink page_cgroup from LRU when charging | ||
391 | * it again. This function is only used to charge SwapCache. It's done under | ||
392 | * lock_page and expected that zone->lru_lock is never held. | ||
393 | */ | ||
394 | static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) | ||
395 | { | ||
396 | unsigned long flags; | ||
397 | struct zone *zone = page_zone(page); | ||
398 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
399 | |||
400 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
292 | /* | 401 | /* |
293 | * However this is done under mz->lru_lock, another flags, which | 402 | * Forget old LRU when this page_cgroup is *not* used. This Used bit |
294 | * are not related to LRU, will be modified from out-of-lock. | 403 | * is guarded by lock_page() because the page is SwapCache. |
295 | * We have to use atomic set/clear flags. | ||
296 | */ | 404 | */ |
297 | if (is_unevictable_lru(lru)) { | 405 | if (!PageCgroupUsed(pc)) |
298 | ClearPageCgroupActive(pc); | 406 | mem_cgroup_del_lru_list(page, page_lru(page)); |
299 | SetPageCgroupUnevictable(pc); | 407 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
300 | } else { | 408 | } |
301 | if (is_active_lru(lru)) | ||
302 | SetPageCgroupActive(pc); | ||
303 | else | ||
304 | ClearPageCgroupActive(pc); | ||
305 | ClearPageCgroupUnevictable(pc); | ||
306 | } | ||
307 | 409 | ||
308 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 410 | static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) |
309 | list_move(&pc->lru, &mz->lists[lru]); | 411 | { |
412 | unsigned long flags; | ||
413 | struct zone *zone = page_zone(page); | ||
414 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
415 | |||
416 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
417 | /* link when the page is linked to LRU but page_cgroup isn't */ | ||
418 | if (PageLRU(page) && list_empty(&pc->lru)) | ||
419 | mem_cgroup_add_lru_list(page, page_lru(page)); | ||
420 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
421 | } | ||
422 | |||
423 | |||
424 | void mem_cgroup_move_lists(struct page *page, | ||
425 | enum lru_list from, enum lru_list to) | ||
426 | { | ||
427 | if (mem_cgroup_disabled()) | ||
428 | return; | ||
429 | mem_cgroup_del_lru_list(page, from); | ||
430 | mem_cgroup_add_lru_list(page, to); | ||
310 | } | 431 | } |
311 | 432 | ||
312 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | 433 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) |
@@ -320,37 +441,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
320 | } | 441 | } |
321 | 442 | ||
322 | /* | 443 | /* |
323 | * This routine assumes that the appropriate zone's lru lock is already held | ||
324 | */ | ||
325 | void mem_cgroup_move_lists(struct page *page, enum lru_list lru) | ||
326 | { | ||
327 | struct page_cgroup *pc; | ||
328 | struct mem_cgroup_per_zone *mz; | ||
329 | unsigned long flags; | ||
330 | |||
331 | if (mem_cgroup_subsys.disabled) | ||
332 | return; | ||
333 | |||
334 | /* | ||
335 | * We cannot lock_page_cgroup while holding zone's lru_lock, | ||
336 | * because other holders of lock_page_cgroup can be interrupted | ||
337 | * with an attempt to rotate_reclaimable_page. But we cannot | ||
338 | * safely get to page_cgroup without it, so just try_lock it: | ||
339 | * mem_cgroup_isolate_pages allows for page left on wrong list. | ||
340 | */ | ||
341 | pc = lookup_page_cgroup(page); | ||
342 | if (!trylock_page_cgroup(pc)) | ||
343 | return; | ||
344 | if (pc && PageCgroupUsed(pc)) { | ||
345 | mz = page_cgroup_zoneinfo(pc); | ||
346 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
347 | __mem_cgroup_move_lists(pc, lru); | ||
348 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
349 | } | ||
350 | unlock_page_cgroup(pc); | ||
351 | } | ||
352 | |||
353 | /* | ||
354 | * Calculate mapped_ratio under memory controller. This will be used in | 444 | * Calculate mapped_ratio under memory controller. This will be used in |
355 | * vmscan.c for determining we have to reclaim mapped pages. | 445 | * vmscan.c for determining we have to reclaim mapped pages. |
356 | */ | 446 | */ |
@@ -372,39 +462,108 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) | |||
372 | */ | 462 | */ |
373 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) | 463 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) |
374 | { | 464 | { |
375 | return mem->prev_priority; | 465 | int prev_priority; |
466 | |||
467 | spin_lock(&mem->reclaim_param_lock); | ||
468 | prev_priority = mem->prev_priority; | ||
469 | spin_unlock(&mem->reclaim_param_lock); | ||
470 | |||
471 | return prev_priority; | ||
376 | } | 472 | } |
377 | 473 | ||
378 | void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) | 474 | void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) |
379 | { | 475 | { |
476 | spin_lock(&mem->reclaim_param_lock); | ||
380 | if (priority < mem->prev_priority) | 477 | if (priority < mem->prev_priority) |
381 | mem->prev_priority = priority; | 478 | mem->prev_priority = priority; |
479 | spin_unlock(&mem->reclaim_param_lock); | ||
382 | } | 480 | } |
383 | 481 | ||
384 | void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) | 482 | void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) |
385 | { | 483 | { |
484 | spin_lock(&mem->reclaim_param_lock); | ||
386 | mem->prev_priority = priority; | 485 | mem->prev_priority = priority; |
486 | spin_unlock(&mem->reclaim_param_lock); | ||
387 | } | 487 | } |
388 | 488 | ||
389 | /* | 489 | static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) |
390 | * Calculate # of pages to be scanned in this priority/zone. | 490 | { |
391 | * See also vmscan.c | 491 | unsigned long active; |
392 | * | 492 | unsigned long inactive; |
393 | * priority starts from "DEF_PRIORITY" and decremented in each loop. | 493 | unsigned long gb; |
394 | * (see include/linux/mmzone.h) | 494 | unsigned long inactive_ratio; |
395 | */ | 495 | |
496 | inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON); | ||
497 | active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON); | ||
498 | |||
499 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | ||
500 | if (gb) | ||
501 | inactive_ratio = int_sqrt(10 * gb); | ||
502 | else | ||
503 | inactive_ratio = 1; | ||
504 | |||
505 | if (present_pages) { | ||
506 | present_pages[0] = inactive; | ||
507 | present_pages[1] = active; | ||
508 | } | ||
509 | |||
510 | return inactive_ratio; | ||
511 | } | ||
512 | |||
513 | int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) | ||
514 | { | ||
515 | unsigned long active; | ||
516 | unsigned long inactive; | ||
517 | unsigned long present_pages[2]; | ||
518 | unsigned long inactive_ratio; | ||
396 | 519 | ||
397 | long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, | 520 | inactive_ratio = calc_inactive_ratio(memcg, present_pages); |
398 | int priority, enum lru_list lru) | 521 | |
522 | inactive = present_pages[0]; | ||
523 | active = present_pages[1]; | ||
524 | |||
525 | if (inactive * inactive_ratio < active) | ||
526 | return 1; | ||
527 | |||
528 | return 0; | ||
529 | } | ||
530 | |||
531 | unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, | ||
532 | struct zone *zone, | ||
533 | enum lru_list lru) | ||
399 | { | 534 | { |
400 | long nr_pages; | ||
401 | int nid = zone->zone_pgdat->node_id; | 535 | int nid = zone->zone_pgdat->node_id; |
402 | int zid = zone_idx(zone); | 536 | int zid = zone_idx(zone); |
403 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); | 537 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
404 | 538 | ||
405 | nr_pages = MEM_CGROUP_ZSTAT(mz, lru); | 539 | return MEM_CGROUP_ZSTAT(mz, lru); |
540 | } | ||
406 | 541 | ||
407 | return (nr_pages >> priority); | 542 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, |
543 | struct zone *zone) | ||
544 | { | ||
545 | int nid = zone->zone_pgdat->node_id; | ||
546 | int zid = zone_idx(zone); | ||
547 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); | ||
548 | |||
549 | return &mz->reclaim_stat; | ||
550 | } | ||
551 | |||
552 | struct zone_reclaim_stat * | ||
553 | mem_cgroup_get_reclaim_stat_from_page(struct page *page) | ||
554 | { | ||
555 | struct page_cgroup *pc; | ||
556 | struct mem_cgroup_per_zone *mz; | ||
557 | |||
558 | if (mem_cgroup_disabled()) | ||
559 | return NULL; | ||
560 | |||
561 | pc = lookup_page_cgroup(page); | ||
562 | mz = page_cgroup_zoneinfo(pc); | ||
563 | if (!mz) | ||
564 | return NULL; | ||
565 | |||
566 | return &mz->reclaim_stat; | ||
408 | } | 567 | } |
409 | 568 | ||
410 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | 569 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, |
@@ -429,95 +588,281 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
429 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | 588 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); |
430 | src = &mz->lists[lru]; | 589 | src = &mz->lists[lru]; |
431 | 590 | ||
432 | spin_lock(&mz->lru_lock); | ||
433 | scan = 0; | 591 | scan = 0; |
434 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { | 592 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { |
435 | if (scan >= nr_to_scan) | 593 | if (scan >= nr_to_scan) |
436 | break; | 594 | break; |
595 | |||
596 | page = pc->page; | ||
437 | if (unlikely(!PageCgroupUsed(pc))) | 597 | if (unlikely(!PageCgroupUsed(pc))) |
438 | continue; | 598 | continue; |
439 | page = pc->page; | ||
440 | |||
441 | if (unlikely(!PageLRU(page))) | 599 | if (unlikely(!PageLRU(page))) |
442 | continue; | 600 | continue; |
443 | 601 | ||
444 | /* | ||
445 | * TODO: play better with lumpy reclaim, grabbing anything. | ||
446 | */ | ||
447 | if (PageUnevictable(page) || | ||
448 | (PageActive(page) && !active) || | ||
449 | (!PageActive(page) && active)) { | ||
450 | __mem_cgroup_move_lists(pc, page_lru(page)); | ||
451 | continue; | ||
452 | } | ||
453 | |||
454 | scan++; | 602 | scan++; |
455 | list_move(&pc->lru, &pc_list); | ||
456 | |||
457 | if (__isolate_lru_page(page, mode, file) == 0) { | 603 | if (__isolate_lru_page(page, mode, file) == 0) { |
458 | list_move(&page->lru, dst); | 604 | list_move(&page->lru, dst); |
459 | nr_taken++; | 605 | nr_taken++; |
460 | } | 606 | } |
461 | } | 607 | } |
462 | 608 | ||
463 | list_splice(&pc_list, src); | ||
464 | spin_unlock(&mz->lru_lock); | ||
465 | |||
466 | *scanned = scan; | 609 | *scanned = scan; |
467 | return nr_taken; | 610 | return nr_taken; |
468 | } | 611 | } |
469 | 612 | ||
613 | #define mem_cgroup_from_res_counter(counter, member) \ | ||
614 | container_of(counter, struct mem_cgroup, member) | ||
615 | |||
470 | /* | 616 | /* |
471 | * Charge the memory controller for page usage. | 617 | * This routine finds the DFS walk successor. This routine should be |
472 | * Return | 618 | * called with hierarchy_mutex held |
473 | * 0 if the charge was successful | ||
474 | * < 0 if the cgroup is over its limit | ||
475 | */ | 619 | */ |
476 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | 620 | static struct mem_cgroup * |
477 | gfp_t gfp_mask, enum charge_type ctype, | 621 | mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) |
478 | struct mem_cgroup *memcg) | ||
479 | { | 622 | { |
623 | struct cgroup *cgroup, *curr_cgroup, *root_cgroup; | ||
624 | |||
625 | curr_cgroup = curr->css.cgroup; | ||
626 | root_cgroup = root_mem->css.cgroup; | ||
627 | |||
628 | if (!list_empty(&curr_cgroup->children)) { | ||
629 | /* | ||
630 | * Walk down to children | ||
631 | */ | ||
632 | mem_cgroup_put(curr); | ||
633 | cgroup = list_entry(curr_cgroup->children.next, | ||
634 | struct cgroup, sibling); | ||
635 | curr = mem_cgroup_from_cont(cgroup); | ||
636 | mem_cgroup_get(curr); | ||
637 | goto done; | ||
638 | } | ||
639 | |||
640 | visit_parent: | ||
641 | if (curr_cgroup == root_cgroup) { | ||
642 | mem_cgroup_put(curr); | ||
643 | curr = root_mem; | ||
644 | mem_cgroup_get(curr); | ||
645 | goto done; | ||
646 | } | ||
647 | |||
648 | /* | ||
649 | * Goto next sibling | ||
650 | */ | ||
651 | if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) { | ||
652 | mem_cgroup_put(curr); | ||
653 | cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup, | ||
654 | sibling); | ||
655 | curr = mem_cgroup_from_cont(cgroup); | ||
656 | mem_cgroup_get(curr); | ||
657 | goto done; | ||
658 | } | ||
659 | |||
660 | /* | ||
661 | * Go up to next parent and next parent's sibling if need be | ||
662 | */ | ||
663 | curr_cgroup = curr_cgroup->parent; | ||
664 | goto visit_parent; | ||
665 | |||
666 | done: | ||
667 | root_mem->last_scanned_child = curr; | ||
668 | return curr; | ||
669 | } | ||
670 | |||
671 | /* | ||
672 | * Visit the first child (need not be the first child as per the ordering | ||
673 | * of the cgroup list, since we track last_scanned_child) of @mem and use | ||
674 | * that to reclaim free pages from. | ||
675 | */ | ||
676 | static struct mem_cgroup * | ||
677 | mem_cgroup_get_first_node(struct mem_cgroup *root_mem) | ||
678 | { | ||
679 | struct cgroup *cgroup; | ||
680 | struct mem_cgroup *ret; | ||
681 | bool obsolete; | ||
682 | |||
683 | obsolete = mem_cgroup_is_obsolete(root_mem->last_scanned_child); | ||
684 | |||
685 | /* | ||
686 | * Scan all children under the mem_cgroup mem | ||
687 | */ | ||
688 | mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); | ||
689 | if (list_empty(&root_mem->css.cgroup->children)) { | ||
690 | ret = root_mem; | ||
691 | goto done; | ||
692 | } | ||
693 | |||
694 | if (!root_mem->last_scanned_child || obsolete) { | ||
695 | |||
696 | if (obsolete && root_mem->last_scanned_child) | ||
697 | mem_cgroup_put(root_mem->last_scanned_child); | ||
698 | |||
699 | cgroup = list_first_entry(&root_mem->css.cgroup->children, | ||
700 | struct cgroup, sibling); | ||
701 | ret = mem_cgroup_from_cont(cgroup); | ||
702 | mem_cgroup_get(ret); | ||
703 | } else | ||
704 | ret = mem_cgroup_get_next_node(root_mem->last_scanned_child, | ||
705 | root_mem); | ||
706 | |||
707 | done: | ||
708 | root_mem->last_scanned_child = ret; | ||
709 | mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); | ||
710 | return ret; | ||
711 | } | ||
712 | |||
713 | static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) | ||
714 | { | ||
715 | if (do_swap_account) { | ||
716 | if (res_counter_check_under_limit(&mem->res) && | ||
717 | res_counter_check_under_limit(&mem->memsw)) | ||
718 | return true; | ||
719 | } else | ||
720 | if (res_counter_check_under_limit(&mem->res)) | ||
721 | return true; | ||
722 | return false; | ||
723 | } | ||
724 | |||
725 | static unsigned int get_swappiness(struct mem_cgroup *memcg) | ||
726 | { | ||
727 | struct cgroup *cgrp = memcg->css.cgroup; | ||
728 | unsigned int swappiness; | ||
729 | |||
730 | /* root ? */ | ||
731 | if (cgrp->parent == NULL) | ||
732 | return vm_swappiness; | ||
733 | |||
734 | spin_lock(&memcg->reclaim_param_lock); | ||
735 | swappiness = memcg->swappiness; | ||
736 | spin_unlock(&memcg->reclaim_param_lock); | ||
737 | |||
738 | return swappiness; | ||
739 | } | ||
740 | |||
741 | /* | ||
742 | * Dance down the hierarchy if needed to reclaim memory. We remember the | ||
743 | * last child we reclaimed from, so that we don't end up penalizing | ||
744 | * one child extensively based on its position in the children list. | ||
745 | * | ||
746 | * root_mem is the original ancestor that we've been reclaiming from. | ||
747 | */ | ||
748 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | ||
749 | gfp_t gfp_mask, bool noswap) | ||
750 | { | ||
751 | struct mem_cgroup *next_mem; | ||
752 | int ret = 0; | ||
753 | |||
754 | /* | ||
755 | * Reclaim unconditionally and don't check for return value. | ||
756 | * We need to reclaim in the current group and down the tree. | ||
757 | * One might think about checking for children before reclaiming, | ||
758 | * but there might be left over accounting, even after children | ||
759 | * have left. | ||
760 | */ | ||
761 | ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, | ||
762 | get_swappiness(root_mem)); | ||
763 | if (mem_cgroup_check_under_limit(root_mem)) | ||
764 | return 0; | ||
765 | if (!root_mem->use_hierarchy) | ||
766 | return ret; | ||
767 | |||
768 | next_mem = mem_cgroup_get_first_node(root_mem); | ||
769 | |||
770 | while (next_mem != root_mem) { | ||
771 | if (mem_cgroup_is_obsolete(next_mem)) { | ||
772 | mem_cgroup_put(next_mem); | ||
773 | next_mem = mem_cgroup_get_first_node(root_mem); | ||
774 | continue; | ||
775 | } | ||
776 | ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, | ||
777 | get_swappiness(next_mem)); | ||
778 | if (mem_cgroup_check_under_limit(root_mem)) | ||
779 | return 0; | ||
780 | mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); | ||
781 | next_mem = mem_cgroup_get_next_node(next_mem, root_mem); | ||
782 | mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); | ||
783 | } | ||
784 | return ret; | ||
785 | } | ||
786 | |||
787 | bool mem_cgroup_oom_called(struct task_struct *task) | ||
788 | { | ||
789 | bool ret = false; | ||
480 | struct mem_cgroup *mem; | 790 | struct mem_cgroup *mem; |
481 | struct page_cgroup *pc; | 791 | struct mm_struct *mm; |
482 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
483 | struct mem_cgroup_per_zone *mz; | ||
484 | unsigned long flags; | ||
485 | 792 | ||
486 | pc = lookup_page_cgroup(page); | 793 | rcu_read_lock(); |
487 | /* can happen at boot */ | 794 | mm = task->mm; |
488 | if (unlikely(!pc)) | 795 | if (!mm) |
796 | mm = &init_mm; | ||
797 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | ||
798 | if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) | ||
799 | ret = true; | ||
800 | rcu_read_unlock(); | ||
801 | return ret; | ||
802 | } | ||
803 | /* | ||
804 | * Unlike the exported interface, an "oom" parameter is added. If oom==true, | ||
805 | * oom-killer can be invoked. | ||
806 | */ | ||
807 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | ||
808 | gfp_t gfp_mask, struct mem_cgroup **memcg, | ||
809 | bool oom) | ||
810 | { | ||
811 | struct mem_cgroup *mem, *mem_over_limit; | ||
812 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
813 | struct res_counter *fail_res; | ||
814 | |||
815 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | ||
816 | /* Don't account this! */ | ||
817 | *memcg = NULL; | ||
489 | return 0; | 818 | return 0; |
490 | prefetchw(pc); | 819 | } |
820 | |||
491 | /* | 821 | /* |
492 | * We always charge the cgroup the mm_struct belongs to. | 822 | * We always charge the cgroup the mm_struct belongs to. |
493 | * The mm_struct's mem_cgroup changes on task migration if the | 823 | * The mm_struct's mem_cgroup changes on task migration if the |
494 | * thread group leader migrates. It's possible that mm is not | 824 | * thread group leader migrates. It's possible that mm is not |
495 | * set, if so charge the init_mm (happens for pagecache usage). | 825 | * set, if so charge the init_mm (happens for pagecache usage). |
496 | */ | 826 | */ |
497 | 827 | mem = *memcg; | |
498 | if (likely(!memcg)) { | 828 | if (likely(!mem)) { |
499 | rcu_read_lock(); | 829 | mem = try_get_mem_cgroup_from_mm(mm); |
500 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 830 | *memcg = mem; |
501 | if (unlikely(!mem)) { | ||
502 | rcu_read_unlock(); | ||
503 | return 0; | ||
504 | } | ||
505 | /* | ||
506 | * For every charge from the cgroup, increment reference count | ||
507 | */ | ||
508 | css_get(&mem->css); | ||
509 | rcu_read_unlock(); | ||
510 | } else { | 831 | } else { |
511 | mem = memcg; | 832 | css_get(&mem->css); |
512 | css_get(&memcg->css); | ||
513 | } | 833 | } |
834 | if (unlikely(!mem)) | ||
835 | return 0; | ||
836 | |||
837 | VM_BUG_ON(mem_cgroup_is_obsolete(mem)); | ||
838 | |||
839 | while (1) { | ||
840 | int ret; | ||
841 | bool noswap = false; | ||
842 | |||
843 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); | ||
844 | if (likely(!ret)) { | ||
845 | if (!do_swap_account) | ||
846 | break; | ||
847 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, | ||
848 | &fail_res); | ||
849 | if (likely(!ret)) | ||
850 | break; | ||
851 | /* mem+swap counter fails */ | ||
852 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
853 | noswap = true; | ||
854 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | ||
855 | memsw); | ||
856 | } else | ||
857 | /* mem counter fails */ | ||
858 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | ||
859 | res); | ||
514 | 860 | ||
515 | while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { | ||
516 | if (!(gfp_mask & __GFP_WAIT)) | 861 | if (!(gfp_mask & __GFP_WAIT)) |
517 | goto out; | 862 | goto nomem; |
518 | 863 | ||
519 | if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) | 864 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, |
520 | continue; | 865 | noswap); |
521 | 866 | ||
522 | /* | 867 | /* |
523 | * try_to_free_mem_cgroup_pages() might not give us a full | 868 | * try_to_free_mem_cgroup_pages() might not give us a full |
@@ -525,49 +870,214 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
525 | * moved to swap cache or just unmapped from the cgroup. | 870 | * moved to swap cache or just unmapped from the cgroup. |
526 | * Check the limit again to see if the reclaim reduced the | 871 | * Check the limit again to see if the reclaim reduced the |
527 | * current usage of the cgroup before giving up | 872 | * current usage of the cgroup before giving up |
873 | * | ||
528 | */ | 874 | */ |
529 | if (res_counter_check_under_limit(&mem->res)) | 875 | if (mem_cgroup_check_under_limit(mem_over_limit)) |
530 | continue; | 876 | continue; |
531 | 877 | ||
532 | if (!nr_retries--) { | 878 | if (!nr_retries--) { |
533 | mem_cgroup_out_of_memory(mem, gfp_mask); | 879 | if (oom) { |
534 | goto out; | 880 | mutex_lock(&memcg_tasklist); |
881 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | ||
882 | mutex_unlock(&memcg_tasklist); | ||
883 | mem_over_limit->last_oom_jiffies = jiffies; | ||
884 | } | ||
885 | goto nomem; | ||
535 | } | 886 | } |
536 | } | 887 | } |
888 | return 0; | ||
889 | nomem: | ||
890 | css_put(&mem->css); | ||
891 | return -ENOMEM; | ||
892 | } | ||
537 | 893 | ||
894 | static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) | ||
895 | { | ||
896 | struct mem_cgroup *mem; | ||
897 | swp_entry_t ent; | ||
898 | |||
899 | if (!PageSwapCache(page)) | ||
900 | return NULL; | ||
901 | |||
902 | ent.val = page_private(page); | ||
903 | mem = lookup_swap_cgroup(ent); | ||
904 | if (!mem) | ||
905 | return NULL; | ||
906 | if (!css_tryget(&mem->css)) | ||
907 | return NULL; | ||
908 | return mem; | ||
909 | } | ||
910 | |||
911 | /* | ||
912 | * Commit a charge obtained by __mem_cgroup_try_charge() and put page_cgroup into the | ||
913 | * USED state. If already USED, uncharge and return. | ||
914 | */ | ||
915 | |||
916 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | ||
917 | struct page_cgroup *pc, | ||
918 | enum charge_type ctype) | ||
919 | { | ||
920 | /* try_charge() can return NULL to *memcg, taking care of it. */ | ||
921 | if (!mem) | ||
922 | return; | ||
538 | 923 | ||
539 | lock_page_cgroup(pc); | 924 | lock_page_cgroup(pc); |
540 | if (unlikely(PageCgroupUsed(pc))) { | 925 | if (unlikely(PageCgroupUsed(pc))) { |
541 | unlock_page_cgroup(pc); | 926 | unlock_page_cgroup(pc); |
542 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 927 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
928 | if (do_swap_account) | ||
929 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
543 | css_put(&mem->css); | 930 | css_put(&mem->css); |
544 | 931 | return; | |
545 | goto done; | ||
546 | } | 932 | } |
547 | pc->mem_cgroup = mem; | 933 | pc->mem_cgroup = mem; |
548 | /* | 934 | smp_wmb(); |
549 | * If a page is accounted as a page cache, insert to inactive list. | ||
550 | * If anon, insert to active list. | ||
551 | */ | ||
552 | pc->flags = pcg_default_flags[ctype]; | 935 | pc->flags = pcg_default_flags[ctype]; |
553 | 936 | ||
554 | mz = page_cgroup_zoneinfo(pc); | 937 | mem_cgroup_charge_statistics(mem, pc, true); |
555 | 938 | ||
556 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
557 | __mem_cgroup_add_list(mz, pc); | ||
558 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
559 | unlock_page_cgroup(pc); | 939 | unlock_page_cgroup(pc); |
940 | } | ||
560 | 941 | ||
561 | done: | 942 | /** |
562 | return 0; | 943 | * mem_cgroup_move_account - move account of the page |
944 | * @pc: page_cgroup of the page. | ||
945 | * @from: mem_cgroup which the page is moved from. | ||
946 | * @to: mem_cgroup which the page is moved to. @from != @to. | ||
947 | * | ||
948 | * The caller must confirm following. | ||
949 | * - page is not on LRU (isolate_page() is useful.) | ||
950 | * | ||
951 | * returns 0 at success, | ||
952 | * returns -EBUSY when lock is busy or "pc" is unstable. | ||
953 | * | ||
954 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | ||
955 | * new cgroup. It should be done by a caller. | ||
956 | */ | ||
957 | |||
958 | static int mem_cgroup_move_account(struct page_cgroup *pc, | ||
959 | struct mem_cgroup *from, struct mem_cgroup *to) | ||
960 | { | ||
961 | struct mem_cgroup_per_zone *from_mz, *to_mz; | ||
962 | int nid, zid; | ||
963 | int ret = -EBUSY; | ||
964 | |||
965 | VM_BUG_ON(from == to); | ||
966 | VM_BUG_ON(PageLRU(pc->page)); | ||
967 | |||
968 | nid = page_cgroup_nid(pc); | ||
969 | zid = page_cgroup_zid(pc); | ||
970 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); | ||
971 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); | ||
972 | |||
973 | if (!trylock_page_cgroup(pc)) | ||
974 | return ret; | ||
975 | |||
976 | if (!PageCgroupUsed(pc)) | ||
977 | goto out; | ||
978 | |||
979 | if (pc->mem_cgroup != from) | ||
980 | goto out; | ||
981 | |||
982 | css_put(&from->css); | ||
983 | res_counter_uncharge(&from->res, PAGE_SIZE); | ||
984 | mem_cgroup_charge_statistics(from, pc, false); | ||
985 | if (do_swap_account) | ||
986 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
987 | pc->mem_cgroup = to; | ||
988 | mem_cgroup_charge_statistics(to, pc, true); | ||
989 | css_get(&to->css); | ||
990 | ret = 0; | ||
563 | out: | 991 | out: |
564 | css_put(&mem->css); | 992 | unlock_page_cgroup(pc); |
565 | return -ENOMEM; | 993 | return ret; |
994 | } | ||
995 | |||
996 | /* | ||
997 | * move charges to its parent. | ||
998 | */ | ||
999 | |||
1000 | static int mem_cgroup_move_parent(struct page_cgroup *pc, | ||
1001 | struct mem_cgroup *child, | ||
1002 | gfp_t gfp_mask) | ||
1003 | { | ||
1004 | struct page *page = pc->page; | ||
1005 | struct cgroup *cg = child->css.cgroup; | ||
1006 | struct cgroup *pcg = cg->parent; | ||
1007 | struct mem_cgroup *parent; | ||
1008 | int ret; | ||
1009 | |||
1010 | /* Is ROOT ? */ | ||
1011 | if (!pcg) | ||
1012 | return -EINVAL; | ||
1013 | |||
1014 | |||
1015 | parent = mem_cgroup_from_cont(pcg); | ||
1016 | |||
1017 | |||
1018 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | ||
1019 | if (ret || !parent) | ||
1020 | return ret; | ||
1021 | |||
1022 | if (!get_page_unless_zero(page)) | ||
1023 | return -EBUSY; | ||
1024 | |||
1025 | ret = isolate_lru_page(page); | ||
1026 | |||
1027 | if (ret) | ||
1028 | goto cancel; | ||
1029 | |||
1030 | ret = mem_cgroup_move_account(pc, child, parent); | ||
1031 | |||
1032 | /* drop extra refcnt by try_charge() (move_account increment one) */ | ||
1033 | css_put(&parent->css); | ||
1034 | putback_lru_page(page); | ||
1035 | if (!ret) { | ||
1036 | put_page(page); | ||
1037 | return 0; | ||
1038 | } | ||
1039 | /* uncharge if move fails */ | ||
1040 | cancel: | ||
1041 | res_counter_uncharge(&parent->res, PAGE_SIZE); | ||
1042 | if (do_swap_account) | ||
1043 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | ||
1044 | put_page(page); | ||
1045 | return ret; | ||
1046 | } | ||
1047 | |||
1048 | /* | ||
1049 | * Charge the memory controller for page usage. | ||
1050 | * Return | ||
1051 | * 0 if the charge was successful | ||
1052 | * < 0 if the cgroup is over its limit | ||
1053 | */ | ||
1054 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | ||
1055 | gfp_t gfp_mask, enum charge_type ctype, | ||
1056 | struct mem_cgroup *memcg) | ||
1057 | { | ||
1058 | struct mem_cgroup *mem; | ||
1059 | struct page_cgroup *pc; | ||
1060 | int ret; | ||
1061 | |||
1062 | pc = lookup_page_cgroup(page); | ||
1063 | /* can happen at boot */ | ||
1064 | if (unlikely(!pc)) | ||
1065 | return 0; | ||
1066 | prefetchw(pc); | ||
1067 | |||
1068 | mem = memcg; | ||
1069 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); | ||
1070 | if (ret || !mem) | ||
1071 | return ret; | ||
1072 | |||
1073 | __mem_cgroup_commit_charge(mem, pc, ctype); | ||
1074 | return 0; | ||
566 | } | 1075 | } |
567 | 1076 | ||
568 | int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) | 1077 | int mem_cgroup_newpage_charge(struct page *page, |
1078 | struct mm_struct *mm, gfp_t gfp_mask) | ||
569 | { | 1079 | { |
570 | if (mem_cgroup_subsys.disabled) | 1080 | if (mem_cgroup_disabled()) |
571 | return 0; | 1081 | return 0; |
572 | if (PageCompound(page)) | 1082 | if (PageCompound(page)) |
573 | return 0; | 1083 | return 0; |
@@ -589,7 +1099,10 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) | |||
589 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 1099 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
590 | gfp_t gfp_mask) | 1100 | gfp_t gfp_mask) |
591 | { | 1101 | { |
592 | if (mem_cgroup_subsys.disabled) | 1102 | struct mem_cgroup *mem = NULL; |
1103 | int ret; | ||
1104 | |||
1105 | if (mem_cgroup_disabled()) | ||
593 | return 0; | 1106 | return 0; |
594 | if (PageCompound(page)) | 1107 | if (PageCompound(page)) |
595 | return 0; | 1108 | return 0; |
@@ -601,6 +1114,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
601 | * For GFP_NOWAIT case, the page may be pre-charged before calling | 1114 | * For GFP_NOWAIT case, the page may be pre-charged before calling |
602 | * add_to_page_cache(). (See shmem.c) check it here and avoid to call | 1115 | * add_to_page_cache(). (See shmem.c) check it here and avoid to call |
603 | * charge twice. (It works but has to pay a bit larger cost.) | 1116 | * charge twice. (It works but has to pay a bit larger cost.) |
1117 | * And when the page is SwapCache, it should take swap information | ||
1118 | * into account. This is under lock_page() now. | ||
604 | */ | 1119 | */ |
605 | if (!(gfp_mask & __GFP_WAIT)) { | 1120 | if (!(gfp_mask & __GFP_WAIT)) { |
606 | struct page_cgroup *pc; | 1121 | struct page_cgroup *pc; |
@@ -617,58 +1132,198 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
617 | unlock_page_cgroup(pc); | 1132 | unlock_page_cgroup(pc); |
618 | } | 1133 | } |
619 | 1134 | ||
620 | if (unlikely(!mm)) | 1135 | if (do_swap_account && PageSwapCache(page)) { |
1136 | mem = try_get_mem_cgroup_from_swapcache(page); | ||
1137 | if (mem) | ||
1138 | mm = NULL; | ||
1139 | else | ||
1140 | mem = NULL; | ||
1141 | /* SwapCache may be still linked to LRU now. */ | ||
1142 | mem_cgroup_lru_del_before_commit_swapcache(page); | ||
1143 | } | ||
1144 | |||
1145 | if (unlikely(!mm && !mem)) | ||
621 | mm = &init_mm; | 1146 | mm = &init_mm; |
622 | 1147 | ||
623 | if (page_is_file_cache(page)) | 1148 | if (page_is_file_cache(page)) |
624 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 1149 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
625 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); | 1150 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); |
626 | else | 1151 | |
627 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 1152 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, |
628 | MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); | 1153 | MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); |
1154 | if (mem) | ||
1155 | css_put(&mem->css); | ||
1156 | if (PageSwapCache(page)) | ||
1157 | mem_cgroup_lru_add_after_commit_swapcache(page); | ||
1158 | |||
1159 | if (do_swap_account && !ret && PageSwapCache(page)) { | ||
1160 | swp_entry_t ent = {.val = page_private(page)}; | ||
1161 | /* avoid double counting */ | ||
1162 | mem = swap_cgroup_record(ent, NULL); | ||
1163 | if (mem) { | ||
1164 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1165 | mem_cgroup_put(mem); | ||
1166 | } | ||
1167 | } | ||
1168 | return ret; | ||
1169 | } | ||
1170 | |||
1171 | /* | ||
1172 | * While swap-in, try_charge -> commit or cancel, the page is locked. | ||
1173 | * And when try_charge() successfully returns, one refcnt to memcg without | ||
1174 | * struct page_cgroup is acquired. This refcnt will be consumed by | ||
1175 | * "commit()" or removed by "cancel()" | ||
1176 | */ | ||
1177 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | ||
1178 | struct page *page, | ||
1179 | gfp_t mask, struct mem_cgroup **ptr) | ||
1180 | { | ||
1181 | struct mem_cgroup *mem; | ||
1182 | int ret; | ||
1183 | |||
1184 | if (mem_cgroup_disabled()) | ||
1185 | return 0; | ||
1186 | |||
1187 | if (!do_swap_account) | ||
1188 | goto charge_cur_mm; | ||
1189 | /* | ||
1190 | * A racing thread's fault, or swapoff, may have already updated | ||
1191 | * the pte, and even removed page from swap cache: return success | ||
1192 | * to go on to do_swap_page()'s pte_same() test, which should fail. | ||
1193 | */ | ||
1194 | if (!PageSwapCache(page)) | ||
1195 | return 0; | ||
1196 | mem = try_get_mem_cgroup_from_swapcache(page); | ||
1197 | if (!mem) | ||
1198 | goto charge_cur_mm; | ||
1199 | *ptr = mem; | ||
1200 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); | ||
1201 | /* drop extra refcnt from tryget */ | ||
1202 | css_put(&mem->css); | ||
1203 | return ret; | ||
1204 | charge_cur_mm: | ||
1205 | if (unlikely(!mm)) | ||
1206 | mm = &init_mm; | ||
1207 | return __mem_cgroup_try_charge(mm, mask, ptr, true); | ||
1208 | } | ||
1209 | |||
1210 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | ||
1211 | { | ||
1212 | struct page_cgroup *pc; | ||
1213 | |||
1214 | if (mem_cgroup_disabled()) | ||
1215 | return; | ||
1216 | if (!ptr) | ||
1217 | return; | ||
1218 | pc = lookup_page_cgroup(page); | ||
1219 | mem_cgroup_lru_del_before_commit_swapcache(page); | ||
1220 | __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); | ||
1221 | mem_cgroup_lru_add_after_commit_swapcache(page); | ||
1222 | /* | ||
1223 | * Now swap is on-memory. This means this page may be | ||
1224 | * counted both as mem and swap....double count. | ||
1225 | * Fix it by uncharging from memsw. Basically, this SwapCache is stable | ||
1226 | * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() | ||
1227 | * may call delete_from_swap_cache() before reach here. | ||
1228 | */ | ||
1229 | if (do_swap_account && PageSwapCache(page)) { | ||
1230 | swp_entry_t ent = {.val = page_private(page)}; | ||
1231 | struct mem_cgroup *memcg; | ||
1232 | memcg = swap_cgroup_record(ent, NULL); | ||
1233 | if (memcg) { | ||
1234 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | ||
1235 | mem_cgroup_put(memcg); | ||
1236 | } | ||
1237 | |||
1238 | } | ||
1239 | /* add this page(page_cgroup) to the LRU we want. */ | ||
1240 | |||
629 | } | 1241 | } |
630 | 1242 | ||
1243 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | ||
1244 | { | ||
1245 | if (mem_cgroup_disabled()) | ||
1246 | return; | ||
1247 | if (!mem) | ||
1248 | return; | ||
1249 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1250 | if (do_swap_account) | ||
1251 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1252 | css_put(&mem->css); | ||
1253 | } | ||
1254 | |||
1255 | |||
631 | /* | 1256 | /* |
632 | * uncharge if !page_mapped(page) | 1257 | * uncharge if !page_mapped(page) |
633 | */ | 1258 | */ |
634 | static void | 1259 | static struct mem_cgroup * |
635 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 1260 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) |
636 | { | 1261 | { |
637 | struct page_cgroup *pc; | 1262 | struct page_cgroup *pc; |
638 | struct mem_cgroup *mem; | 1263 | struct mem_cgroup *mem = NULL; |
639 | struct mem_cgroup_per_zone *mz; | 1264 | struct mem_cgroup_per_zone *mz; |
640 | unsigned long flags; | ||
641 | 1265 | ||
642 | if (mem_cgroup_subsys.disabled) | 1266 | if (mem_cgroup_disabled()) |
643 | return; | 1267 | return NULL; |
1268 | |||
1269 | if (PageSwapCache(page)) | ||
1270 | return NULL; | ||
644 | 1271 | ||
645 | /* | 1272 | /* |
646 | * Check if our page_cgroup is valid | 1273 | * Check if our page_cgroup is valid |
647 | */ | 1274 | */ |
648 | pc = lookup_page_cgroup(page); | 1275 | pc = lookup_page_cgroup(page); |
649 | if (unlikely(!pc || !PageCgroupUsed(pc))) | 1276 | if (unlikely(!pc || !PageCgroupUsed(pc))) |
650 | return; | 1277 | return NULL; |
651 | 1278 | ||
652 | lock_page_cgroup(pc); | 1279 | lock_page_cgroup(pc); |
653 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) | 1280 | |
654 | || !PageCgroupUsed(pc)) { | 1281 | mem = pc->mem_cgroup; |
655 | /* This happens at race in zap_pte_range() and do_swap_page()*/ | 1282 | |
656 | unlock_page_cgroup(pc); | 1283 | if (!PageCgroupUsed(pc)) |
657 | return; | 1284 | goto unlock_out; |
1285 | |||
1286 | switch (ctype) { | ||
1287 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | ||
1288 | if (page_mapped(page)) | ||
1289 | goto unlock_out; | ||
1290 | break; | ||
1291 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: | ||
1292 | if (!PageAnon(page)) { /* Shared memory */ | ||
1293 | if (page->mapping && !page_is_file_cache(page)) | ||
1294 | goto unlock_out; | ||
1295 | } else if (page_mapped(page)) /* Anon */ | ||
1296 | goto unlock_out; | ||
1297 | break; | ||
1298 | default: | ||
1299 | break; | ||
658 | } | 1300 | } |
1301 | |||
1302 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1303 | if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | ||
1304 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1305 | |||
1306 | mem_cgroup_charge_statistics(mem, pc, false); | ||
659 | ClearPageCgroupUsed(pc); | 1307 | ClearPageCgroupUsed(pc); |
660 | mem = pc->mem_cgroup; | 1308 | /* |
1309 | * pc->mem_cgroup is not cleared here. It will be accessed when it's | ||
1310 | * freed from LRU. This is safe because uncharged page is expected not | ||
1311 | * to be reused (freed soon). Exception is SwapCache, it's handled by | ||
1312 | * special functions. | ||
1313 | */ | ||
661 | 1314 | ||
662 | mz = page_cgroup_zoneinfo(pc); | 1315 | mz = page_cgroup_zoneinfo(pc); |
663 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
664 | __mem_cgroup_remove_list(mz, pc); | ||
665 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
666 | unlock_page_cgroup(pc); | 1316 | unlock_page_cgroup(pc); |
667 | 1317 | ||
668 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1318 | /* at swapout, this memcg will be accessed to record to swap */ |
669 | css_put(&mem->css); | 1319 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
1320 | css_put(&mem->css); | ||
670 | 1321 | ||
671 | return; | 1322 | return mem; |
1323 | |||
1324 | unlock_out: | ||
1325 | unlock_page_cgroup(pc); | ||
1326 | return NULL; | ||
672 | } | 1327 | } |
673 | 1328 | ||
674 | void mem_cgroup_uncharge_page(struct page *page) | 1329 | void mem_cgroup_uncharge_page(struct page *page) |
@@ -689,16 +1344,55 @@ void mem_cgroup_uncharge_cache_page(struct page *page) | |||
689 | } | 1344 | } |
690 | 1345 | ||
691 | /* | 1346 | /* |
692 | * Before starting migration, account against new page. | 1347 | * called from __delete_from_swap_cache() and drop "page" account. |
1348 | * memcg information is recorded to swap_cgroup of "ent" | ||
1349 | */ | ||
1350 | void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) | ||
1351 | { | ||
1352 | struct mem_cgroup *memcg; | ||
1353 | |||
1354 | memcg = __mem_cgroup_uncharge_common(page, | ||
1355 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT); | ||
1356 | /* record memcg information */ | ||
1357 | if (do_swap_account && memcg) { | ||
1358 | swap_cgroup_record(ent, memcg); | ||
1359 | mem_cgroup_get(memcg); | ||
1360 | } | ||
1361 | if (memcg) | ||
1362 | css_put(&memcg->css); | ||
1363 | } | ||
1364 | |||
1365 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
1366 | /* | ||
1367 | * called from swap_entry_free(). remove record in swap_cgroup and | ||
1368 | * uncharge "memsw" account. | ||
693 | */ | 1369 | */ |
694 | int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) | 1370 | void mem_cgroup_uncharge_swap(swp_entry_t ent) |
1371 | { | ||
1372 | struct mem_cgroup *memcg; | ||
1373 | |||
1374 | if (!do_swap_account) | ||
1375 | return; | ||
1376 | |||
1377 | memcg = swap_cgroup_record(ent, NULL); | ||
1378 | if (memcg) { | ||
1379 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | ||
1380 | mem_cgroup_put(memcg); | ||
1381 | } | ||
1382 | } | ||
1383 | #endif | ||
1384 | |||
1385 | /* | ||
1386 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old | ||
1387 | * page belongs to. | ||
1388 | */ | ||
1389 | int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | ||
695 | { | 1390 | { |
696 | struct page_cgroup *pc; | 1391 | struct page_cgroup *pc; |
697 | struct mem_cgroup *mem = NULL; | 1392 | struct mem_cgroup *mem = NULL; |
698 | enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | ||
699 | int ret = 0; | 1393 | int ret = 0; |
700 | 1394 | ||
701 | if (mem_cgroup_subsys.disabled) | 1395 | if (mem_cgroup_disabled()) |
702 | return 0; | 1396 | return 0; |
703 | 1397 | ||
704 | pc = lookup_page_cgroup(page); | 1398 | pc = lookup_page_cgroup(page); |
@@ -706,41 +1400,67 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) | |||
706 | if (PageCgroupUsed(pc)) { | 1400 | if (PageCgroupUsed(pc)) { |
707 | mem = pc->mem_cgroup; | 1401 | mem = pc->mem_cgroup; |
708 | css_get(&mem->css); | 1402 | css_get(&mem->css); |
709 | if (PageCgroupCache(pc)) { | ||
710 | if (page_is_file_cache(page)) | ||
711 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
712 | else | ||
713 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
714 | } | ||
715 | } | 1403 | } |
716 | unlock_page_cgroup(pc); | 1404 | unlock_page_cgroup(pc); |
1405 | |||
717 | if (mem) { | 1406 | if (mem) { |
718 | ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, | 1407 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); |
719 | ctype, mem); | ||
720 | css_put(&mem->css); | 1408 | css_put(&mem->css); |
721 | } | 1409 | } |
1410 | *ptr = mem; | ||
722 | return ret; | 1411 | return ret; |
723 | } | 1412 | } |
724 | 1413 | ||
725 | /* remove redundant charge if migration failed*/ | 1414 | /* remove redundant charge if migration failed*/ |
726 | void mem_cgroup_end_migration(struct page *newpage) | 1415 | void mem_cgroup_end_migration(struct mem_cgroup *mem, |
1416 | struct page *oldpage, struct page *newpage) | ||
727 | { | 1417 | { |
1418 | struct page *target, *unused; | ||
1419 | struct page_cgroup *pc; | ||
1420 | enum charge_type ctype; | ||
1421 | |||
1422 | if (!mem) | ||
1423 | return; | ||
1424 | |||
1425 | /* at migration success, oldpage->mapping is NULL. */ | ||
1426 | if (oldpage->mapping) { | ||
1427 | target = oldpage; | ||
1428 | unused = NULL; | ||
1429 | } else { | ||
1430 | target = newpage; | ||
1431 | unused = oldpage; | ||
1432 | } | ||
1433 | |||
1434 | if (PageAnon(target)) | ||
1435 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | ||
1436 | else if (page_is_file_cache(target)) | ||
1437 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
1438 | else | ||
1439 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
1440 | |||
1441 | /* unused page is not on radix-tree now. */ | ||
1442 | if (unused) | ||
1443 | __mem_cgroup_uncharge_common(unused, ctype); | ||
1444 | |||
1445 | pc = lookup_page_cgroup(target); | ||
728 | /* | 1446 | /* |
729 | * At success, page->mapping is not NULL. | 1447 | * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. |
730 | * special rollback care is necessary when | 1448 | * So, double-counting is effectively avoided. |
731 | * 1. at migration failure. (newpage->mapping is cleared in this case) | ||
732 | * 2. the newpage was moved but not remapped again because the task | ||
733 | * exits and the newpage is obsolete. In this case, the new page | ||
734 | * may be a swapcache. So, we just call mem_cgroup_uncharge_page() | ||
735 | * always for avoiding mess. The page_cgroup will be removed if | ||
736 | * unnecessary. File cache pages is still on radix-tree. Don't | ||
737 | * care it. | ||
738 | */ | 1449 | */ |
739 | if (!newpage->mapping) | 1450 | __mem_cgroup_commit_charge(mem, pc, ctype); |
740 | __mem_cgroup_uncharge_common(newpage, | 1451 | |
741 | MEM_CGROUP_CHARGE_TYPE_FORCE); | 1452 | /* |
742 | else if (PageAnon(newpage)) | 1453 | * Both of oldpage and newpage are still under lock_page(). |
743 | mem_cgroup_uncharge_page(newpage); | 1454 | * Then, we don't have to care about race in radix-tree. |
1455 | * But we have to be careful that this page is unmapped or not. | ||
1456 | * | ||
1457 | * There is a case for !page_mapped(). At the start of | ||
1458 | * migration, oldpage was mapped. But now, it's zapped. | ||
1459 | * But we know *target* page is not freed/reused under us. | ||
1460 | * mem_cgroup_uncharge_page() does all necessary checks. | ||
1461 | */ | ||
1462 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | ||
1463 | mem_cgroup_uncharge_page(target); | ||
744 | } | 1464 | } |
745 | 1465 | ||
746 | /* | 1466 | /* |
@@ -748,29 +1468,26 @@ void mem_cgroup_end_migration(struct page *newpage) | |||
748 | * This is typically used for page reclaiming for shmem for reducing side | 1468 | * This is typically used for page reclaiming for shmem for reducing side |
749 | * effect of page allocation from shmem, which is used by some mem_cgroup. | 1469 | * effect of page allocation from shmem, which is used by some mem_cgroup. |
750 | */ | 1470 | */ |
751 | int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) | 1471 | int mem_cgroup_shrink_usage(struct page *page, |
1472 | struct mm_struct *mm, | ||
1473 | gfp_t gfp_mask) | ||
752 | { | 1474 | { |
753 | struct mem_cgroup *mem; | 1475 | struct mem_cgroup *mem = NULL; |
754 | int progress = 0; | 1476 | int progress = 0; |
755 | int retry = MEM_CGROUP_RECLAIM_RETRIES; | 1477 | int retry = MEM_CGROUP_RECLAIM_RETRIES; |
756 | 1478 | ||
757 | if (mem_cgroup_subsys.disabled) | 1479 | if (mem_cgroup_disabled()) |
758 | return 0; | 1480 | return 0; |
759 | if (!mm) | 1481 | if (page) |
1482 | mem = try_get_mem_cgroup_from_swapcache(page); | ||
1483 | if (!mem && mm) | ||
1484 | mem = try_get_mem_cgroup_from_mm(mm); | ||
1485 | if (unlikely(!mem)) | ||
760 | return 0; | 1486 | return 0; |
761 | 1487 | ||
762 | rcu_read_lock(); | ||
763 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | ||
764 | if (unlikely(!mem)) { | ||
765 | rcu_read_unlock(); | ||
766 | return 0; | ||
767 | } | ||
768 | css_get(&mem->css); | ||
769 | rcu_read_unlock(); | ||
770 | |||
771 | do { | 1488 | do { |
772 | progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); | 1489 | progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true); |
773 | progress += res_counter_check_under_limit(&mem->res); | 1490 | progress += mem_cgroup_check_under_limit(mem); |
774 | } while (!progress && --retry); | 1491 | } while (!progress && --retry); |
775 | 1492 | ||
776 | css_put(&mem->css); | 1493 | css_put(&mem->css); |
@@ -779,116 +1496,295 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) | |||
779 | return 0; | 1496 | return 0; |
780 | } | 1497 | } |
781 | 1498 | ||
782 | int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) | 1499 | static DEFINE_MUTEX(set_limit_mutex); |
1500 | |||
1501 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | ||
1502 | unsigned long long val) | ||
783 | { | 1503 | { |
784 | 1504 | ||
785 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; | 1505 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; |
786 | int progress; | 1506 | int progress; |
1507 | u64 memswlimit; | ||
787 | int ret = 0; | 1508 | int ret = 0; |
788 | 1509 | ||
789 | while (res_counter_set_limit(&memcg->res, val)) { | 1510 | while (retry_count) { |
790 | if (signal_pending(current)) { | 1511 | if (signal_pending(current)) { |
791 | ret = -EINTR; | 1512 | ret = -EINTR; |
792 | break; | 1513 | break; |
793 | } | 1514 | } |
794 | if (!retry_count) { | 1515 | /* |
795 | ret = -EBUSY; | 1516 | * Rather than hide all in some function, I do this in |
1517 | * open coded manner. You see what this really does. | ||
1519 | * We have to guarantee mem->res.limit <= mem->memsw.limit. | ||
1519 | */ | ||
1520 | mutex_lock(&set_limit_mutex); | ||
1521 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
1522 | if (memswlimit < val) { | ||
1523 | ret = -EINVAL; | ||
1524 | mutex_unlock(&set_limit_mutex); | ||
796 | break; | 1525 | break; |
797 | } | 1526 | } |
798 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL); | 1527 | ret = res_counter_set_limit(&memcg->res, val); |
799 | if (!progress) | 1528 | mutex_unlock(&set_limit_mutex); |
800 | retry_count--; | 1529 | |
1530 | if (!ret) | ||
1531 | break; | ||
1532 | |||
1533 | progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, | ||
1534 | false); | ||
1535 | if (!progress) retry_count--; | ||
801 | } | 1536 | } |
1537 | |||
802 | return ret; | 1538 | return ret; |
803 | } | 1539 | } |
804 | 1540 | ||
1541 | int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | ||
1542 | unsigned long long val) | ||
1543 | { | ||
1544 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; | ||
1545 | u64 memlimit, oldusage, curusage; | ||
1546 | int ret; | ||
1547 | |||
1548 | if (!do_swap_account) | ||
1549 | return -EINVAL; | ||
1550 | |||
1551 | while (retry_count) { | ||
1552 | if (signal_pending(current)) { | ||
1553 | ret = -EINTR; | ||
1554 | break; | ||
1555 | } | ||
1556 | /* | ||
1557 | * Rather than hide all in some function, I do this in | ||
1558 | * open coded manner. You see what this really does. | ||
1559 | * We have to guarantee mem->res.limit <= mem->memsw.limit. | ||
1560 | */ | ||
1561 | mutex_lock(&set_limit_mutex); | ||
1562 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
1563 | if (memlimit > val) { | ||
1564 | ret = -EINVAL; | ||
1565 | mutex_unlock(&set_limit_mutex); | ||
1566 | break; | ||
1567 | } | ||
1568 | ret = res_counter_set_limit(&memcg->memsw, val); | ||
1569 | mutex_unlock(&set_limit_mutex); | ||
1570 | |||
1571 | if (!ret) | ||
1572 | break; | ||
1573 | |||
1574 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | ||
1575 | mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true); | ||
1576 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | ||
1577 | if (curusage >= oldusage) | ||
1578 | retry_count--; | ||
1579 | } | ||
1580 | return ret; | ||
1581 | } | ||
805 | 1582 | ||
806 | /* | 1583 | /* |
807 | * This routine traverse page_cgroup in given list and drop them all. | 1584 | * This routine traverse page_cgroup in given list and drop them all. |
808 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 1585 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
809 | */ | 1586 | */ |
810 | #define FORCE_UNCHARGE_BATCH (128) | 1587 | static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, |
811 | static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | 1588 | int node, int zid, enum lru_list lru) |
812 | struct mem_cgroup_per_zone *mz, | ||
813 | enum lru_list lru) | ||
814 | { | 1589 | { |
815 | struct page_cgroup *pc; | 1590 | struct zone *zone; |
816 | struct page *page; | 1591 | struct mem_cgroup_per_zone *mz; |
817 | int count = FORCE_UNCHARGE_BATCH; | 1592 | struct page_cgroup *pc, *busy; |
818 | unsigned long flags; | 1593 | unsigned long flags, loop; |
819 | struct list_head *list; | 1594 | struct list_head *list; |
1595 | int ret = 0; | ||
820 | 1596 | ||
1597 | zone = &NODE_DATA(node)->node_zones[zid]; | ||
1598 | mz = mem_cgroup_zoneinfo(mem, node, zid); | ||
821 | list = &mz->lists[lru]; | 1599 | list = &mz->lists[lru]; |
822 | 1600 | ||
823 | spin_lock_irqsave(&mz->lru_lock, flags); | 1601 | loop = MEM_CGROUP_ZSTAT(mz, lru); |
824 | while (!list_empty(list)) { | 1602 | /* give some margin against EBUSY etc...*/ |
825 | pc = list_entry(list->prev, struct page_cgroup, lru); | 1603 | loop += 256; |
826 | page = pc->page; | 1604 | busy = NULL; |
827 | if (!PageCgroupUsed(pc)) | 1605 | while (loop--) { |
828 | break; | 1606 | ret = 0; |
829 | get_page(page); | 1607 | spin_lock_irqsave(&zone->lru_lock, flags); |
830 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 1608 | if (list_empty(list)) { |
831 | /* | 1609 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
832 | * Check if this page is on LRU. !LRU page can be found | ||
833 | * if it's under page migration. | ||
834 | */ | ||
835 | if (PageLRU(page)) { | ||
836 | __mem_cgroup_uncharge_common(page, | ||
837 | MEM_CGROUP_CHARGE_TYPE_FORCE); | ||
838 | put_page(page); | ||
839 | if (--count <= 0) { | ||
840 | count = FORCE_UNCHARGE_BATCH; | ||
841 | cond_resched(); | ||
842 | } | ||
843 | } else { | ||
844 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
845 | break; | 1610 | break; |
846 | } | 1611 | } |
847 | spin_lock_irqsave(&mz->lru_lock, flags); | 1612 | pc = list_entry(list->prev, struct page_cgroup, lru); |
1613 | if (busy == pc) { | ||
1614 | list_move(&pc->lru, list); | ||
1615 | busy = 0; | ||
1616 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
1617 | continue; | ||
1618 | } | ||
1619 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
1620 | |||
1621 | ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); | ||
1622 | if (ret == -ENOMEM) | ||
1623 | break; | ||
1624 | |||
1625 | if (ret == -EBUSY || ret == -EINVAL) { | ||
1626 | /* found lock contention or "pc" is obsolete. */ | ||
1627 | busy = pc; | ||
1628 | cond_resched(); | ||
1629 | } else | ||
1630 | busy = NULL; | ||
848 | } | 1631 | } |
849 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 1632 | |
1633 | if (!ret && !list_empty(list)) | ||
1634 | return -EBUSY; | ||
1635 | return ret; | ||
850 | } | 1636 | } |
851 | 1637 | ||
852 | /* | 1638 | /* |
853 | * make mem_cgroup's charge to be 0 if there is no task. | 1639 | * make mem_cgroup's charge to be 0 if there is no task. |
854 | * This enables deleting this mem_cgroup. | 1640 | * This enables deleting this mem_cgroup. |
855 | */ | 1641 | */ |
856 | static int mem_cgroup_force_empty(struct mem_cgroup *mem) | 1642 | static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) |
857 | { | 1643 | { |
858 | int ret = -EBUSY; | 1644 | int ret; |
859 | int node, zid; | 1645 | int node, zid, shrink; |
1646 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
1647 | struct cgroup *cgrp = mem->css.cgroup; | ||
860 | 1648 | ||
861 | css_get(&mem->css); | 1649 | css_get(&mem->css); |
862 | /* | 1650 | |
863 | * page reclaim code (kswapd etc..) will move pages between | 1651 | shrink = 0; |
864 | * active_list <-> inactive_list while we don't take a lock. | 1652 | /* should free all ? */ |
865 | * So, we have to do loop here until all lists are empty. | 1653 | if (free_all) |
866 | */ | 1654 | goto try_to_free; |
1655 | move_account: | ||
867 | while (mem->res.usage > 0) { | 1656 | while (mem->res.usage > 0) { |
868 | if (atomic_read(&mem->css.cgroup->count) > 0) | 1657 | ret = -EBUSY; |
1658 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | ||
1659 | goto out; | ||
1660 | ret = -EINTR; | ||
1661 | if (signal_pending(current)) | ||
869 | goto out; | 1662 | goto out; |
870 | /* This is for making all *used* pages to be on LRU. */ | 1663 | /* This is for making all *used* pages to be on LRU. */ |
871 | lru_add_drain_all(); | 1664 | lru_add_drain_all(); |
872 | for_each_node_state(node, N_POSSIBLE) | 1665 | ret = 0; |
873 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 1666 | for_each_node_state(node, N_POSSIBLE) { |
874 | struct mem_cgroup_per_zone *mz; | 1667 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
875 | enum lru_list l; | 1668 | enum lru_list l; |
876 | mz = mem_cgroup_zoneinfo(mem, node, zid); | 1669 | for_each_lru(l) { |
877 | for_each_lru(l) | 1670 | ret = mem_cgroup_force_empty_list(mem, |
878 | mem_cgroup_force_empty_list(mem, mz, l); | 1671 | node, zid, l); |
1672 | if (ret) | ||
1673 | break; | ||
1674 | } | ||
879 | } | 1675 | } |
1676 | if (ret) | ||
1677 | break; | ||
1678 | } | ||
1679 | /* it seems parent cgroup doesn't have enough mem */ | ||
1680 | if (ret == -ENOMEM) | ||
1681 | goto try_to_free; | ||
880 | cond_resched(); | 1682 | cond_resched(); |
881 | } | 1683 | } |
882 | ret = 0; | 1684 | ret = 0; |
883 | out: | 1685 | out: |
884 | css_put(&mem->css); | 1686 | css_put(&mem->css); |
885 | return ret; | 1687 | return ret; |
1688 | |||
1689 | try_to_free: | ||
1690 | /* returns EBUSY if there is a task or if we come here twice. */ | ||
1691 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { | ||
1692 | ret = -EBUSY; | ||
1693 | goto out; | ||
1694 | } | ||
1695 | /* we call try-to-free pages to make this cgroup empty */ | ||
1696 | lru_add_drain_all(); | ||
1697 | /* try to free all pages in this cgroup */ | ||
1698 | shrink = 1; | ||
1699 | while (nr_retries && mem->res.usage > 0) { | ||
1700 | int progress; | ||
1701 | |||
1702 | if (signal_pending(current)) { | ||
1703 | ret = -EINTR; | ||
1704 | goto out; | ||
1705 | } | ||
1706 | progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, | ||
1707 | false, get_swappiness(mem)); | ||
1708 | if (!progress) { | ||
1709 | nr_retries--; | ||
1710 | /* maybe some writeback is necessary */ | ||
1711 | congestion_wait(WRITE, HZ/10); | ||
1712 | } | ||
1713 | |||
1714 | } | ||
1715 | lru_add_drain(); | ||
1716 | /* try move_account...there may be some *locked* pages. */ | ||
1717 | if (mem->res.usage) | ||
1718 | goto move_account; | ||
1719 | ret = 0; | ||
1720 | goto out; | ||
1721 | } | ||
1722 | |||
1723 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) | ||
1724 | { | ||
1725 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); | ||
1726 | } | ||
1727 | |||
1728 | |||
1729 | static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) | ||
1730 | { | ||
1731 | return mem_cgroup_from_cont(cont)->use_hierarchy; | ||
1732 | } | ||
1733 | |||
1734 | static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | ||
1735 | u64 val) | ||
1736 | { | ||
1737 | int retval = 0; | ||
1738 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | ||
1739 | struct cgroup *parent = cont->parent; | ||
1740 | struct mem_cgroup *parent_mem = NULL; | ||
1741 | |||
1742 | if (parent) | ||
1743 | parent_mem = mem_cgroup_from_cont(parent); | ||
1744 | |||
1745 | cgroup_lock(); | ||
1746 | /* | ||
1747 | * If parent's use_hierarchy is set, we can't make any modifications | ||
1748 | * in the child subtrees. If it is unset, then the change can | ||
1749 | * occur, provided the current cgroup has no children. | ||
1750 | * | ||
1751 | * For the root cgroup, parent_mem is NULL, we allow value to be | ||
1752 | * set if there are no children. | ||
1753 | */ | ||
1754 | if ((!parent_mem || !parent_mem->use_hierarchy) && | ||
1755 | (val == 1 || val == 0)) { | ||
1756 | if (list_empty(&cont->children)) | ||
1757 | mem->use_hierarchy = val; | ||
1758 | else | ||
1759 | retval = -EBUSY; | ||
1760 | } else | ||
1761 | retval = -EINVAL; | ||
1762 | cgroup_unlock(); | ||
1763 | |||
1764 | return retval; | ||
886 | } | 1765 | } |
887 | 1766 | ||
888 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 1767 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
889 | { | 1768 | { |
890 | return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, | 1769 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
891 | cft->private); | 1770 | u64 val = 0; |
1771 | int type, name; | ||
1772 | |||
1773 | type = MEMFILE_TYPE(cft->private); | ||
1774 | name = MEMFILE_ATTR(cft->private); | ||
1775 | switch (type) { | ||
1776 | case _MEM: | ||
1777 | val = res_counter_read_u64(&mem->res, name); | ||
1778 | break; | ||
1779 | case _MEMSWAP: | ||
1780 | if (do_swap_account) | ||
1781 | val = res_counter_read_u64(&mem->memsw, name); | ||
1782 | break; | ||
1783 | default: | ||
1784 | BUG(); | ||
1785 | break; | ||
1786 | } | ||
1787 | return val; | ||
892 | } | 1788 | } |
893 | /* | 1789 | /* |
894 | * The user of this function is... | 1790 | * The user of this function is... |
@@ -898,15 +1794,22 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
898 | const char *buffer) | 1794 | const char *buffer) |
899 | { | 1795 | { |
900 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 1796 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
1797 | int type, name; | ||
901 | unsigned long long val; | 1798 | unsigned long long val; |
902 | int ret; | 1799 | int ret; |
903 | 1800 | ||
904 | switch (cft->private) { | 1801 | type = MEMFILE_TYPE(cft->private); |
1802 | name = MEMFILE_ATTR(cft->private); | ||
1803 | switch (name) { | ||
905 | case RES_LIMIT: | 1804 | case RES_LIMIT: |
906 | /* This function does all necessary parse...reuse it */ | 1805 | /* This function does all necessary parse...reuse it */ |
907 | ret = res_counter_memparse_write_strategy(buffer, &val); | 1806 | ret = res_counter_memparse_write_strategy(buffer, &val); |
908 | if (!ret) | 1807 | if (ret) |
1808 | break; | ||
1809 | if (type == _MEM) | ||
909 | ret = mem_cgroup_resize_limit(memcg, val); | 1810 | ret = mem_cgroup_resize_limit(memcg, val); |
1811 | else | ||
1812 | ret = mem_cgroup_resize_memsw_limit(memcg, val); | ||
910 | break; | 1813 | break; |
911 | default: | 1814 | default: |
912 | ret = -EINVAL; /* should be BUG() ? */ | 1815 | ret = -EINVAL; /* should be BUG() ? */ |
@@ -915,27 +1818,59 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
915 | return ret; | 1818 | return ret; |
916 | } | 1819 | } |
917 | 1820 | ||
1821 | static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, | ||
1822 | unsigned long long *mem_limit, unsigned long long *memsw_limit) | ||
1823 | { | ||
1824 | struct cgroup *cgroup; | ||
1825 | unsigned long long min_limit, min_memsw_limit, tmp; | ||
1826 | |||
1827 | min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
1828 | min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
1829 | cgroup = memcg->css.cgroup; | ||
1830 | if (!memcg->use_hierarchy) | ||
1831 | goto out; | ||
1832 | |||
1833 | while (cgroup->parent) { | ||
1834 | cgroup = cgroup->parent; | ||
1835 | memcg = mem_cgroup_from_cont(cgroup); | ||
1836 | if (!memcg->use_hierarchy) | ||
1837 | break; | ||
1838 | tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
1839 | min_limit = min(min_limit, tmp); | ||
1840 | tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
1841 | min_memsw_limit = min(min_memsw_limit, tmp); | ||
1842 | } | ||
1843 | out: | ||
1844 | *mem_limit = min_limit; | ||
1845 | *memsw_limit = min_memsw_limit; | ||
1846 | return; | ||
1847 | } | ||
1848 | |||
918 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 1849 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
919 | { | 1850 | { |
920 | struct mem_cgroup *mem; | 1851 | struct mem_cgroup *mem; |
1852 | int type, name; | ||
921 | 1853 | ||
922 | mem = mem_cgroup_from_cont(cont); | 1854 | mem = mem_cgroup_from_cont(cont); |
923 | switch (event) { | 1855 | type = MEMFILE_TYPE(event); |
1856 | name = MEMFILE_ATTR(event); | ||
1857 | switch (name) { | ||
924 | case RES_MAX_USAGE: | 1858 | case RES_MAX_USAGE: |
925 | res_counter_reset_max(&mem->res); | 1859 | if (type == _MEM) |
1860 | res_counter_reset_max(&mem->res); | ||
1861 | else | ||
1862 | res_counter_reset_max(&mem->memsw); | ||
926 | break; | 1863 | break; |
927 | case RES_FAILCNT: | 1864 | case RES_FAILCNT: |
928 | res_counter_reset_failcnt(&mem->res); | 1865 | if (type == _MEM) |
1866 | res_counter_reset_failcnt(&mem->res); | ||
1867 | else | ||
1868 | res_counter_reset_failcnt(&mem->memsw); | ||
929 | break; | 1869 | break; |
930 | } | 1870 | } |
931 | return 0; | 1871 | return 0; |
932 | } | 1872 | } |
933 | 1873 | ||
934 | static int mem_force_empty_write(struct cgroup *cont, unsigned int event) | ||
935 | { | ||
936 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont)); | ||
937 | } | ||
938 | |||
939 | static const struct mem_cgroup_stat_desc { | 1874 | static const struct mem_cgroup_stat_desc { |
940 | const char *msg; | 1875 | const char *msg; |
941 | u64 unit; | 1876 | u64 unit; |
@@ -984,43 +1919,163 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
984 | cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); | 1919 | cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); |
985 | 1920 | ||
986 | } | 1921 | } |
1922 | { | ||
1923 | unsigned long long limit, memsw_limit; | ||
1924 | memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); | ||
1925 | cb->fill(cb, "hierarchical_memory_limit", limit); | ||
1926 | if (do_swap_account) | ||
1927 | cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); | ||
1928 | } | ||
1929 | |||
1930 | #ifdef CONFIG_DEBUG_VM | ||
1931 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); | ||
1932 | |||
1933 | { | ||
1934 | int nid, zid; | ||
1935 | struct mem_cgroup_per_zone *mz; | ||
1936 | unsigned long recent_rotated[2] = {0, 0}; | ||
1937 | unsigned long recent_scanned[2] = {0, 0}; | ||
1938 | |||
1939 | for_each_online_node(nid) | ||
1940 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | ||
1941 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | ||
1942 | |||
1943 | recent_rotated[0] += | ||
1944 | mz->reclaim_stat.recent_rotated[0]; | ||
1945 | recent_rotated[1] += | ||
1946 | mz->reclaim_stat.recent_rotated[1]; | ||
1947 | recent_scanned[0] += | ||
1948 | mz->reclaim_stat.recent_scanned[0]; | ||
1949 | recent_scanned[1] += | ||
1950 | mz->reclaim_stat.recent_scanned[1]; | ||
1951 | } | ||
1952 | cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); | ||
1953 | cb->fill(cb, "recent_rotated_file", recent_rotated[1]); | ||
1954 | cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); | ||
1955 | cb->fill(cb, "recent_scanned_file", recent_scanned[1]); | ||
1956 | } | ||
1957 | #endif | ||
1958 | |||
1959 | return 0; | ||
1960 | } | ||
1961 | |||
1962 | static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) | ||
1963 | { | ||
1964 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
1965 | |||
1966 | return get_swappiness(memcg); | ||
1967 | } | ||
1968 | |||
1969 | static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | ||
1970 | u64 val) | ||
1971 | { | ||
1972 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
1973 | struct mem_cgroup *parent; | ||
1974 | if (val > 100) | ||
1975 | return -EINVAL; | ||
1976 | |||
1977 | if (cgrp->parent == NULL) | ||
1978 | return -EINVAL; | ||
1979 | |||
1980 | parent = mem_cgroup_from_cont(cgrp->parent); | ||
1981 | /* If under hierarchy, only empty-root can set this value */ | ||
1982 | if ((parent->use_hierarchy) || | ||
1983 | (memcg->use_hierarchy && !list_empty(&cgrp->children))) | ||
1984 | return -EINVAL; | ||
1985 | |||
1986 | spin_lock(&memcg->reclaim_param_lock); | ||
1987 | memcg->swappiness = val; | ||
1988 | spin_unlock(&memcg->reclaim_param_lock); | ||
1989 | |||
987 | return 0; | 1990 | return 0; |
988 | } | 1991 | } |
989 | 1992 | ||
1993 | |||
990 | static struct cftype mem_cgroup_files[] = { | 1994 | static struct cftype mem_cgroup_files[] = { |
991 | { | 1995 | { |
992 | .name = "usage_in_bytes", | 1996 | .name = "usage_in_bytes", |
993 | .private = RES_USAGE, | 1997 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
994 | .read_u64 = mem_cgroup_read, | 1998 | .read_u64 = mem_cgroup_read, |
995 | }, | 1999 | }, |
996 | { | 2000 | { |
997 | .name = "max_usage_in_bytes", | 2001 | .name = "max_usage_in_bytes", |
998 | .private = RES_MAX_USAGE, | 2002 | .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), |
999 | .trigger = mem_cgroup_reset, | 2003 | .trigger = mem_cgroup_reset, |
1000 | .read_u64 = mem_cgroup_read, | 2004 | .read_u64 = mem_cgroup_read, |
1001 | }, | 2005 | }, |
1002 | { | 2006 | { |
1003 | .name = "limit_in_bytes", | 2007 | .name = "limit_in_bytes", |
1004 | .private = RES_LIMIT, | 2008 | .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), |
1005 | .write_string = mem_cgroup_write, | 2009 | .write_string = mem_cgroup_write, |
1006 | .read_u64 = mem_cgroup_read, | 2010 | .read_u64 = mem_cgroup_read, |
1007 | }, | 2011 | }, |
1008 | { | 2012 | { |
1009 | .name = "failcnt", | 2013 | .name = "failcnt", |
1010 | .private = RES_FAILCNT, | 2014 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), |
1011 | .trigger = mem_cgroup_reset, | 2015 | .trigger = mem_cgroup_reset, |
1012 | .read_u64 = mem_cgroup_read, | 2016 | .read_u64 = mem_cgroup_read, |
1013 | }, | 2017 | }, |
1014 | { | 2018 | { |
2019 | .name = "stat", | ||
2020 | .read_map = mem_control_stat_show, | ||
2021 | }, | ||
2022 | { | ||
1015 | .name = "force_empty", | 2023 | .name = "force_empty", |
1016 | .trigger = mem_force_empty_write, | 2024 | .trigger = mem_cgroup_force_empty_write, |
1017 | }, | 2025 | }, |
1018 | { | 2026 | { |
1019 | .name = "stat", | 2027 | .name = "use_hierarchy", |
1020 | .read_map = mem_control_stat_show, | 2028 | .write_u64 = mem_cgroup_hierarchy_write, |
2029 | .read_u64 = mem_cgroup_hierarchy_read, | ||
2030 | }, | ||
2031 | { | ||
2032 | .name = "swappiness", | ||
2033 | .read_u64 = mem_cgroup_swappiness_read, | ||
2034 | .write_u64 = mem_cgroup_swappiness_write, | ||
1021 | }, | 2035 | }, |
1022 | }; | 2036 | }; |
1023 | 2037 | ||
2038 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
2039 | static struct cftype memsw_cgroup_files[] = { | ||
2040 | { | ||
2041 | .name = "memsw.usage_in_bytes", | ||
2042 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
2043 | .read_u64 = mem_cgroup_read, | ||
2044 | }, | ||
2045 | { | ||
2046 | .name = "memsw.max_usage_in_bytes", | ||
2047 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
2048 | .trigger = mem_cgroup_reset, | ||
2049 | .read_u64 = mem_cgroup_read, | ||
2050 | }, | ||
2051 | { | ||
2052 | .name = "memsw.limit_in_bytes", | ||
2053 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
2054 | .write_string = mem_cgroup_write, | ||
2055 | .read_u64 = mem_cgroup_read, | ||
2056 | }, | ||
2057 | { | ||
2058 | .name = "memsw.failcnt", | ||
2059 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
2060 | .trigger = mem_cgroup_reset, | ||
2061 | .read_u64 = mem_cgroup_read, | ||
2062 | }, | ||
2063 | }; | ||
2064 | |||
2065 | static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) | ||
2066 | { | ||
2067 | if (!do_swap_account) | ||
2068 | return 0; | ||
2069 | return cgroup_add_files(cont, ss, memsw_cgroup_files, | ||
2070 | ARRAY_SIZE(memsw_cgroup_files)); | ||
2071 | }; | ||
2072 | #else | ||
2073 | static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) | ||
2074 | { | ||
2075 | return 0; | ||
2076 | } | ||
2077 | #endif | ||
2078 | |||
1024 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | 2079 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) |
1025 | { | 2080 | { |
1026 | struct mem_cgroup_per_node *pn; | 2081 | struct mem_cgroup_per_node *pn; |
@@ -1046,7 +2101,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
1046 | 2101 | ||
1047 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 2102 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
1048 | mz = &pn->zoneinfo[zone]; | 2103 | mz = &pn->zoneinfo[zone]; |
1049 | spin_lock_init(&mz->lru_lock); | ||
1050 | for_each_lru(l) | 2104 | for_each_lru(l) |
1051 | INIT_LIST_HEAD(&mz->lists[l]); | 2105 | INIT_LIST_HEAD(&mz->lists[l]); |
1052 | } | 2106 | } |
@@ -1058,55 +2112,113 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
1058 | kfree(mem->info.nodeinfo[node]); | 2112 | kfree(mem->info.nodeinfo[node]); |
1059 | } | 2113 | } |
1060 | 2114 | ||
2115 | static int mem_cgroup_size(void) | ||
2116 | { | ||
2117 | int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); | ||
2118 | return sizeof(struct mem_cgroup) + cpustat_size; | ||
2119 | } | ||
2120 | |||
1061 | static struct mem_cgroup *mem_cgroup_alloc(void) | 2121 | static struct mem_cgroup *mem_cgroup_alloc(void) |
1062 | { | 2122 | { |
1063 | struct mem_cgroup *mem; | 2123 | struct mem_cgroup *mem; |
2124 | int size = mem_cgroup_size(); | ||
1064 | 2125 | ||
1065 | if (sizeof(*mem) < PAGE_SIZE) | 2126 | if (size < PAGE_SIZE) |
1066 | mem = kmalloc(sizeof(*mem), GFP_KERNEL); | 2127 | mem = kmalloc(size, GFP_KERNEL); |
1067 | else | 2128 | else |
1068 | mem = vmalloc(sizeof(*mem)); | 2129 | mem = vmalloc(size); |
1069 | 2130 | ||
1070 | if (mem) | 2131 | if (mem) |
1071 | memset(mem, 0, sizeof(*mem)); | 2132 | memset(mem, 0, size); |
1072 | return mem; | 2133 | return mem; |
1073 | } | 2134 | } |
1074 | 2135 | ||
1075 | static void mem_cgroup_free(struct mem_cgroup *mem) | 2136 | /* |
2137 | * At destroying mem_cgroup, references from swap_cgroup can remain. | ||
2138 | * (scanning all at force_empty is too costly...) | ||
2139 | * | ||
2140 | * Instead of clearing all references at force_empty, we remember | ||
2141 | * the number of reference from swap_cgroup and free mem_cgroup when | ||
2142 | * it goes down to 0. | ||
2143 | * | ||
2144 | * Removal of cgroup itself succeeds regardless of refs from swap. | ||
2145 | */ | ||
2146 | |||
2147 | static void __mem_cgroup_free(struct mem_cgroup *mem) | ||
1076 | { | 2148 | { |
1077 | if (sizeof(*mem) < PAGE_SIZE) | 2149 | int node; |
2150 | |||
2151 | for_each_node_state(node, N_POSSIBLE) | ||
2152 | free_mem_cgroup_per_zone_info(mem, node); | ||
2153 | |||
2154 | if (mem_cgroup_size() < PAGE_SIZE) | ||
1078 | kfree(mem); | 2155 | kfree(mem); |
1079 | else | 2156 | else |
1080 | vfree(mem); | 2157 | vfree(mem); |
1081 | } | 2158 | } |
1082 | 2159 | ||
2160 | static void mem_cgroup_get(struct mem_cgroup *mem) | ||
2161 | { | ||
2162 | atomic_inc(&mem->refcnt); | ||
2163 | } | ||
2164 | |||
2165 | static void mem_cgroup_put(struct mem_cgroup *mem) | ||
2166 | { | ||
2167 | if (atomic_dec_and_test(&mem->refcnt)) | ||
2168 | __mem_cgroup_free(mem); | ||
2169 | } | ||
2170 | |||
2171 | |||
2172 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
2173 | static void __init enable_swap_cgroup(void) | ||
2174 | { | ||
2175 | if (!mem_cgroup_disabled() && really_do_swap_account) | ||
2176 | do_swap_account = 1; | ||
2177 | } | ||
2178 | #else | ||
2179 | static void __init enable_swap_cgroup(void) | ||
2180 | { | ||
2181 | } | ||
2182 | #endif | ||
1083 | 2183 | ||
1084 | static struct cgroup_subsys_state * | 2184 | static struct cgroup_subsys_state * |
1085 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | 2185 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
1086 | { | 2186 | { |
1087 | struct mem_cgroup *mem; | 2187 | struct mem_cgroup *mem, *parent; |
1088 | int node; | 2188 | int node; |
1089 | 2189 | ||
1090 | if (unlikely((cont->parent) == NULL)) { | 2190 | mem = mem_cgroup_alloc(); |
1091 | mem = &init_mem_cgroup; | 2191 | if (!mem) |
1092 | } else { | 2192 | return ERR_PTR(-ENOMEM); |
1093 | mem = mem_cgroup_alloc(); | ||
1094 | if (!mem) | ||
1095 | return ERR_PTR(-ENOMEM); | ||
1096 | } | ||
1097 | |||
1098 | res_counter_init(&mem->res); | ||
1099 | 2193 | ||
1100 | for_each_node_state(node, N_POSSIBLE) | 2194 | for_each_node_state(node, N_POSSIBLE) |
1101 | if (alloc_mem_cgroup_per_zone_info(mem, node)) | 2195 | if (alloc_mem_cgroup_per_zone_info(mem, node)) |
1102 | goto free_out; | 2196 | goto free_out; |
2197 | /* root ? */ | ||
2198 | if (cont->parent == NULL) { | ||
2199 | enable_swap_cgroup(); | ||
2200 | parent = NULL; | ||
2201 | } else { | ||
2202 | parent = mem_cgroup_from_cont(cont->parent); | ||
2203 | mem->use_hierarchy = parent->use_hierarchy; | ||
2204 | } | ||
1103 | 2205 | ||
2206 | if (parent && parent->use_hierarchy) { | ||
2207 | res_counter_init(&mem->res, &parent->res); | ||
2208 | res_counter_init(&mem->memsw, &parent->memsw); | ||
2209 | } else { | ||
2210 | res_counter_init(&mem->res, NULL); | ||
2211 | res_counter_init(&mem->memsw, NULL); | ||
2212 | } | ||
2213 | mem->last_scanned_child = NULL; | ||
2214 | spin_lock_init(&mem->reclaim_param_lock); | ||
2215 | |||
2216 | if (parent) | ||
2217 | mem->swappiness = get_swappiness(parent); | ||
2218 | atomic_set(&mem->refcnt, 1); | ||
1104 | return &mem->css; | 2219 | return &mem->css; |
1105 | free_out: | 2220 | free_out: |
1106 | for_each_node_state(node, N_POSSIBLE) | 2221 | __mem_cgroup_free(mem); |
1107 | free_mem_cgroup_per_zone_info(mem, node); | ||
1108 | if (cont->parent != NULL) | ||
1109 | mem_cgroup_free(mem); | ||
1110 | return ERR_PTR(-ENOMEM); | 2222 | return ERR_PTR(-ENOMEM); |
1111 | } | 2223 | } |
1112 | 2224 | ||
@@ -1114,26 +2226,26 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, | |||
1114 | struct cgroup *cont) | 2226 | struct cgroup *cont) |
1115 | { | 2227 | { |
1116 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2228 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
1117 | mem_cgroup_force_empty(mem); | 2229 | mem_cgroup_force_empty(mem, false); |
1118 | } | 2230 | } |
1119 | 2231 | ||
1120 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, | 2232 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, |
1121 | struct cgroup *cont) | 2233 | struct cgroup *cont) |
1122 | { | 2234 | { |
1123 | int node; | 2235 | mem_cgroup_put(mem_cgroup_from_cont(cont)); |
1124 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | ||
1125 | |||
1126 | for_each_node_state(node, N_POSSIBLE) | ||
1127 | free_mem_cgroup_per_zone_info(mem, node); | ||
1128 | |||
1129 | mem_cgroup_free(mem_cgroup_from_cont(cont)); | ||
1130 | } | 2236 | } |
1131 | 2237 | ||
1132 | static int mem_cgroup_populate(struct cgroup_subsys *ss, | 2238 | static int mem_cgroup_populate(struct cgroup_subsys *ss, |
1133 | struct cgroup *cont) | 2239 | struct cgroup *cont) |
1134 | { | 2240 | { |
1135 | return cgroup_add_files(cont, ss, mem_cgroup_files, | 2241 | int ret; |
1136 | ARRAY_SIZE(mem_cgroup_files)); | 2242 | |
2243 | ret = cgroup_add_files(cont, ss, mem_cgroup_files, | ||
2244 | ARRAY_SIZE(mem_cgroup_files)); | ||
2245 | |||
2246 | if (!ret) | ||
2247 | ret = register_memsw_files(cont, ss); | ||
2248 | return ret; | ||
1137 | } | 2249 | } |
1138 | 2250 | ||
1139 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 2251 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
@@ -1141,25 +2253,12 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
1141 | struct cgroup *old_cont, | 2253 | struct cgroup *old_cont, |
1142 | struct task_struct *p) | 2254 | struct task_struct *p) |
1143 | { | 2255 | { |
1144 | struct mm_struct *mm; | 2256 | mutex_lock(&memcg_tasklist); |
1145 | struct mem_cgroup *mem, *old_mem; | ||
1146 | |||
1147 | mm = get_task_mm(p); | ||
1148 | if (mm == NULL) | ||
1149 | return; | ||
1150 | |||
1151 | mem = mem_cgroup_from_cont(cont); | ||
1152 | old_mem = mem_cgroup_from_cont(old_cont); | ||
1153 | |||
1154 | /* | 2257 | /* |
1155 | * Only thread group leaders are allowed to migrate, the mm_struct is | 2258 | * FIXME: It's better to move charges of this process from old |
1156 | * in effect owned by the leader | 2259 | * memcg to new memcg. But it's just on TODO-List now. |
1157 | */ | 2260 | */ |
1158 | if (!thread_group_leader(p)) | 2261 | mutex_unlock(&memcg_tasklist); |
1159 | goto out; | ||
1160 | |||
1161 | out: | ||
1162 | mmput(mm); | ||
1163 | } | 2262 | } |
1164 | 2263 | ||
1165 | struct cgroup_subsys mem_cgroup_subsys = { | 2264 | struct cgroup_subsys mem_cgroup_subsys = { |
@@ -1172,3 +2271,13 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
1172 | .attach = mem_cgroup_move_task, | 2271 | .attach = mem_cgroup_move_task, |
1173 | .early_init = 0, | 2272 | .early_init = 0, |
1174 | }; | 2273 | }; |
2274 | |||
2275 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
2276 | |||
2277 | static int __init disable_swap_account(char *s) | ||
2278 | { | ||
2279 | really_do_swap_account = 0; | ||
2280 | return 1; | ||
2281 | } | ||
2282 | __setup("noswapaccount", disable_swap_account); | ||
2283 | #endif | ||
diff --git a/mm/memory.c b/mm/memory.c index 7b9db658aca2..e009ce870859 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -52,6 +52,9 @@ | |||
52 | #include <linux/writeback.h> | 52 | #include <linux/writeback.h> |
53 | #include <linux/memcontrol.h> | 53 | #include <linux/memcontrol.h> |
54 | #include <linux/mmu_notifier.h> | 54 | #include <linux/mmu_notifier.h> |
55 | #include <linux/kallsyms.h> | ||
56 | #include <linux/swapops.h> | ||
57 | #include <linux/elf.h> | ||
55 | 58 | ||
56 | #include <asm/pgalloc.h> | 59 | #include <asm/pgalloc.h> |
57 | #include <asm/uaccess.h> | 60 | #include <asm/uaccess.h> |
@@ -59,9 +62,6 @@ | |||
59 | #include <asm/tlbflush.h> | 62 | #include <asm/tlbflush.h> |
60 | #include <asm/pgtable.h> | 63 | #include <asm/pgtable.h> |
61 | 64 | ||
62 | #include <linux/swapops.h> | ||
63 | #include <linux/elf.h> | ||
64 | |||
65 | #include "internal.h" | 65 | #include "internal.h" |
66 | 66 | ||
67 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 67 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
@@ -375,15 +375,65 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) | |||
375 | * | 375 | * |
376 | * The calling function must still handle the error. | 376 | * The calling function must still handle the error. |
377 | */ | 377 | */ |
378 | static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, | 378 | static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, |
379 | unsigned long vaddr) | 379 | pte_t pte, struct page *page) |
380 | { | 380 | { |
381 | printk(KERN_ERR "Bad pte = %08llx, process = %s, " | 381 | pgd_t *pgd = pgd_offset(vma->vm_mm, addr); |
382 | "vm_flags = %lx, vaddr = %lx\n", | 382 | pud_t *pud = pud_offset(pgd, addr); |
383 | (long long)pte_val(pte), | 383 | pmd_t *pmd = pmd_offset(pud, addr); |
384 | (vma->vm_mm == current->mm ? current->comm : "???"), | 384 | struct address_space *mapping; |
385 | vma->vm_flags, vaddr); | 385 | pgoff_t index; |
386 | static unsigned long resume; | ||
387 | static unsigned long nr_shown; | ||
388 | static unsigned long nr_unshown; | ||
389 | |||
390 | /* | ||
391 | * Allow a burst of 60 reports, then keep quiet for that minute; | ||
392 | * or allow a steady drip of one report per second. | ||
393 | */ | ||
394 | if (nr_shown == 60) { | ||
395 | if (time_before(jiffies, resume)) { | ||
396 | nr_unshown++; | ||
397 | return; | ||
398 | } | ||
399 | if (nr_unshown) { | ||
400 | printk(KERN_ALERT | ||
401 | "BUG: Bad page map: %lu messages suppressed\n", | ||
402 | nr_unshown); | ||
403 | nr_unshown = 0; | ||
404 | } | ||
405 | nr_shown = 0; | ||
406 | } | ||
407 | if (nr_shown++ == 0) | ||
408 | resume = jiffies + 60 * HZ; | ||
409 | |||
410 | mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; | ||
411 | index = linear_page_index(vma, addr); | ||
412 | |||
413 | printk(KERN_ALERT | ||
414 | "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", | ||
415 | current->comm, | ||
416 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); | ||
417 | if (page) { | ||
418 | printk(KERN_ALERT | ||
419 | "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", | ||
420 | page, (void *)page->flags, page_count(page), | ||
421 | page_mapcount(page), page->mapping, page->index); | ||
422 | } | ||
423 | printk(KERN_ALERT | ||
424 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", | ||
425 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); | ||
426 | /* | ||
427 | * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y | ||
428 | */ | ||
429 | if (vma->vm_ops) | ||
430 | print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", | ||
431 | (unsigned long)vma->vm_ops->fault); | ||
432 | if (vma->vm_file && vma->vm_file->f_op) | ||
433 | print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", | ||
434 | (unsigned long)vma->vm_file->f_op->mmap); | ||
386 | dump_stack(); | 435 | dump_stack(); |
436 | add_taint(TAINT_BAD_PAGE); | ||
387 | } | 437 | } |
388 | 438 | ||
389 | static inline int is_cow_mapping(unsigned int flags) | 439 | static inline int is_cow_mapping(unsigned int flags) |
@@ -441,21 +491,18 @@ static inline int is_cow_mapping(unsigned int flags) | |||
441 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | 491 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, |
442 | pte_t pte) | 492 | pte_t pte) |
443 | { | 493 | { |
444 | unsigned long pfn; | 494 | unsigned long pfn = pte_pfn(pte); |
445 | 495 | ||
446 | if (HAVE_PTE_SPECIAL) { | 496 | if (HAVE_PTE_SPECIAL) { |
447 | if (likely(!pte_special(pte))) { | 497 | if (likely(!pte_special(pte))) |
448 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 498 | goto check_pfn; |
449 | return pte_page(pte); | 499 | if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) |
450 | } | 500 | print_bad_pte(vma, addr, pte, NULL); |
451 | VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); | ||
452 | return NULL; | 501 | return NULL; |
453 | } | 502 | } |
454 | 503 | ||
455 | /* !HAVE_PTE_SPECIAL case follows: */ | 504 | /* !HAVE_PTE_SPECIAL case follows: */ |
456 | 505 | ||
457 | pfn = pte_pfn(pte); | ||
458 | |||
459 | if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { | 506 | if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { |
460 | if (vma->vm_flags & VM_MIXEDMAP) { | 507 | if (vma->vm_flags & VM_MIXEDMAP) { |
461 | if (!pfn_valid(pfn)) | 508 | if (!pfn_valid(pfn)) |
@@ -471,11 +518,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
471 | } | 518 | } |
472 | } | 519 | } |
473 | 520 | ||
474 | VM_BUG_ON(!pfn_valid(pfn)); | 521 | check_pfn: |
522 | if (unlikely(pfn > highest_memmap_pfn)) { | ||
523 | print_bad_pte(vma, addr, pte, NULL); | ||
524 | return NULL; | ||
525 | } | ||
475 | 526 | ||
476 | /* | 527 | /* |
477 | * NOTE! We still have PageReserved() pages in the page tables. | 528 | * NOTE! We still have PageReserved() pages in the page tables. |
478 | * | ||
479 | * eg. VDSO mappings can cause them to exist. | 529 | * eg. VDSO mappings can cause them to exist. |
480 | */ | 530 | */ |
481 | out: | 531 | out: |
@@ -767,11 +817,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
767 | else { | 817 | else { |
768 | if (pte_dirty(ptent)) | 818 | if (pte_dirty(ptent)) |
769 | set_page_dirty(page); | 819 | set_page_dirty(page); |
770 | if (pte_young(ptent)) | 820 | if (pte_young(ptent) && |
771 | SetPageReferenced(page); | 821 | likely(!VM_SequentialReadHint(vma))) |
822 | mark_page_accessed(page); | ||
772 | file_rss--; | 823 | file_rss--; |
773 | } | 824 | } |
774 | page_remove_rmap(page, vma); | 825 | page_remove_rmap(page); |
826 | if (unlikely(page_mapcount(page) < 0)) | ||
827 | print_bad_pte(vma, addr, ptent, page); | ||
775 | tlb_remove_page(tlb, page); | 828 | tlb_remove_page(tlb, page); |
776 | continue; | 829 | continue; |
777 | } | 830 | } |
@@ -781,8 +834,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
781 | */ | 834 | */ |
782 | if (unlikely(details)) | 835 | if (unlikely(details)) |
783 | continue; | 836 | continue; |
784 | if (!pte_file(ptent)) | 837 | if (pte_file(ptent)) { |
785 | free_swap_and_cache(pte_to_swp_entry(ptent)); | 838 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) |
839 | print_bad_pte(vma, addr, ptent, NULL); | ||
840 | } else if | ||
841 | (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) | ||
842 | print_bad_pte(vma, addr, ptent, NULL); | ||
786 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 843 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
787 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); | 844 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); |
788 | 845 | ||
@@ -1153,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1153 | int write = !!(flags & GUP_FLAGS_WRITE); | 1210 | int write = !!(flags & GUP_FLAGS_WRITE); |
1154 | int force = !!(flags & GUP_FLAGS_FORCE); | 1211 | int force = !!(flags & GUP_FLAGS_FORCE); |
1155 | int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); | 1212 | int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); |
1213 | int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); | ||
1156 | 1214 | ||
1157 | if (len <= 0) | 1215 | if (len <= 0) |
1158 | return 0; | 1216 | return 0; |
@@ -1231,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1231 | struct page *page; | 1289 | struct page *page; |
1232 | 1290 | ||
1233 | /* | 1291 | /* |
1234 | * If tsk is ooming, cut off its access to large memory | 1292 | * If we have a pending SIGKILL, don't keep faulting |
1235 | * allocations. It has a pending SIGKILL, but it can't | 1293 | * pages and potentially allocating memory, unless |
1236 | * be processed until returning to user space. | 1294 | * current is handling munlock--e.g., on exit. In |
1295 | * that case, we are not allocating memory. Rather, | ||
1296 | * we're only unlocking already resident/mapped pages. | ||
1237 | */ | 1297 | */ |
1238 | if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) | 1298 | if (unlikely(!ignore_sigkill && |
1239 | return i ? i : -ENOMEM; | 1299 | fatal_signal_pending(current))) |
1300 | return i ? i : -ERESTARTSYS; | ||
1240 | 1301 | ||
1241 | if (write) | 1302 | if (write) |
1242 | foll_flags |= FOLL_WRITE; | 1303 | foll_flags |= FOLL_WRITE; |
@@ -1263,9 +1324,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1263 | * do_wp_page has broken COW when necessary, | 1324 | * do_wp_page has broken COW when necessary, |
1264 | * even if maybe_mkwrite decided not to set | 1325 | * even if maybe_mkwrite decided not to set |
1265 | * pte_write. We can thus safely do subsequent | 1326 | * pte_write. We can thus safely do subsequent |
1266 | * page lookups as if they were reads. | 1327 | * page lookups as if they were reads. But only |
1328 | * do so when looping for pte_write is futile: | ||
1329 | * in some cases userspace may also be wanting | ||
1330 | * to write to the gotten user page, which a | ||
1331 | * read fault here might prevent (a readonly | ||
1332 | * page might get reCOWed by userspace write). | ||
1267 | */ | 1333 | */ |
1268 | if (ret & VM_FAULT_WRITE) | 1334 | if ((ret & VM_FAULT_WRITE) && |
1335 | !(vma->vm_flags & VM_WRITE)) | ||
1269 | foll_flags &= ~FOLL_WRITE; | 1336 | foll_flags &= ~FOLL_WRITE; |
1270 | 1337 | ||
1271 | cond_resched(); | 1338 | cond_resched(); |
@@ -1644,6 +1711,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1644 | 1711 | ||
1645 | BUG_ON(pmd_huge(*pmd)); | 1712 | BUG_ON(pmd_huge(*pmd)); |
1646 | 1713 | ||
1714 | arch_enter_lazy_mmu_mode(); | ||
1715 | |||
1647 | token = pmd_pgtable(*pmd); | 1716 | token = pmd_pgtable(*pmd); |
1648 | 1717 | ||
1649 | do { | 1718 | do { |
@@ -1652,6 +1721,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1652 | break; | 1721 | break; |
1653 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1722 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1654 | 1723 | ||
1724 | arch_leave_lazy_mmu_mode(); | ||
1725 | |||
1655 | if (mm != &init_mm) | 1726 | if (mm != &init_mm) |
1656 | pte_unmap_unlock(pte-1, ptl); | 1727 | pte_unmap_unlock(pte-1, ptl); |
1657 | return err; | 1728 | return err; |
@@ -1837,10 +1908,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1837 | * not dirty accountable. | 1908 | * not dirty accountable. |
1838 | */ | 1909 | */ |
1839 | if (PageAnon(old_page)) { | 1910 | if (PageAnon(old_page)) { |
1840 | if (trylock_page(old_page)) { | 1911 | if (!trylock_page(old_page)) { |
1841 | reuse = can_share_swap_page(old_page); | 1912 | page_cache_get(old_page); |
1842 | unlock_page(old_page); | 1913 | pte_unmap_unlock(page_table, ptl); |
1914 | lock_page(old_page); | ||
1915 | page_table = pte_offset_map_lock(mm, pmd, address, | ||
1916 | &ptl); | ||
1917 | if (!pte_same(*page_table, orig_pte)) { | ||
1918 | unlock_page(old_page); | ||
1919 | page_cache_release(old_page); | ||
1920 | goto unlock; | ||
1921 | } | ||
1922 | page_cache_release(old_page); | ||
1843 | } | 1923 | } |
1924 | reuse = reuse_swap_page(old_page); | ||
1925 | unlock_page(old_page); | ||
1844 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 1926 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
1845 | (VM_WRITE|VM_SHARED))) { | 1927 | (VM_WRITE|VM_SHARED))) { |
1846 | /* | 1928 | /* |
@@ -1918,7 +2000,7 @@ gotten: | |||
1918 | cow_user_page(new_page, old_page, address, vma); | 2000 | cow_user_page(new_page, old_page, address, vma); |
1919 | __SetPageUptodate(new_page); | 2001 | __SetPageUptodate(new_page); |
1920 | 2002 | ||
1921 | if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) | 2003 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) |
1922 | goto oom_free_new; | 2004 | goto oom_free_new; |
1923 | 2005 | ||
1924 | /* | 2006 | /* |
@@ -1943,11 +2025,7 @@ gotten: | |||
1943 | * thread doing COW. | 2025 | * thread doing COW. |
1944 | */ | 2026 | */ |
1945 | ptep_clear_flush_notify(vma, address, page_table); | 2027 | ptep_clear_flush_notify(vma, address, page_table); |
1946 | SetPageSwapBacked(new_page); | ||
1947 | lru_cache_add_active_or_unevictable(new_page, vma); | ||
1948 | page_add_new_anon_rmap(new_page, vma, address); | 2028 | page_add_new_anon_rmap(new_page, vma, address); |
1949 | |||
1950 | //TODO: is this safe? do_anonymous_page() does it this way. | ||
1951 | set_pte_at(mm, address, page_table, entry); | 2029 | set_pte_at(mm, address, page_table, entry); |
1952 | update_mmu_cache(vma, address, entry); | 2030 | update_mmu_cache(vma, address, entry); |
1953 | if (old_page) { | 2031 | if (old_page) { |
@@ -1973,7 +2051,7 @@ gotten: | |||
1973 | * mapcount is visible. So transitively, TLBs to | 2051 | * mapcount is visible. So transitively, TLBs to |
1974 | * old page will be flushed before it can be reused. | 2052 | * old page will be flushed before it can be reused. |
1975 | */ | 2053 | */ |
1976 | page_remove_rmap(old_page, vma); | 2054 | page_remove_rmap(old_page); |
1977 | } | 2055 | } |
1978 | 2056 | ||
1979 | /* Free the old page.. */ | 2057 | /* Free the old page.. */ |
@@ -2314,6 +2392,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2314 | struct page *page; | 2392 | struct page *page; |
2315 | swp_entry_t entry; | 2393 | swp_entry_t entry; |
2316 | pte_t pte; | 2394 | pte_t pte; |
2395 | struct mem_cgroup *ptr = NULL; | ||
2317 | int ret = 0; | 2396 | int ret = 0; |
2318 | 2397 | ||
2319 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | 2398 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
@@ -2352,7 +2431,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2352 | lock_page(page); | 2431 | lock_page(page); |
2353 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2432 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2354 | 2433 | ||
2355 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { | 2434 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { |
2356 | ret = VM_FAULT_OOM; | 2435 | ret = VM_FAULT_OOM; |
2357 | unlock_page(page); | 2436 | unlock_page(page); |
2358 | goto out; | 2437 | goto out; |
@@ -2370,22 +2449,35 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2370 | goto out_nomap; | 2449 | goto out_nomap; |
2371 | } | 2450 | } |
2372 | 2451 | ||
2373 | /* The page isn't present yet, go ahead with the fault. */ | 2452 | /* |
2453 | * The page isn't present yet, go ahead with the fault. | ||
2454 | * | ||
2455 | * Be careful about the sequence of operations here. | ||
2456 | * To get its accounting right, reuse_swap_page() must be called | ||
2457 | * while the page is counted on swap but not yet in mapcount i.e. | ||
2458 | * before page_add_anon_rmap() and swap_free(); try_to_free_swap() | ||
2459 | * must be called after the swap_free(), or it will never succeed. | ||
2460 | * Because delete_from_swap_page() may be called by reuse_swap_page(), | ||
2461 | * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry | ||
2462 | * in page->private. In this case, a record in swap_cgroup is silently | ||
2463 | * discarded at swap_free(). | ||
2464 | */ | ||
2374 | 2465 | ||
2375 | inc_mm_counter(mm, anon_rss); | 2466 | inc_mm_counter(mm, anon_rss); |
2376 | pte = mk_pte(page, vma->vm_page_prot); | 2467 | pte = mk_pte(page, vma->vm_page_prot); |
2377 | if (write_access && can_share_swap_page(page)) { | 2468 | if (write_access && reuse_swap_page(page)) { |
2378 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2469 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
2379 | write_access = 0; | 2470 | write_access = 0; |
2380 | } | 2471 | } |
2381 | |||
2382 | flush_icache_page(vma, page); | 2472 | flush_icache_page(vma, page); |
2383 | set_pte_at(mm, address, page_table, pte); | 2473 | set_pte_at(mm, address, page_table, pte); |
2384 | page_add_anon_rmap(page, vma, address); | 2474 | page_add_anon_rmap(page, vma, address); |
2475 | /* It's better to call commit-charge after rmap is established */ | ||
2476 | mem_cgroup_commit_charge_swapin(page, ptr); | ||
2385 | 2477 | ||
2386 | swap_free(entry); | 2478 | swap_free(entry); |
2387 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) | 2479 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) |
2388 | remove_exclusive_swap_page(page); | 2480 | try_to_free_swap(page); |
2389 | unlock_page(page); | 2481 | unlock_page(page); |
2390 | 2482 | ||
2391 | if (write_access) { | 2483 | if (write_access) { |
@@ -2402,7 +2494,7 @@ unlock: | |||
2402 | out: | 2494 | out: |
2403 | return ret; | 2495 | return ret; |
2404 | out_nomap: | 2496 | out_nomap: |
2405 | mem_cgroup_uncharge_page(page); | 2497 | mem_cgroup_cancel_charge_swapin(ptr); |
2406 | pte_unmap_unlock(page_table, ptl); | 2498 | pte_unmap_unlock(page_table, ptl); |
2407 | unlock_page(page); | 2499 | unlock_page(page); |
2408 | page_cache_release(page); | 2500 | page_cache_release(page); |
@@ -2432,7 +2524,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2432 | goto oom; | 2524 | goto oom; |
2433 | __SetPageUptodate(page); | 2525 | __SetPageUptodate(page); |
2434 | 2526 | ||
2435 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) | 2527 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) |
2436 | goto oom_free_page; | 2528 | goto oom_free_page; |
2437 | 2529 | ||
2438 | entry = mk_pte(page, vma->vm_page_prot); | 2530 | entry = mk_pte(page, vma->vm_page_prot); |
@@ -2442,8 +2534,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2442 | if (!pte_none(*page_table)) | 2534 | if (!pte_none(*page_table)) |
2443 | goto release; | 2535 | goto release; |
2444 | inc_mm_counter(mm, anon_rss); | 2536 | inc_mm_counter(mm, anon_rss); |
2445 | SetPageSwapBacked(page); | ||
2446 | lru_cache_add_active_or_unevictable(page, vma); | ||
2447 | page_add_new_anon_rmap(page, vma, address); | 2537 | page_add_new_anon_rmap(page, vma, address); |
2448 | set_pte_at(mm, address, page_table, entry); | 2538 | set_pte_at(mm, address, page_table, entry); |
2449 | 2539 | ||
@@ -2525,7 +2615,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2525 | ret = VM_FAULT_OOM; | 2615 | ret = VM_FAULT_OOM; |
2526 | goto out; | 2616 | goto out; |
2527 | } | 2617 | } |
2528 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { | 2618 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { |
2529 | ret = VM_FAULT_OOM; | 2619 | ret = VM_FAULT_OOM; |
2530 | page_cache_release(page); | 2620 | page_cache_release(page); |
2531 | goto out; | 2621 | goto out; |
@@ -2591,8 +2681,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2591 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2681 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2592 | if (anon) { | 2682 | if (anon) { |
2593 | inc_mm_counter(mm, anon_rss); | 2683 | inc_mm_counter(mm, anon_rss); |
2594 | SetPageSwapBacked(page); | ||
2595 | lru_cache_add_active_or_unevictable(page, vma); | ||
2596 | page_add_new_anon_rmap(page, vma, address); | 2684 | page_add_new_anon_rmap(page, vma, address); |
2597 | } else { | 2685 | } else { |
2598 | inc_mm_counter(mm, file_rss); | 2686 | inc_mm_counter(mm, file_rss); |
@@ -2602,7 +2690,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2602 | get_page(dirty_page); | 2690 | get_page(dirty_page); |
2603 | } | 2691 | } |
2604 | } | 2692 | } |
2605 | //TODO: is this safe? do_anonymous_page() does it this way. | ||
2606 | set_pte_at(mm, address, page_table, entry); | 2693 | set_pte_at(mm, address, page_table, entry); |
2607 | 2694 | ||
2608 | /* no need to invalidate: a not-present page won't be cached */ | 2695 | /* no need to invalidate: a not-present page won't be cached */ |
@@ -2666,12 +2753,11 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2666 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | 2753 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
2667 | return 0; | 2754 | return 0; |
2668 | 2755 | ||
2669 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || | 2756 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { |
2670 | !(vma->vm_flags & VM_CAN_NONLINEAR))) { | ||
2671 | /* | 2757 | /* |
2672 | * Page table corrupted: show pte and kill process. | 2758 | * Page table corrupted: show pte and kill process. |
2673 | */ | 2759 | */ |
2674 | print_bad_pte(vma, orig_pte, address); | 2760 | print_bad_pte(vma, address, orig_pte, NULL); |
2675 | return VM_FAULT_OOM; | 2761 | return VM_FAULT_OOM; |
2676 | } | 2762 | } |
2677 | 2763 | ||
@@ -2953,7 +3039,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, | |||
2953 | { | 3039 | { |
2954 | resource_size_t phys_addr; | 3040 | resource_size_t phys_addr; |
2955 | unsigned long prot = 0; | 3041 | unsigned long prot = 0; |
2956 | void *maddr; | 3042 | void __iomem *maddr; |
2957 | int offset = addr & (PAGE_SIZE-1); | 3043 | int offset = addr & (PAGE_SIZE-1); |
2958 | 3044 | ||
2959 | if (follow_phys(vma, addr, write, &prot, &phys_addr)) | 3045 | if (follow_phys(vma, addr, write, &prot, &phys_addr)) |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b17371185468..c083cf5fd6df 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -216,7 +216,8 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
216 | return 0; | 216 | return 0; |
217 | } | 217 | } |
218 | 218 | ||
219 | static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn) | 219 | static int __meminit __add_section(int nid, struct zone *zone, |
220 | unsigned long phys_start_pfn) | ||
220 | { | 221 | { |
221 | int nr_pages = PAGES_PER_SECTION; | 222 | int nr_pages = PAGES_PER_SECTION; |
222 | int ret; | 223 | int ret; |
@@ -234,7 +235,7 @@ static int __meminit __add_section(struct zone *zone, unsigned long phys_start_p | |||
234 | if (ret < 0) | 235 | if (ret < 0) |
235 | return ret; | 236 | return ret; |
236 | 237 | ||
237 | return register_new_memory(__pfn_to_section(phys_start_pfn)); | 238 | return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); |
238 | } | 239 | } |
239 | 240 | ||
240 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | 241 | #ifdef CONFIG_SPARSEMEM_VMEMMAP |
@@ -273,8 +274,8 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) | |||
273 | * call this function after deciding the zone to which to | 274 | * call this function after deciding the zone to which to |
274 | * add the new pages. | 275 | * add the new pages. |
275 | */ | 276 | */ |
276 | int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, | 277 | int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, |
277 | unsigned long nr_pages) | 278 | unsigned long nr_pages) |
278 | { | 279 | { |
279 | unsigned long i; | 280 | unsigned long i; |
280 | int err = 0; | 281 | int err = 0; |
@@ -284,7 +285,7 @@ int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
284 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); | 285 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); |
285 | 286 | ||
286 | for (i = start_sec; i <= end_sec; i++) { | 287 | for (i = start_sec; i <= end_sec; i++) { |
287 | err = __add_section(zone, i << PFN_SECTION_SHIFT); | 288 | err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); |
288 | 289 | ||
289 | /* | 290 | /* |
290 | * EEXIST is finally dealt with by ioresource collision | 291 | * EEXIST is finally dealt with by ioresource collision |
@@ -626,15 +627,12 @@ int scan_lru_pages(unsigned long start, unsigned long end) | |||
626 | } | 627 | } |
627 | 628 | ||
628 | static struct page * | 629 | static struct page * |
629 | hotremove_migrate_alloc(struct page *page, | 630 | hotremove_migrate_alloc(struct page *page, unsigned long private, int **x) |
630 | unsigned long private, | ||
631 | int **x) | ||
632 | { | 631 | { |
633 | /* This should be improoooooved!! */ | 632 | /* This should be improooooved!! */ |
634 | return alloc_page(GFP_HIGHUSER_PAGECACHE); | 633 | return alloc_page(GFP_HIGHUSER_MOVABLE); |
635 | } | 634 | } |
636 | 635 | ||
637 | |||
638 | #define NR_OFFLINE_AT_ONCE_PAGES (256) | 636 | #define NR_OFFLINE_AT_ONCE_PAGES (256) |
639 | static int | 637 | static int |
640 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | 638 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) |
diff --git a/mm/migrate.c b/mm/migrate.c index 21631ab8c08b..a30ea5fcf9f1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -121,20 +121,6 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
121 | if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) | 121 | if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) |
122 | goto out; | 122 | goto out; |
123 | 123 | ||
124 | /* | ||
125 | * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge. | ||
126 | * Failure is not an option here: we're now expected to remove every | ||
127 | * migration pte, and will cause crashes otherwise. Normally this | ||
128 | * is not an issue: mem_cgroup_prepare_migration bumped up the old | ||
129 | * page_cgroup count for safety, that's now attached to the new page, | ||
130 | * so this charge should just be another incrementation of the count, | ||
131 | * to keep in balance with rmap.c's mem_cgroup_uncharging. But if | ||
132 | * there's been a force_empty, those reference counts may no longer | ||
133 | * be reliable, and this charge can actually fail: oh well, we don't | ||
134 | * make the situation any worse by proceeding as if it had succeeded. | ||
135 | */ | ||
136 | mem_cgroup_charge(new, mm, GFP_ATOMIC); | ||
137 | |||
138 | get_page(new); | 124 | get_page(new); |
139 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 125 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
140 | if (is_write_migration_entry(entry)) | 126 | if (is_write_migration_entry(entry)) |
@@ -300,12 +286,10 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
300 | * Now we know that no one else is looking at the page. | 286 | * Now we know that no one else is looking at the page. |
301 | */ | 287 | */ |
302 | get_page(newpage); /* add cache reference */ | 288 | get_page(newpage); /* add cache reference */ |
303 | #ifdef CONFIG_SWAP | ||
304 | if (PageSwapCache(page)) { | 289 | if (PageSwapCache(page)) { |
305 | SetPageSwapCache(newpage); | 290 | SetPageSwapCache(newpage); |
306 | set_page_private(newpage, page_private(page)); | 291 | set_page_private(newpage, page_private(page)); |
307 | } | 292 | } |
308 | #endif | ||
309 | 293 | ||
310 | radix_tree_replace_slot(pslot, newpage); | 294 | radix_tree_replace_slot(pslot, newpage); |
311 | 295 | ||
@@ -373,18 +357,13 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
373 | 357 | ||
374 | mlock_migrate_page(newpage, page); | 358 | mlock_migrate_page(newpage, page); |
375 | 359 | ||
376 | #ifdef CONFIG_SWAP | ||
377 | ClearPageSwapCache(page); | 360 | ClearPageSwapCache(page); |
378 | #endif | ||
379 | ClearPagePrivate(page); | 361 | ClearPagePrivate(page); |
380 | set_page_private(page, 0); | 362 | set_page_private(page, 0); |
381 | /* page->mapping contains a flag for PageAnon() */ | 363 | /* page->mapping contains a flag for PageAnon() */ |
382 | anon = PageAnon(page); | 364 | anon = PageAnon(page); |
383 | page->mapping = NULL; | 365 | page->mapping = NULL; |
384 | 366 | ||
385 | if (!anon) /* This page was removed from radix-tree. */ | ||
386 | mem_cgroup_uncharge_cache_page(page); | ||
387 | |||
388 | /* | 367 | /* |
389 | * If any waiters have accumulated on the new page then | 368 | * If any waiters have accumulated on the new page then |
390 | * wake them up. | 369 | * wake them up. |
@@ -618,6 +597,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
618 | struct page *newpage = get_new_page(page, private, &result); | 597 | struct page *newpage = get_new_page(page, private, &result); |
619 | int rcu_locked = 0; | 598 | int rcu_locked = 0; |
620 | int charge = 0; | 599 | int charge = 0; |
600 | struct mem_cgroup *mem; | ||
621 | 601 | ||
622 | if (!newpage) | 602 | if (!newpage) |
623 | return -ENOMEM; | 603 | return -ENOMEM; |
@@ -627,24 +607,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
627 | goto move_newpage; | 607 | goto move_newpage; |
628 | } | 608 | } |
629 | 609 | ||
630 | charge = mem_cgroup_prepare_migration(page, newpage); | ||
631 | if (charge == -ENOMEM) { | ||
632 | rc = -ENOMEM; | ||
633 | goto move_newpage; | ||
634 | } | ||
635 | /* prepare cgroup just returns 0 or -ENOMEM */ | 610 | /* prepare cgroup just returns 0 or -ENOMEM */ |
636 | BUG_ON(charge); | ||
637 | |||
638 | rc = -EAGAIN; | 611 | rc = -EAGAIN; |
612 | |||
639 | if (!trylock_page(page)) { | 613 | if (!trylock_page(page)) { |
640 | if (!force) | 614 | if (!force) |
641 | goto move_newpage; | 615 | goto move_newpage; |
642 | lock_page(page); | 616 | lock_page(page); |
643 | } | 617 | } |
644 | 618 | ||
619 | /* charge against new page */ | ||
620 | charge = mem_cgroup_prepare_migration(page, &mem); | ||
621 | if (charge == -ENOMEM) { | ||
622 | rc = -ENOMEM; | ||
623 | goto unlock; | ||
624 | } | ||
625 | BUG_ON(charge); | ||
626 | |||
645 | if (PageWriteback(page)) { | 627 | if (PageWriteback(page)) { |
646 | if (!force) | 628 | if (!force) |
647 | goto unlock; | 629 | goto uncharge; |
648 | wait_on_page_writeback(page); | 630 | wait_on_page_writeback(page); |
649 | } | 631 | } |
650 | /* | 632 | /* |
@@ -697,7 +679,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
697 | rcu_unlock: | 679 | rcu_unlock: |
698 | if (rcu_locked) | 680 | if (rcu_locked) |
699 | rcu_read_unlock(); | 681 | rcu_read_unlock(); |
700 | 682 | uncharge: | |
683 | if (!charge) | ||
684 | mem_cgroup_end_migration(mem, page, newpage); | ||
701 | unlock: | 685 | unlock: |
702 | unlock_page(page); | 686 | unlock_page(page); |
703 | 687 | ||
@@ -713,8 +697,6 @@ unlock: | |||
713 | } | 697 | } |
714 | 698 | ||
715 | move_newpage: | 699 | move_newpage: |
716 | if (!charge) | ||
717 | mem_cgroup_end_migration(newpage); | ||
718 | 700 | ||
719 | /* | 701 | /* |
720 | * Move the new page to the LRU. If migration was not successful | 702 | * Move the new page to the LRU. If migration was not successful |
@@ -848,12 +830,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
848 | struct vm_area_struct *vma; | 830 | struct vm_area_struct *vma; |
849 | struct page *page; | 831 | struct page *page; |
850 | 832 | ||
851 | /* | ||
852 | * A valid page pointer that will not match any of the | ||
853 | * pages that will be moved. | ||
854 | */ | ||
855 | pp->page = ZERO_PAGE(0); | ||
856 | |||
857 | err = -EFAULT; | 833 | err = -EFAULT; |
858 | vma = find_vma(mm, pp->addr); | 834 | vma = find_vma(mm, pp->addr); |
859 | if (!vma || !vma_migratable(vma)) | 835 | if (!vma || !vma_migratable(vma)) |
@@ -919,41 +895,43 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | |||
919 | const int __user *nodes, | 895 | const int __user *nodes, |
920 | int __user *status, int flags) | 896 | int __user *status, int flags) |
921 | { | 897 | { |
922 | struct page_to_node *pm = NULL; | 898 | struct page_to_node *pm; |
923 | nodemask_t task_nodes; | 899 | nodemask_t task_nodes; |
924 | int err = 0; | 900 | unsigned long chunk_nr_pages; |
925 | int i; | 901 | unsigned long chunk_start; |
902 | int err; | ||
926 | 903 | ||
927 | task_nodes = cpuset_mems_allowed(task); | 904 | task_nodes = cpuset_mems_allowed(task); |
928 | 905 | ||
929 | /* Limit nr_pages so that the multiplication may not overflow */ | 906 | err = -ENOMEM; |
930 | if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { | 907 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); |
931 | err = -E2BIG; | 908 | if (!pm) |
932 | goto out; | ||
933 | } | ||
934 | |||
935 | pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); | ||
936 | if (!pm) { | ||
937 | err = -ENOMEM; | ||
938 | goto out; | 909 | goto out; |
939 | } | ||
940 | |||
941 | /* | 910 | /* |
942 | * Get parameters from user space and initialize the pm | 911 | * Store a chunk of page_to_node array in a page, |
943 | * array. Return various errors if the user did something wrong. | 912 | * but keep the last one as a marker |
944 | */ | 913 | */ |
945 | for (i = 0; i < nr_pages; i++) { | 914 | chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1; |
946 | const void __user *p; | ||
947 | 915 | ||
948 | err = -EFAULT; | 916 | for (chunk_start = 0; |
949 | if (get_user(p, pages + i)) | 917 | chunk_start < nr_pages; |
950 | goto out_pm; | 918 | chunk_start += chunk_nr_pages) { |
919 | int j; | ||
920 | |||
921 | if (chunk_start + chunk_nr_pages > nr_pages) | ||
922 | chunk_nr_pages = nr_pages - chunk_start; | ||
951 | 923 | ||
952 | pm[i].addr = (unsigned long)p; | 924 | /* fill the chunk pm with addrs and nodes from user-space */ |
953 | if (nodes) { | 925 | for (j = 0; j < chunk_nr_pages; j++) { |
926 | const void __user *p; | ||
954 | int node; | 927 | int node; |
955 | 928 | ||
956 | if (get_user(node, nodes + i)) | 929 | err = -EFAULT; |
930 | if (get_user(p, pages + j + chunk_start)) | ||
931 | goto out_pm; | ||
932 | pm[j].addr = (unsigned long) p; | ||
933 | |||
934 | if (get_user(node, nodes + j + chunk_start)) | ||
957 | goto out_pm; | 935 | goto out_pm; |
958 | 936 | ||
959 | err = -ENODEV; | 937 | err = -ENODEV; |
@@ -964,22 +942,29 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | |||
964 | if (!node_isset(node, task_nodes)) | 942 | if (!node_isset(node, task_nodes)) |
965 | goto out_pm; | 943 | goto out_pm; |
966 | 944 | ||
967 | pm[i].node = node; | 945 | pm[j].node = node; |
968 | } else | 946 | } |
969 | pm[i].node = 0; /* anything to not match MAX_NUMNODES */ | 947 | |
970 | } | 948 | /* End marker for this chunk */ |
971 | /* End marker */ | 949 | pm[chunk_nr_pages].node = MAX_NUMNODES; |
972 | pm[nr_pages].node = MAX_NUMNODES; | 950 | |
951 | /* Migrate this chunk */ | ||
952 | err = do_move_page_to_node_array(mm, pm, | ||
953 | flags & MPOL_MF_MOVE_ALL); | ||
954 | if (err < 0) | ||
955 | goto out_pm; | ||
973 | 956 | ||
974 | err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL); | ||
975 | if (err >= 0) | ||
976 | /* Return status information */ | 957 | /* Return status information */ |
977 | for (i = 0; i < nr_pages; i++) | 958 | for (j = 0; j < chunk_nr_pages; j++) |
978 | if (put_user(pm[i].status, status + i)) | 959 | if (put_user(pm[j].status, status + j + chunk_start)) { |
979 | err = -EFAULT; | 960 | err = -EFAULT; |
961 | goto out_pm; | ||
962 | } | ||
963 | } | ||
964 | err = 0; | ||
980 | 965 | ||
981 | out_pm: | 966 | out_pm: |
982 | vfree(pm); | 967 | free_page((unsigned long)pm); |
983 | out: | 968 | out: |
984 | return err; | 969 | return err; |
985 | } | 970 | } |
diff --git a/mm/mlock.c b/mm/mlock.c index 3035a56e7616..e125156c664e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -173,12 +173,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, | |||
173 | (atomic_read(&mm->mm_users) != 0)); | 173 | (atomic_read(&mm->mm_users) != 0)); |
174 | 174 | ||
175 | /* | 175 | /* |
176 | * mlock: don't page populate if page has PROT_NONE permission. | 176 | * mlock: don't page populate if vma has PROT_NONE permission. |
177 | * munlock: the pages always do munlock althrough | 177 | * munlock: always do munlock although the vma has PROT_NONE |
178 | * its has PROT_NONE permission. | 178 | * permission, or SIGKILL is pending. |
179 | */ | 179 | */ |
180 | if (!mlock) | 180 | if (!mlock) |
181 | gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS; | 181 | gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS | |
182 | GUP_FLAGS_IGNORE_SIGKILL; | ||
182 | 183 | ||
183 | if (vma->vm_flags & VM_WRITE) | 184 | if (vma->vm_flags & VM_WRITE) |
184 | gup_flags |= GUP_FLAGS_WRITE; | 185 | gup_flags |= GUP_FLAGS_WRITE; |
@@ -413,7 +413,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | |||
413 | 413 | ||
414 | static void __vma_link_file(struct vm_area_struct *vma) | 414 | static void __vma_link_file(struct vm_area_struct *vma) |
415 | { | 415 | { |
416 | struct file * file; | 416 | struct file *file; |
417 | 417 | ||
418 | file = vma->vm_file; | 418 | file = vma->vm_file; |
419 | if (file) { | 419 | if (file) { |
@@ -474,11 +474,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
474 | * insert vm structure into list and rbtree and anon_vma, | 474 | * insert vm structure into list and rbtree and anon_vma, |
475 | * but it has already been inserted into prio_tree earlier. | 475 | * but it has already been inserted into prio_tree earlier. |
476 | */ | 476 | */ |
477 | static void | 477 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
478 | __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | ||
479 | { | 478 | { |
480 | struct vm_area_struct * __vma, * prev; | 479 | struct vm_area_struct *__vma, *prev; |
481 | struct rb_node ** rb_link, * rb_parent; | 480 | struct rb_node **rb_link, *rb_parent; |
482 | 481 | ||
483 | __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); | 482 | __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); |
484 | BUG_ON(__vma && __vma->vm_start < vma->vm_end); | 483 | BUG_ON(__vma && __vma->vm_start < vma->vm_end); |
@@ -908,7 +907,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
908 | * The caller must hold down_write(current->mm->mmap_sem). | 907 | * The caller must hold down_write(current->mm->mmap_sem). |
909 | */ | 908 | */ |
910 | 909 | ||
911 | unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | 910 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, |
912 | unsigned long len, unsigned long prot, | 911 | unsigned long len, unsigned long prot, |
913 | unsigned long flags, unsigned long pgoff) | 912 | unsigned long flags, unsigned long pgoff) |
914 | { | 913 | { |
@@ -1464,7 +1463,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | |||
1464 | EXPORT_SYMBOL(get_unmapped_area); | 1463 | EXPORT_SYMBOL(get_unmapped_area); |
1465 | 1464 | ||
1466 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ | 1465 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ |
1467 | struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) | 1466 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) |
1468 | { | 1467 | { |
1469 | struct vm_area_struct *vma = NULL; | 1468 | struct vm_area_struct *vma = NULL; |
1470 | 1469 | ||
@@ -1507,7 +1506,7 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, | |||
1507 | struct vm_area_struct **pprev) | 1506 | struct vm_area_struct **pprev) |
1508 | { | 1507 | { |
1509 | struct vm_area_struct *vma = NULL, *prev = NULL; | 1508 | struct vm_area_struct *vma = NULL, *prev = NULL; |
1510 | struct rb_node * rb_node; | 1509 | struct rb_node *rb_node; |
1511 | if (!mm) | 1510 | if (!mm) |
1512 | goto out; | 1511 | goto out; |
1513 | 1512 | ||
@@ -1541,7 +1540,7 @@ out: | |||
1541 | * update accounting. This is shared with both the | 1540 | * update accounting. This is shared with both the |
1542 | * grow-up and grow-down cases. | 1541 | * grow-up and grow-down cases. |
1543 | */ | 1542 | */ |
1544 | static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow) | 1543 | static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) |
1545 | { | 1544 | { |
1546 | struct mm_struct *mm = vma->vm_mm; | 1545 | struct mm_struct *mm = vma->vm_mm; |
1547 | struct rlimit *rlim = current->signal->rlim; | 1546 | struct rlimit *rlim = current->signal->rlim; |
@@ -2091,6 +2090,9 @@ void exit_mmap(struct mm_struct *mm) | |||
2091 | arch_exit_mmap(mm); | 2090 | arch_exit_mmap(mm); |
2092 | mmu_notifier_release(mm); | 2091 | mmu_notifier_release(mm); |
2093 | 2092 | ||
2093 | if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */ | ||
2094 | return; | ||
2095 | |||
2094 | if (mm->locked_vm) { | 2096 | if (mm->locked_vm) { |
2095 | vma = mm->mmap; | 2097 | vma = mm->mmap; |
2096 | while (vma) { | 2098 | while (vma) { |
@@ -2103,7 +2105,7 @@ void exit_mmap(struct mm_struct *mm) | |||
2103 | lru_add_drain(); | 2105 | lru_add_drain(); |
2104 | flush_cache_mm(mm); | 2106 | flush_cache_mm(mm); |
2105 | tlb = tlb_gather_mmu(mm, 1); | 2107 | tlb = tlb_gather_mmu(mm, 1); |
2106 | /* Don't update_hiwater_rss(mm) here, do_exit already did */ | 2108 | /* update_hiwater_rss(mm) here? but nobody should be looking */ |
2107 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 2109 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
2108 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); | 2110 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); |
2109 | vm_unacct_memory(nr_accounted); | 2111 | vm_unacct_memory(nr_accounted); |
@@ -2470,3 +2472,13 @@ void mm_drop_all_locks(struct mm_struct *mm) | |||
2470 | 2472 | ||
2471 | mutex_unlock(&mm_all_locks_mutex); | 2473 | mutex_unlock(&mm_all_locks_mutex); |
2472 | } | 2474 | } |
2475 | |||
2476 | /* | ||
2477 | * initialise the VMA slab | ||
2478 | */ | ||
2479 | void __init mmap_init(void) | ||
2480 | { | ||
2481 | vm_area_cachep = kmem_cache_create("vm_area_struct", | ||
2482 | sizeof(struct vm_area_struct), 0, | ||
2483 | SLAB_PANIC, NULL); | ||
2484 | } | ||
diff --git a/mm/mprotect.c b/mm/mprotect.c index cfb4c4852062..d0f6e7ce09f1 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/swap.h> | 22 | #include <linux/swap.h> |
23 | #include <linux/swapops.h> | 23 | #include <linux/swapops.h> |
24 | #include <linux/mmu_notifier.h> | 24 | #include <linux/mmu_notifier.h> |
25 | #include <linux/migrate.h> | ||
25 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
26 | #include <asm/pgtable.h> | 27 | #include <asm/pgtable.h> |
27 | #include <asm/cacheflush.h> | 28 | #include <asm/cacheflush.h> |
@@ -59,8 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
59 | ptent = pte_mkwrite(ptent); | 60 | ptent = pte_mkwrite(ptent); |
60 | 61 | ||
61 | ptep_modify_prot_commit(mm, addr, pte, ptent); | 62 | ptep_modify_prot_commit(mm, addr, pte, ptent); |
62 | #ifdef CONFIG_MIGRATION | 63 | } else if (PAGE_MIGRATION && !pte_file(oldpte)) { |
63 | } else if (!pte_file(oldpte)) { | ||
64 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 64 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
65 | 65 | ||
66 | if (is_write_migration_entry(entry)) { | 66 | if (is_write_migration_entry(entry)) { |
@@ -72,9 +72,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
72 | set_pte_at(mm, addr, pte, | 72 | set_pte_at(mm, addr, pte, |
73 | swp_entry_to_pte(entry)); | 73 | swp_entry_to_pte(entry)); |
74 | } | 74 | } |
75 | #endif | ||
76 | } | 75 | } |
77 | |||
78 | } while (pte++, addr += PAGE_SIZE, addr != end); | 76 | } while (pte++, addr += PAGE_SIZE, addr != end); |
79 | arch_leave_lazy_mmu_mode(); | 77 | arch_leave_lazy_mmu_mode(); |
80 | pte_unmap_unlock(pte - 1, ptl); | 78 | pte_unmap_unlock(pte - 1, ptl); |
diff --git a/mm/nommu.c b/mm/nommu.c index 1c28ea3a4e9c..60ed8375c986 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -6,11 +6,11 @@ | |||
6 | * | 6 | * |
7 | * See Documentation/nommu-mmap.txt | 7 | * See Documentation/nommu-mmap.txt |
8 | * | 8 | * |
9 | * Copyright (c) 2004-2005 David Howells <dhowells@redhat.com> | 9 | * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com> |
10 | * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> | 10 | * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> |
11 | * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> | 11 | * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> |
12 | * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> | 12 | * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> |
13 | * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org> | 13 | * Copyright (c) 2007-2008 Paul Mundt <lethal@linux-sh.org> |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
@@ -33,6 +33,28 @@ | |||
33 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
34 | #include <asm/tlb.h> | 34 | #include <asm/tlb.h> |
35 | #include <asm/tlbflush.h> | 35 | #include <asm/tlbflush.h> |
36 | #include "internal.h" | ||
37 | |||
38 | static inline __attribute__((format(printf, 1, 2))) | ||
39 | void no_printk(const char *fmt, ...) | ||
40 | { | ||
41 | } | ||
42 | |||
43 | #if 0 | ||
44 | #define kenter(FMT, ...) \ | ||
45 | printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) | ||
46 | #define kleave(FMT, ...) \ | ||
47 | printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) | ||
48 | #define kdebug(FMT, ...) \ | ||
49 | printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) | ||
50 | #else | ||
51 | #define kenter(FMT, ...) \ | ||
52 | no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) | ||
53 | #define kleave(FMT, ...) \ | ||
54 | no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) | ||
55 | #define kdebug(FMT, ...) \ | ||
56 | no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) | ||
57 | #endif | ||
36 | 58 | ||
37 | #include "internal.h" | 59 | #include "internal.h" |
38 | 60 | ||
@@ -40,19 +62,22 @@ void *high_memory; | |||
40 | struct page *mem_map; | 62 | struct page *mem_map; |
41 | unsigned long max_mapnr; | 63 | unsigned long max_mapnr; |
42 | unsigned long num_physpages; | 64 | unsigned long num_physpages; |
43 | unsigned long askedalloc, realalloc; | ||
44 | atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); | 65 | atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); |
45 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 66 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
46 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 67 | int sysctl_overcommit_ratio = 50; /* default is 50% */ |
47 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | 68 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; |
69 | int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ | ||
48 | int heap_stack_gap = 0; | 70 | int heap_stack_gap = 0; |
49 | 71 | ||
72 | atomic_t mmap_pages_allocated; | ||
73 | |||
50 | EXPORT_SYMBOL(mem_map); | 74 | EXPORT_SYMBOL(mem_map); |
51 | EXPORT_SYMBOL(num_physpages); | 75 | EXPORT_SYMBOL(num_physpages); |
52 | 76 | ||
53 | /* list of shareable VMAs */ | 77 | /* list of mapped, potentially shareable regions */ |
54 | struct rb_root nommu_vma_tree = RB_ROOT; | 78 | static struct kmem_cache *vm_region_jar; |
55 | DECLARE_RWSEM(nommu_vma_sem); | 79 | struct rb_root nommu_region_tree = RB_ROOT; |
80 | DECLARE_RWSEM(nommu_region_sem); | ||
56 | 81 | ||
57 | struct vm_operations_struct generic_file_vm_ops = { | 82 | struct vm_operations_struct generic_file_vm_ops = { |
58 | }; | 83 | }; |
@@ -124,6 +149,20 @@ unsigned int kobjsize(const void *objp) | |||
124 | return ksize(objp); | 149 | return ksize(objp); |
125 | 150 | ||
126 | /* | 151 | /* |
152 | * If it's not a compound page, see if we have a matching VMA | ||
153 | * region. This test is intentionally done in reverse order, | ||
154 | * so if there's no VMA, we still fall through and hand back | ||
155 | * PAGE_SIZE for 0-order pages. | ||
156 | */ | ||
157 | if (!PageCompound(page)) { | ||
158 | struct vm_area_struct *vma; | ||
159 | |||
160 | vma = find_vma(current->mm, (unsigned long)objp); | ||
161 | if (vma) | ||
162 | return vma->vm_end - vma->vm_start; | ||
163 | } | ||
164 | |||
165 | /* | ||
127 | * The ksize() function is only guaranteed to work for pointers | 166 | * The ksize() function is only guaranteed to work for pointers |
128 | * returned by kmalloc(). So handle arbitrary pointers here. | 167 | * returned by kmalloc(). So handle arbitrary pointers here. |
129 | */ | 168 | */ |
@@ -401,129 +440,178 @@ asmlinkage unsigned long sys_brk(unsigned long brk) | |||
401 | return mm->brk = brk; | 440 | return mm->brk = brk; |
402 | } | 441 | } |
403 | 442 | ||
404 | #ifdef DEBUG | 443 | /* |
405 | static void show_process_blocks(void) | 444 | * initialise the VMA and region record slabs |
445 | */ | ||
446 | void __init mmap_init(void) | ||
406 | { | 447 | { |
407 | struct vm_list_struct *vml; | 448 | vm_region_jar = kmem_cache_create("vm_region_jar", |
408 | 449 | sizeof(struct vm_region), 0, | |
409 | printk("Process blocks %d:", current->pid); | 450 | SLAB_PANIC, NULL); |
410 | 451 | vm_area_cachep = kmem_cache_create("vm_area_struct", | |
411 | for (vml = &current->mm->context.vmlist; vml; vml = vml->next) { | 452 | sizeof(struct vm_area_struct), 0, |
412 | printk(" %p: %p", vml, vml->vma); | 453 | SLAB_PANIC, NULL); |
413 | if (vml->vma) | ||
414 | printk(" (%d @%lx #%d)", | ||
415 | kobjsize((void *) vml->vma->vm_start), | ||
416 | vml->vma->vm_start, | ||
417 | atomic_read(&vml->vma->vm_usage)); | ||
418 | printk(vml->next ? " ->" : ".\n"); | ||
419 | } | ||
420 | } | 454 | } |
421 | #endif /* DEBUG */ | ||
422 | 455 | ||
423 | /* | 456 | /* |
424 | * add a VMA into a process's mm_struct in the appropriate place in the list | 457 | * validate the region tree |
425 | * - should be called with mm->mmap_sem held writelocked | 458 | * - the caller must hold the region lock |
426 | */ | 459 | */ |
427 | static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) | 460 | #ifdef CONFIG_DEBUG_NOMMU_REGIONS |
461 | static noinline void validate_nommu_regions(void) | ||
428 | { | 462 | { |
429 | struct vm_list_struct **ppv; | 463 | struct vm_region *region, *last; |
430 | 464 | struct rb_node *p, *lastp; | |
431 | for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) | 465 | |
432 | if ((*ppv)->vma->vm_start > vml->vma->vm_start) | 466 | lastp = rb_first(&nommu_region_tree); |
433 | break; | 467 | if (!lastp) |
434 | 468 | return; | |
435 | vml->next = *ppv; | 469 | |
436 | *ppv = vml; | 470 | last = rb_entry(lastp, struct vm_region, vm_rb); |
471 | if (unlikely(last->vm_end <= last->vm_start)) | ||
472 | BUG(); | ||
473 | if (unlikely(last->vm_top < last->vm_end)) | ||
474 | BUG(); | ||
475 | |||
476 | while ((p = rb_next(lastp))) { | ||
477 | region = rb_entry(p, struct vm_region, vm_rb); | ||
478 | last = rb_entry(lastp, struct vm_region, vm_rb); | ||
479 | |||
480 | if (unlikely(region->vm_end <= region->vm_start)) | ||
481 | BUG(); | ||
482 | if (unlikely(region->vm_top < region->vm_end)) | ||
483 | BUG(); | ||
484 | if (unlikely(region->vm_start < last->vm_top)) | ||
485 | BUG(); | ||
486 | |||
487 | lastp = p; | ||
488 | } | ||
437 | } | 489 | } |
490 | #else | ||
491 | #define validate_nommu_regions() do {} while(0) | ||
492 | #endif | ||
438 | 493 | ||
439 | /* | 494 | /* |
440 | * look up the first VMA in which addr resides, NULL if none | 495 | * add a region into the global tree |
441 | * - should be called with mm->mmap_sem at least held readlocked | ||
442 | */ | 496 | */ |
443 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | 497 | static void add_nommu_region(struct vm_region *region) |
444 | { | 498 | { |
445 | struct vm_list_struct *loop, *vml; | 499 | struct vm_region *pregion; |
500 | struct rb_node **p, *parent; | ||
446 | 501 | ||
447 | /* search the vm_start ordered list */ | 502 | validate_nommu_regions(); |
448 | vml = NULL; | 503 | |
449 | for (loop = mm->context.vmlist; loop; loop = loop->next) { | 504 | BUG_ON(region->vm_start & ~PAGE_MASK); |
450 | if (loop->vma->vm_start > addr) | 505 | |
451 | break; | 506 | parent = NULL; |
452 | vml = loop; | 507 | p = &nommu_region_tree.rb_node; |
508 | while (*p) { | ||
509 | parent = *p; | ||
510 | pregion = rb_entry(parent, struct vm_region, vm_rb); | ||
511 | if (region->vm_start < pregion->vm_start) | ||
512 | p = &(*p)->rb_left; | ||
513 | else if (region->vm_start > pregion->vm_start) | ||
514 | p = &(*p)->rb_right; | ||
515 | else if (pregion == region) | ||
516 | return; | ||
517 | else | ||
518 | BUG(); | ||
453 | } | 519 | } |
454 | 520 | ||
455 | if (vml && vml->vma->vm_end > addr) | 521 | rb_link_node(&region->vm_rb, parent, p); |
456 | return vml->vma; | 522 | rb_insert_color(&region->vm_rb, &nommu_region_tree); |
457 | 523 | ||
458 | return NULL; | 524 | validate_nommu_regions(); |
459 | } | 525 | } |
460 | EXPORT_SYMBOL(find_vma); | ||
461 | 526 | ||
462 | /* | 527 | /* |
463 | * find a VMA | 528 | * delete a region from the global tree |
464 | * - we don't extend stack VMAs under NOMMU conditions | ||
465 | */ | 529 | */ |
466 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | 530 | static void delete_nommu_region(struct vm_region *region) |
467 | { | 531 | { |
468 | return find_vma(mm, addr); | 532 | BUG_ON(!nommu_region_tree.rb_node); |
469 | } | ||
470 | 533 | ||
471 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | 534 | validate_nommu_regions(); |
472 | { | 535 | rb_erase(&region->vm_rb, &nommu_region_tree); |
473 | return -ENOMEM; | 536 | validate_nommu_regions(); |
474 | } | 537 | } |
475 | 538 | ||
476 | /* | 539 | /* |
477 | * look up the first VMA exactly that exactly matches addr | 540 | * free a contiguous series of pages |
478 | * - should be called with mm->mmap_sem at least held readlocked | ||
479 | */ | 541 | */ |
480 | static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | 542 | static void free_page_series(unsigned long from, unsigned long to) |
481 | unsigned long addr) | ||
482 | { | 543 | { |
483 | struct vm_list_struct *vml; | 544 | for (; from < to; from += PAGE_SIZE) { |
484 | 545 | struct page *page = virt_to_page(from); | |
485 | /* search the vm_start ordered list */ | 546 | |
486 | for (vml = mm->context.vmlist; vml; vml = vml->next) { | 547 | kdebug("- free %lx", from); |
487 | if (vml->vma->vm_start == addr) | 548 | atomic_dec(&mmap_pages_allocated); |
488 | return vml->vma; | 549 | if (page_count(page) != 1) |
489 | if (vml->vma->vm_start > addr) | 550 | kdebug("free page %p [%d]", page, page_count(page)); |
490 | break; | 551 | put_page(page); |
491 | } | 552 | } |
492 | |||
493 | return NULL; | ||
494 | } | 553 | } |
495 | 554 | ||
496 | /* | 555 | /* |
497 | * find a VMA in the global tree | 556 | * release a reference to a region |
557 | * - the caller must hold the region semaphore, which this releases | ||
558 | * - the region may not have been added to the tree yet, in which case vm_top | ||
559 | * will equal vm_start | ||
498 | */ | 560 | */ |
499 | static inline struct vm_area_struct *find_nommu_vma(unsigned long start) | 561 | static void __put_nommu_region(struct vm_region *region) |
562 | __releases(nommu_region_sem) | ||
500 | { | 563 | { |
501 | struct vm_area_struct *vma; | 564 | kenter("%p{%d}", region, atomic_read(&region->vm_usage)); |
502 | struct rb_node *n = nommu_vma_tree.rb_node; | ||
503 | 565 | ||
504 | while (n) { | 566 | BUG_ON(!nommu_region_tree.rb_node); |
505 | vma = rb_entry(n, struct vm_area_struct, vm_rb); | ||
506 | 567 | ||
507 | if (start < vma->vm_start) | 568 | if (atomic_dec_and_test(&region->vm_usage)) { |
508 | n = n->rb_left; | 569 | if (region->vm_top > region->vm_start) |
509 | else if (start > vma->vm_start) | 570 | delete_nommu_region(region); |
510 | n = n->rb_right; | 571 | up_write(&nommu_region_sem); |
511 | else | 572 | |
512 | return vma; | 573 | if (region->vm_file) |
574 | fput(region->vm_file); | ||
575 | |||
576 | /* IO memory and memory shared directly out of the pagecache | ||
577 | * from ramfs/tmpfs mustn't be released here */ | ||
578 | if (region->vm_flags & VM_MAPPED_COPY) { | ||
579 | kdebug("free series"); | ||
580 | free_page_series(region->vm_start, region->vm_top); | ||
581 | } | ||
582 | kmem_cache_free(vm_region_jar, region); | ||
583 | } else { | ||
584 | up_write(&nommu_region_sem); | ||
513 | } | 585 | } |
586 | } | ||
514 | 587 | ||
515 | return NULL; | 588 | /* |
589 | * release a reference to a region | ||
590 | */ | ||
591 | static void put_nommu_region(struct vm_region *region) | ||
592 | { | ||
593 | down_write(&nommu_region_sem); | ||
594 | __put_nommu_region(region); | ||
516 | } | 595 | } |
517 | 596 | ||
518 | /* | 597 | /* |
519 | * add a VMA in the global tree | 598 | * add a VMA into a process's mm_struct in the appropriate place in the list |
599 | * and tree and add to the address space's page tree also if not an anonymous | ||
600 | * page | ||
601 | * - should be called with mm->mmap_sem held writelocked | ||
520 | */ | 602 | */ |
521 | static void add_nommu_vma(struct vm_area_struct *vma) | 603 | static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) |
522 | { | 604 | { |
523 | struct vm_area_struct *pvma; | 605 | struct vm_area_struct *pvma, **pp; |
524 | struct address_space *mapping; | 606 | struct address_space *mapping; |
525 | struct rb_node **p = &nommu_vma_tree.rb_node; | 607 | struct rb_node **p, *parent; |
526 | struct rb_node *parent = NULL; | 608 | |
609 | kenter(",%p", vma); | ||
610 | |||
611 | BUG_ON(!vma->vm_region); | ||
612 | |||
613 | mm->map_count++; | ||
614 | vma->vm_mm = mm; | ||
527 | 615 | ||
528 | /* add the VMA to the mapping */ | 616 | /* add the VMA to the mapping */ |
529 | if (vma->vm_file) { | 617 | if (vma->vm_file) { |
@@ -534,42 +622,62 @@ static void add_nommu_vma(struct vm_area_struct *vma) | |||
534 | flush_dcache_mmap_unlock(mapping); | 622 | flush_dcache_mmap_unlock(mapping); |
535 | } | 623 | } |
536 | 624 | ||
537 | /* add the VMA to the master list */ | 625 | /* add the VMA to the tree */ |
626 | parent = NULL; | ||
627 | p = &mm->mm_rb.rb_node; | ||
538 | while (*p) { | 628 | while (*p) { |
539 | parent = *p; | 629 | parent = *p; |
540 | pvma = rb_entry(parent, struct vm_area_struct, vm_rb); | 630 | pvma = rb_entry(parent, struct vm_area_struct, vm_rb); |
541 | 631 | ||
542 | if (vma->vm_start < pvma->vm_start) { | 632 | /* sort by: start addr, end addr, VMA struct addr in that order |
633 | * (the latter is necessary as we may get identical VMAs) */ | ||
634 | if (vma->vm_start < pvma->vm_start) | ||
543 | p = &(*p)->rb_left; | 635 | p = &(*p)->rb_left; |
544 | } | 636 | else if (vma->vm_start > pvma->vm_start) |
545 | else if (vma->vm_start > pvma->vm_start) { | ||
546 | p = &(*p)->rb_right; | 637 | p = &(*p)->rb_right; |
547 | } | 638 | else if (vma->vm_end < pvma->vm_end) |
548 | else { | 639 | p = &(*p)->rb_left; |
549 | /* mappings are at the same address - this can only | 640 | else if (vma->vm_end > pvma->vm_end) |
550 | * happen for shared-mem chardevs and shared file | 641 | p = &(*p)->rb_right; |
551 | * mappings backed by ramfs/tmpfs */ | 642 | else if (vma < pvma) |
552 | BUG_ON(!(pvma->vm_flags & VM_SHARED)); | 643 | p = &(*p)->rb_left; |
553 | 644 | else if (vma > pvma) | |
554 | if (vma < pvma) | 645 | p = &(*p)->rb_right; |
555 | p = &(*p)->rb_left; | 646 | else |
556 | else if (vma > pvma) | 647 | BUG(); |
557 | p = &(*p)->rb_right; | ||
558 | else | ||
559 | BUG(); | ||
560 | } | ||
561 | } | 648 | } |
562 | 649 | ||
563 | rb_link_node(&vma->vm_rb, parent, p); | 650 | rb_link_node(&vma->vm_rb, parent, p); |
564 | rb_insert_color(&vma->vm_rb, &nommu_vma_tree); | 651 | rb_insert_color(&vma->vm_rb, &mm->mm_rb); |
652 | |||
653 | /* add VMA to the VMA list also */ | ||
654 | for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) { | ||
655 | if (pvma->vm_start > vma->vm_start) | ||
656 | break; | ||
657 | if (pvma->vm_start < vma->vm_start) | ||
658 | continue; | ||
659 | if (pvma->vm_end < vma->vm_end) | ||
660 | break; | ||
661 | } | ||
662 | |||
663 | vma->vm_next = *pp; | ||
664 | *pp = vma; | ||
565 | } | 665 | } |
566 | 666 | ||
567 | /* | 667 | /* |
568 | * delete a VMA from the global list | 668 | * delete a VMA from its owning mm_struct and address space |
569 | */ | 669 | */ |
570 | static void delete_nommu_vma(struct vm_area_struct *vma) | 670 | static void delete_vma_from_mm(struct vm_area_struct *vma) |
571 | { | 671 | { |
672 | struct vm_area_struct **pp; | ||
572 | struct address_space *mapping; | 673 | struct address_space *mapping; |
674 | struct mm_struct *mm = vma->vm_mm; | ||
675 | |||
676 | kenter("%p", vma); | ||
677 | |||
678 | mm->map_count--; | ||
679 | if (mm->mmap_cache == vma) | ||
680 | mm->mmap_cache = NULL; | ||
573 | 681 | ||
574 | /* remove the VMA from the mapping */ | 682 | /* remove the VMA from the mapping */ |
575 | if (vma->vm_file) { | 683 | if (vma->vm_file) { |
@@ -580,8 +688,115 @@ static void delete_nommu_vma(struct vm_area_struct *vma) | |||
580 | flush_dcache_mmap_unlock(mapping); | 688 | flush_dcache_mmap_unlock(mapping); |
581 | } | 689 | } |
582 | 690 | ||
583 | /* remove from the master list */ | 691 | /* remove from the MM's tree and list */ |
584 | rb_erase(&vma->vm_rb, &nommu_vma_tree); | 692 | rb_erase(&vma->vm_rb, &mm->mm_rb); |
693 | for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) { | ||
694 | if (*pp == vma) { | ||
695 | *pp = vma->vm_next; | ||
696 | break; | ||
697 | } | ||
698 | } | ||
699 | |||
700 | vma->vm_mm = NULL; | ||
701 | } | ||
702 | |||
703 | /* | ||
704 | * destroy a VMA record | ||
705 | */ | ||
706 | static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) | ||
707 | { | ||
708 | kenter("%p", vma); | ||
709 | if (vma->vm_ops && vma->vm_ops->close) | ||
710 | vma->vm_ops->close(vma); | ||
711 | if (vma->vm_file) { | ||
712 | fput(vma->vm_file); | ||
713 | if (vma->vm_flags & VM_EXECUTABLE) | ||
714 | removed_exe_file_vma(mm); | ||
715 | } | ||
716 | put_nommu_region(vma->vm_region); | ||
717 | kmem_cache_free(vm_area_cachep, vma); | ||
718 | } | ||
719 | |||
720 | /* | ||
721 | * look up the first VMA in which addr resides, NULL if none | ||
722 | * - should be called with mm->mmap_sem at least held readlocked | ||
723 | */ | ||
724 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | ||
725 | { | ||
726 | struct vm_area_struct *vma; | ||
727 | struct rb_node *n = mm->mm_rb.rb_node; | ||
728 | |||
729 | /* check the cache first */ | ||
730 | vma = mm->mmap_cache; | ||
731 | if (vma && vma->vm_start <= addr && vma->vm_end > addr) | ||
732 | return vma; | ||
733 | |||
734 | /* trawl the tree (there may be multiple mappings in which addr | ||
735 | * resides) */ | ||
736 | for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { | ||
737 | vma = rb_entry(n, struct vm_area_struct, vm_rb); | ||
738 | if (vma->vm_start > addr) | ||
739 | return NULL; | ||
740 | if (vma->vm_end > addr) { | ||
741 | mm->mmap_cache = vma; | ||
742 | return vma; | ||
743 | } | ||
744 | } | ||
745 | |||
746 | return NULL; | ||
747 | } | ||
748 | EXPORT_SYMBOL(find_vma); | ||
749 | |||
750 | /* | ||
751 | * find a VMA | ||
752 | * - we don't extend stack VMAs under NOMMU conditions | ||
753 | */ | ||
754 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | ||
755 | { | ||
756 | return find_vma(mm, addr); | ||
757 | } | ||
758 | |||
759 | /* | ||
760 | * expand a stack to a given address | ||
761 | * - not supported under NOMMU conditions | ||
762 | */ | ||
763 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | ||
764 | { | ||
765 | return -ENOMEM; | ||
766 | } | ||
767 | |||
768 | /* | ||
769 | * look up the first VMA exactly that exactly matches addr | ||
770 | * - should be called with mm->mmap_sem at least held readlocked | ||
771 | */ | ||
772 | static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | ||
773 | unsigned long addr, | ||
774 | unsigned long len) | ||
775 | { | ||
776 | struct vm_area_struct *vma; | ||
777 | struct rb_node *n = mm->mm_rb.rb_node; | ||
778 | unsigned long end = addr + len; | ||
779 | |||
780 | /* check the cache first */ | ||
781 | vma = mm->mmap_cache; | ||
782 | if (vma && vma->vm_start == addr && vma->vm_end == end) | ||
783 | return vma; | ||
784 | |||
785 | /* trawl the tree (there may be multiple mappings in which addr | ||
786 | * resides) */ | ||
787 | for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { | ||
788 | vma = rb_entry(n, struct vm_area_struct, vm_rb); | ||
789 | if (vma->vm_start < addr) | ||
790 | continue; | ||
791 | if (vma->vm_start > addr) | ||
792 | return NULL; | ||
793 | if (vma->vm_end == end) { | ||
794 | mm->mmap_cache = vma; | ||
795 | return vma; | ||
796 | } | ||
797 | } | ||
798 | |||
799 | return NULL; | ||
585 | } | 800 | } |
586 | 801 | ||
587 | /* | 802 | /* |
@@ -596,7 +811,7 @@ static int validate_mmap_request(struct file *file, | |||
596 | unsigned long pgoff, | 811 | unsigned long pgoff, |
597 | unsigned long *_capabilities) | 812 | unsigned long *_capabilities) |
598 | { | 813 | { |
599 | unsigned long capabilities; | 814 | unsigned long capabilities, rlen; |
600 | unsigned long reqprot = prot; | 815 | unsigned long reqprot = prot; |
601 | int ret; | 816 | int ret; |
602 | 817 | ||
@@ -616,12 +831,12 @@ static int validate_mmap_request(struct file *file, | |||
616 | return -EINVAL; | 831 | return -EINVAL; |
617 | 832 | ||
618 | /* Careful about overflows.. */ | 833 | /* Careful about overflows.. */ |
619 | len = PAGE_ALIGN(len); | 834 | rlen = PAGE_ALIGN(len); |
620 | if (!len || len > TASK_SIZE) | 835 | if (!rlen || rlen > TASK_SIZE) |
621 | return -ENOMEM; | 836 | return -ENOMEM; |
622 | 837 | ||
623 | /* offset overflow? */ | 838 | /* offset overflow? */ |
624 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) | 839 | if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff) |
625 | return -EOVERFLOW; | 840 | return -EOVERFLOW; |
626 | 841 | ||
627 | if (file) { | 842 | if (file) { |
@@ -795,13 +1010,18 @@ static unsigned long determine_vm_flags(struct file *file, | |||
795 | } | 1010 | } |
796 | 1011 | ||
797 | /* | 1012 | /* |
798 | * set up a shared mapping on a file | 1013 | * set up a shared mapping on a file (the driver or filesystem provides and |
1014 | * pins the storage) | ||
799 | */ | 1015 | */ |
800 | static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) | 1016 | static int do_mmap_shared_file(struct vm_area_struct *vma) |
801 | { | 1017 | { |
802 | int ret; | 1018 | int ret; |
803 | 1019 | ||
804 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | 1020 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); |
1021 | if (ret == 0) { | ||
1022 | vma->vm_region->vm_top = vma->vm_region->vm_end; | ||
1023 | return ret; | ||
1024 | } | ||
805 | if (ret != -ENOSYS) | 1025 | if (ret != -ENOSYS) |
806 | return ret; | 1026 | return ret; |
807 | 1027 | ||
@@ -815,10 +1035,14 @@ static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) | |||
815 | /* | 1035 | /* |
816 | * set up a private mapping or an anonymous shared mapping | 1036 | * set up a private mapping or an anonymous shared mapping |
817 | */ | 1037 | */ |
818 | static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) | 1038 | static int do_mmap_private(struct vm_area_struct *vma, |
1039 | struct vm_region *region, | ||
1040 | unsigned long len) | ||
819 | { | 1041 | { |
1042 | struct page *pages; | ||
1043 | unsigned long total, point, n, rlen; | ||
820 | void *base; | 1044 | void *base; |
821 | int ret; | 1045 | int ret, order; |
822 | 1046 | ||
823 | /* invoke the file's mapping function so that it can keep track of | 1047 | /* invoke the file's mapping function so that it can keep track of |
824 | * shared mappings on devices or memory | 1048 | * shared mappings on devices or memory |
@@ -826,34 +1050,63 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) | |||
826 | */ | 1050 | */ |
827 | if (vma->vm_file) { | 1051 | if (vma->vm_file) { |
828 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | 1052 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); |
829 | if (ret != -ENOSYS) { | 1053 | if (ret == 0) { |
830 | /* shouldn't return success if we're not sharing */ | 1054 | /* shouldn't return success if we're not sharing */ |
831 | BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE)); | 1055 | BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); |
832 | return ret; /* success or a real error */ | 1056 | vma->vm_region->vm_top = vma->vm_region->vm_end; |
1057 | return ret; | ||
833 | } | 1058 | } |
1059 | if (ret != -ENOSYS) | ||
1060 | return ret; | ||
834 | 1061 | ||
835 | /* getting an ENOSYS error indicates that direct mmap isn't | 1062 | /* getting an ENOSYS error indicates that direct mmap isn't |
836 | * possible (as opposed to tried but failed) so we'll try to | 1063 | * possible (as opposed to tried but failed) so we'll try to |
837 | * make a private copy of the data and map that instead */ | 1064 | * make a private copy of the data and map that instead */ |
838 | } | 1065 | } |
839 | 1066 | ||
1067 | rlen = PAGE_ALIGN(len); | ||
1068 | |||
840 | /* allocate some memory to hold the mapping | 1069 | /* allocate some memory to hold the mapping |
841 | * - note that this may not return a page-aligned address if the object | 1070 | * - note that this may not return a page-aligned address if the object |
842 | * we're allocating is smaller than a page | 1071 | * we're allocating is smaller than a page |
843 | */ | 1072 | */ |
844 | base = kmalloc(len, GFP_KERNEL|__GFP_COMP); | 1073 | order = get_order(rlen); |
845 | if (!base) | 1074 | kdebug("alloc order %d for %lx", order, len); |
1075 | |||
1076 | pages = alloc_pages(GFP_KERNEL, order); | ||
1077 | if (!pages) | ||
846 | goto enomem; | 1078 | goto enomem; |
847 | 1079 | ||
848 | vma->vm_start = (unsigned long) base; | 1080 | total = 1 << order; |
849 | vma->vm_end = vma->vm_start + len; | 1081 | atomic_add(total, &mmap_pages_allocated); |
850 | vma->vm_flags |= VM_MAPPED_COPY; | 1082 | |
1083 | point = rlen >> PAGE_SHIFT; | ||
1084 | |||
1085 | /* we allocated a power-of-2 sized page set, so we may want to trim off | ||
1086 | * the excess */ | ||
1087 | if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { | ||
1088 | while (total > point) { | ||
1089 | order = ilog2(total - point); | ||
1090 | n = 1 << order; | ||
1091 | kdebug("shave %lu/%lu @%lu", n, total - point, total); | ||
1092 | atomic_sub(n, &mmap_pages_allocated); | ||
1093 | total -= n; | ||
1094 | set_page_refcounted(pages + total); | ||
1095 | __free_pages(pages + total, order); | ||
1096 | } | ||
1097 | } | ||
1098 | |||
1099 | for (point = 1; point < total; point++) | ||
1100 | set_page_refcounted(&pages[point]); | ||
851 | 1101 | ||
852 | #ifdef WARN_ON_SLACK | 1102 | base = page_address(pages); |
853 | if (len + WARN_ON_SLACK <= kobjsize(result)) | 1103 | region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; |
854 | printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n", | 1104 | region->vm_start = (unsigned long) base; |
855 | len, current->pid, kobjsize(result) - len); | 1105 | region->vm_end = region->vm_start + rlen; |
856 | #endif | 1106 | region->vm_top = region->vm_start + (total << PAGE_SHIFT); |
1107 | |||
1108 | vma->vm_start = region->vm_start; | ||
1109 | vma->vm_end = region->vm_start + len; | ||
857 | 1110 | ||
858 | if (vma->vm_file) { | 1111 | if (vma->vm_file) { |
859 | /* read the contents of a file into the copy */ | 1112 | /* read the contents of a file into the copy */ |
@@ -865,26 +1118,28 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) | |||
865 | 1118 | ||
866 | old_fs = get_fs(); | 1119 | old_fs = get_fs(); |
867 | set_fs(KERNEL_DS); | 1120 | set_fs(KERNEL_DS); |
868 | ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); | 1121 | ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos); |
869 | set_fs(old_fs); | 1122 | set_fs(old_fs); |
870 | 1123 | ||
871 | if (ret < 0) | 1124 | if (ret < 0) |
872 | goto error_free; | 1125 | goto error_free; |
873 | 1126 | ||
874 | /* clear the last little bit */ | 1127 | /* clear the last little bit */ |
875 | if (ret < len) | 1128 | if (ret < rlen) |
876 | memset(base + ret, 0, len - ret); | 1129 | memset(base + ret, 0, rlen - ret); |
877 | 1130 | ||
878 | } else { | 1131 | } else { |
879 | /* if it's an anonymous mapping, then just clear it */ | 1132 | /* if it's an anonymous mapping, then just clear it */ |
880 | memset(base, 0, len); | 1133 | memset(base, 0, rlen); |
881 | } | 1134 | } |
882 | 1135 | ||
883 | return 0; | 1136 | return 0; |
884 | 1137 | ||
885 | error_free: | 1138 | error_free: |
886 | kfree(base); | 1139 | free_page_series(region->vm_start, region->vm_end); |
887 | vma->vm_start = 0; | 1140 | region->vm_start = vma->vm_start = 0; |
1141 | region->vm_end = vma->vm_end = 0; | ||
1142 | region->vm_top = 0; | ||
888 | return ret; | 1143 | return ret; |
889 | 1144 | ||
890 | enomem: | 1145 | enomem: |
@@ -904,13 +1159,14 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
904 | unsigned long flags, | 1159 | unsigned long flags, |
905 | unsigned long pgoff) | 1160 | unsigned long pgoff) |
906 | { | 1161 | { |
907 | struct vm_list_struct *vml = NULL; | 1162 | struct vm_area_struct *vma; |
908 | struct vm_area_struct *vma = NULL; | 1163 | struct vm_region *region; |
909 | struct rb_node *rb; | 1164 | struct rb_node *rb; |
910 | unsigned long capabilities, vm_flags; | 1165 | unsigned long capabilities, vm_flags, result; |
911 | void *result; | ||
912 | int ret; | 1166 | int ret; |
913 | 1167 | ||
1168 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); | ||
1169 | |||
914 | if (!(flags & MAP_FIXED)) | 1170 | if (!(flags & MAP_FIXED)) |
915 | addr = round_hint_to_min(addr); | 1171 | addr = round_hint_to_min(addr); |
916 | 1172 | ||
@@ -918,73 +1174,120 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
918 | * mapping */ | 1174 | * mapping */ |
919 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, | 1175 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, |
920 | &capabilities); | 1176 | &capabilities); |
921 | if (ret < 0) | 1177 | if (ret < 0) { |
1178 | kleave(" = %d [val]", ret); | ||
922 | return ret; | 1179 | return ret; |
1180 | } | ||
923 | 1181 | ||
924 | /* we've determined that we can make the mapping, now translate what we | 1182 | /* we've determined that we can make the mapping, now translate what we |
925 | * now know into VMA flags */ | 1183 | * now know into VMA flags */ |
926 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); | 1184 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); |
927 | 1185 | ||
928 | /* we're going to need to record the mapping if it works */ | 1186 | /* we're going to need to record the mapping */ |
929 | vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); | 1187 | region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); |
930 | if (!vml) | 1188 | if (!region) |
931 | goto error_getting_vml; | 1189 | goto error_getting_region; |
1190 | |||
1191 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | ||
1192 | if (!vma) | ||
1193 | goto error_getting_vma; | ||
1194 | |||
1195 | atomic_set(&region->vm_usage, 1); | ||
1196 | region->vm_flags = vm_flags; | ||
1197 | region->vm_pgoff = pgoff; | ||
1198 | |||
1199 | INIT_LIST_HEAD(&vma->anon_vma_node); | ||
1200 | vma->vm_flags = vm_flags; | ||
1201 | vma->vm_pgoff = pgoff; | ||
932 | 1202 | ||
933 | down_write(&nommu_vma_sem); | 1203 | if (file) { |
1204 | region->vm_file = file; | ||
1205 | get_file(file); | ||
1206 | vma->vm_file = file; | ||
1207 | get_file(file); | ||
1208 | if (vm_flags & VM_EXECUTABLE) { | ||
1209 | added_exe_file_vma(current->mm); | ||
1210 | vma->vm_mm = current->mm; | ||
1211 | } | ||
1212 | } | ||
934 | 1213 | ||
935 | /* if we want to share, we need to check for VMAs created by other | 1214 | down_write(&nommu_region_sem); |
1215 | |||
1216 | /* if we want to share, we need to check for regions created by other | ||
936 | * mmap() calls that overlap with our proposed mapping | 1217 | * mmap() calls that overlap with our proposed mapping |
937 | * - we can only share with an exact match on most regular files | 1218 | * - we can only share with a superset match on most regular files |
938 | * - shared mappings on character devices and memory backed files are | 1219 | * - shared mappings on character devices and memory backed files are |
939 | * permitted to overlap inexactly as far as we are concerned for in | 1220 | * permitted to overlap inexactly as far as we are concerned for in |
940 | * these cases, sharing is handled in the driver or filesystem rather | 1221 | * these cases, sharing is handled in the driver or filesystem rather |
941 | * than here | 1222 | * than here |
942 | */ | 1223 | */ |
943 | if (vm_flags & VM_MAYSHARE) { | 1224 | if (vm_flags & VM_MAYSHARE) { |
944 | unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; | 1225 | struct vm_region *pregion; |
945 | unsigned long vmpglen; | 1226 | unsigned long pglen, rpglen, pgend, rpgend, start; |
946 | 1227 | ||
947 | /* suppress VMA sharing for shared regions */ | 1228 | pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; |
948 | if (vm_flags & VM_SHARED && | 1229 | pgend = pgoff + pglen; |
949 | capabilities & BDI_CAP_MAP_DIRECT) | ||
950 | goto dont_share_VMAs; | ||
951 | 1230 | ||
952 | for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) { | 1231 | for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { |
953 | vma = rb_entry(rb, struct vm_area_struct, vm_rb); | 1232 | pregion = rb_entry(rb, struct vm_region, vm_rb); |
954 | 1233 | ||
955 | if (!(vma->vm_flags & VM_MAYSHARE)) | 1234 | if (!(pregion->vm_flags & VM_MAYSHARE)) |
956 | continue; | 1235 | continue; |
957 | 1236 | ||
958 | /* search for overlapping mappings on the same file */ | 1237 | /* search for overlapping mappings on the same file */ |
959 | if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) | 1238 | if (pregion->vm_file->f_path.dentry->d_inode != |
1239 | file->f_path.dentry->d_inode) | ||
960 | continue; | 1240 | continue; |
961 | 1241 | ||
962 | if (vma->vm_pgoff >= pgoff + pglen) | 1242 | if (pregion->vm_pgoff >= pgend) |
963 | continue; | 1243 | continue; |
964 | 1244 | ||
965 | vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1; | 1245 | rpglen = pregion->vm_end - pregion->vm_start; |
966 | vmpglen >>= PAGE_SHIFT; | 1246 | rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT; |
967 | if (pgoff >= vma->vm_pgoff + vmpglen) | 1247 | rpgend = pregion->vm_pgoff + rpglen; |
1248 | if (pgoff >= rpgend) | ||
968 | continue; | 1249 | continue; |
969 | 1250 | ||
970 | /* handle inexactly overlapping matches between mappings */ | 1251 | /* handle inexactly overlapping matches between |
971 | if (vma->vm_pgoff != pgoff || vmpglen != pglen) { | 1252 | * mappings */ |
1253 | if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && | ||
1254 | !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { | ||
1255 | /* new mapping is not a subset of the region */ | ||
972 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | 1256 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) |
973 | goto sharing_violation; | 1257 | goto sharing_violation; |
974 | continue; | 1258 | continue; |
975 | } | 1259 | } |
976 | 1260 | ||
977 | /* we've found a VMA we can share */ | 1261 | /* we've found a region we can share */ |
978 | atomic_inc(&vma->vm_usage); | 1262 | atomic_inc(&pregion->vm_usage); |
979 | 1263 | vma->vm_region = pregion; | |
980 | vml->vma = vma; | 1264 | start = pregion->vm_start; |
981 | result = (void *) vma->vm_start; | 1265 | start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; |
982 | goto shared; | 1266 | vma->vm_start = start; |
1267 | vma->vm_end = start + len; | ||
1268 | |||
1269 | if (pregion->vm_flags & VM_MAPPED_COPY) { | ||
1270 | kdebug("share copy"); | ||
1271 | vma->vm_flags |= VM_MAPPED_COPY; | ||
1272 | } else { | ||
1273 | kdebug("share mmap"); | ||
1274 | ret = do_mmap_shared_file(vma); | ||
1275 | if (ret < 0) { | ||
1276 | vma->vm_region = NULL; | ||
1277 | vma->vm_start = 0; | ||
1278 | vma->vm_end = 0; | ||
1279 | atomic_dec(&pregion->vm_usage); | ||
1280 | pregion = NULL; | ||
1281 | goto error_just_free; | ||
1282 | } | ||
1283 | } | ||
1284 | fput(region->vm_file); | ||
1285 | kmem_cache_free(vm_region_jar, region); | ||
1286 | region = pregion; | ||
1287 | result = start; | ||
1288 | goto share; | ||
983 | } | 1289 | } |
984 | 1290 | ||
985 | dont_share_VMAs: | ||
986 | vma = NULL; | ||
987 | |||
988 | /* obtain the address at which to make a shared mapping | 1291 | /* obtain the address at which to make a shared mapping |
989 | * - this is the hook for quasi-memory character devices to | 1292 | * - this is the hook for quasi-memory character devices to |
990 | * tell us the location of a shared mapping | 1293 | * tell us the location of a shared mapping |
@@ -995,113 +1298,93 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
995 | if (IS_ERR((void *) addr)) { | 1298 | if (IS_ERR((void *) addr)) { |
996 | ret = addr; | 1299 | ret = addr; |
997 | if (ret != (unsigned long) -ENOSYS) | 1300 | if (ret != (unsigned long) -ENOSYS) |
998 | goto error; | 1301 | goto error_just_free; |
999 | 1302 | ||
1000 | /* the driver refused to tell us where to site | 1303 | /* the driver refused to tell us where to site |
1001 | * the mapping so we'll have to attempt to copy | 1304 | * the mapping so we'll have to attempt to copy |
1002 | * it */ | 1305 | * it */ |
1003 | ret = (unsigned long) -ENODEV; | 1306 | ret = (unsigned long) -ENODEV; |
1004 | if (!(capabilities & BDI_CAP_MAP_COPY)) | 1307 | if (!(capabilities & BDI_CAP_MAP_COPY)) |
1005 | goto error; | 1308 | goto error_just_free; |
1006 | 1309 | ||
1007 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1310 | capabilities &= ~BDI_CAP_MAP_DIRECT; |
1311 | } else { | ||
1312 | vma->vm_start = region->vm_start = addr; | ||
1313 | vma->vm_end = region->vm_end = addr + len; | ||
1008 | } | 1314 | } |
1009 | } | 1315 | } |
1010 | } | 1316 | } |
1011 | 1317 | ||
1012 | /* we're going to need a VMA struct as well */ | 1318 | vma->vm_region = region; |
1013 | vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); | ||
1014 | if (!vma) | ||
1015 | goto error_getting_vma; | ||
1016 | |||
1017 | INIT_LIST_HEAD(&vma->anon_vma_node); | ||
1018 | atomic_set(&vma->vm_usage, 1); | ||
1019 | if (file) { | ||
1020 | get_file(file); | ||
1021 | if (vm_flags & VM_EXECUTABLE) { | ||
1022 | added_exe_file_vma(current->mm); | ||
1023 | vma->vm_mm = current->mm; | ||
1024 | } | ||
1025 | } | ||
1026 | vma->vm_file = file; | ||
1027 | vma->vm_flags = vm_flags; | ||
1028 | vma->vm_start = addr; | ||
1029 | vma->vm_end = addr + len; | ||
1030 | vma->vm_pgoff = pgoff; | ||
1031 | |||
1032 | vml->vma = vma; | ||
1033 | 1319 | ||
1034 | /* set up the mapping */ | 1320 | /* set up the mapping */ |
1035 | if (file && vma->vm_flags & VM_SHARED) | 1321 | if (file && vma->vm_flags & VM_SHARED) |
1036 | ret = do_mmap_shared_file(vma, len); | 1322 | ret = do_mmap_shared_file(vma); |
1037 | else | 1323 | else |
1038 | ret = do_mmap_private(vma, len); | 1324 | ret = do_mmap_private(vma, region, len); |
1039 | if (ret < 0) | 1325 | if (ret < 0) |
1040 | goto error; | 1326 | goto error_put_region; |
1041 | |||
1042 | /* okay... we have a mapping; now we have to register it */ | ||
1043 | result = (void *) vma->vm_start; | ||
1044 | 1327 | ||
1045 | if (vma->vm_flags & VM_MAPPED_COPY) { | 1328 | add_nommu_region(region); |
1046 | realalloc += kobjsize(result); | ||
1047 | askedalloc += len; | ||
1048 | } | ||
1049 | 1329 | ||
1050 | realalloc += kobjsize(vma); | 1330 | /* okay... we have a mapping; now we have to register it */ |
1051 | askedalloc += sizeof(*vma); | 1331 | result = vma->vm_start; |
1052 | 1332 | ||
1053 | current->mm->total_vm += len >> PAGE_SHIFT; | 1333 | current->mm->total_vm += len >> PAGE_SHIFT; |
1054 | 1334 | ||
1055 | add_nommu_vma(vma); | 1335 | share: |
1056 | 1336 | add_vma_to_mm(current->mm, vma); | |
1057 | shared: | ||
1058 | realalloc += kobjsize(vml); | ||
1059 | askedalloc += sizeof(*vml); | ||
1060 | |||
1061 | add_vma_to_mm(current->mm, vml); | ||
1062 | 1337 | ||
1063 | up_write(&nommu_vma_sem); | 1338 | up_write(&nommu_region_sem); |
1064 | 1339 | ||
1065 | if (prot & PROT_EXEC) | 1340 | if (prot & PROT_EXEC) |
1066 | flush_icache_range((unsigned long) result, | 1341 | flush_icache_range(result, result + len); |
1067 | (unsigned long) result + len); | ||
1068 | 1342 | ||
1069 | #ifdef DEBUG | 1343 | kleave(" = %lx", result); |
1070 | printk("do_mmap:\n"); | 1344 | return result; |
1071 | show_process_blocks(); | ||
1072 | #endif | ||
1073 | |||
1074 | return (unsigned long) result; | ||
1075 | 1345 | ||
1076 | error: | 1346 | error_put_region: |
1077 | up_write(&nommu_vma_sem); | 1347 | __put_nommu_region(region); |
1078 | kfree(vml); | ||
1079 | if (vma) { | 1348 | if (vma) { |
1080 | if (vma->vm_file) { | 1349 | if (vma->vm_file) { |
1081 | fput(vma->vm_file); | 1350 | fput(vma->vm_file); |
1082 | if (vma->vm_flags & VM_EXECUTABLE) | 1351 | if (vma->vm_flags & VM_EXECUTABLE) |
1083 | removed_exe_file_vma(vma->vm_mm); | 1352 | removed_exe_file_vma(vma->vm_mm); |
1084 | } | 1353 | } |
1085 | kfree(vma); | 1354 | kmem_cache_free(vm_area_cachep, vma); |
1086 | } | 1355 | } |
1356 | kleave(" = %d [pr]", ret); | ||
1087 | return ret; | 1357 | return ret; |
1088 | 1358 | ||
1089 | sharing_violation: | 1359 | error_just_free: |
1090 | up_write(&nommu_vma_sem); | 1360 | up_write(&nommu_region_sem); |
1091 | printk("Attempt to share mismatched mappings\n"); | 1361 | error: |
1092 | kfree(vml); | 1362 | fput(region->vm_file); |
1093 | return -EINVAL; | 1363 | kmem_cache_free(vm_region_jar, region); |
1364 | fput(vma->vm_file); | ||
1365 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1366 | removed_exe_file_vma(vma->vm_mm); | ||
1367 | kmem_cache_free(vm_area_cachep, vma); | ||
1368 | kleave(" = %d", ret); | ||
1369 | return ret; | ||
1094 | 1370 | ||
1095 | error_getting_vma: | 1371 | sharing_violation: |
1096 | up_write(&nommu_vma_sem); | 1372 | up_write(&nommu_region_sem); |
1097 | kfree(vml); | 1373 | printk(KERN_WARNING "Attempt to share mismatched mappings\n"); |
1098 | printk("Allocation of vma for %lu byte allocation from process %d failed\n", | 1374 | ret = -EINVAL; |
1375 | goto error; | ||
1376 | |||
1377 | error_getting_vma: | ||
1378 | kmem_cache_free(vm_region_jar, region); | ||
1379 | printk(KERN_WARNING "Allocation of vma for %lu byte allocation" | ||
1380 | " from process %d failed\n", | ||
1099 | len, current->pid); | 1381 | len, current->pid); |
1100 | show_free_areas(); | 1382 | show_free_areas(); |
1101 | return -ENOMEM; | 1383 | return -ENOMEM; |
1102 | 1384 | ||
1103 | error_getting_vml: | 1385 | error_getting_region: |
1104 | printk("Allocation of vml for %lu byte allocation from process %d failed\n", | 1386 | printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" |
1387 | " from process %d failed\n", | ||
1105 | len, current->pid); | 1388 | len, current->pid); |
1106 | show_free_areas(); | 1389 | show_free_areas(); |
1107 | return -ENOMEM; | 1390 | return -ENOMEM; |
@@ -1109,85 +1392,183 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1109 | EXPORT_SYMBOL(do_mmap_pgoff); | 1392 | EXPORT_SYMBOL(do_mmap_pgoff); |
1110 | 1393 | ||
1111 | /* | 1394 | /* |
1112 | * handle mapping disposal for uClinux | 1395 | * split a vma into two pieces at address 'addr', a new vma is allocated either |
1396 | * for the first part or the tail. | ||
1113 | */ | 1397 | */ |
1114 | static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma) | 1398 | int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, |
1399 | unsigned long addr, int new_below) | ||
1115 | { | 1400 | { |
1116 | if (vma) { | 1401 | struct vm_area_struct *new; |
1117 | down_write(&nommu_vma_sem); | 1402 | struct vm_region *region; |
1403 | unsigned long npages; | ||
1118 | 1404 | ||
1119 | if (atomic_dec_and_test(&vma->vm_usage)) { | 1405 | kenter(""); |
1120 | delete_nommu_vma(vma); | ||
1121 | 1406 | ||
1122 | if (vma->vm_ops && vma->vm_ops->close) | 1407 | /* we're only permitted to split anonymous regions that have a single |
1123 | vma->vm_ops->close(vma); | 1408 | * owner */ |
1409 | if (vma->vm_file || | ||
1410 | atomic_read(&vma->vm_region->vm_usage) != 1) | ||
1411 | return -ENOMEM; | ||
1124 | 1412 | ||
1125 | /* IO memory and memory shared directly out of the pagecache from | 1413 | if (mm->map_count >= sysctl_max_map_count) |
1126 | * ramfs/tmpfs mustn't be released here */ | 1414 | return -ENOMEM; |
1127 | if (vma->vm_flags & VM_MAPPED_COPY) { | ||
1128 | realalloc -= kobjsize((void *) vma->vm_start); | ||
1129 | askedalloc -= vma->vm_end - vma->vm_start; | ||
1130 | kfree((void *) vma->vm_start); | ||
1131 | } | ||
1132 | 1415 | ||
1133 | realalloc -= kobjsize(vma); | 1416 | region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); |
1134 | askedalloc -= sizeof(*vma); | 1417 | if (!region) |
1418 | return -ENOMEM; | ||
1135 | 1419 | ||
1136 | if (vma->vm_file) { | 1420 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
1137 | fput(vma->vm_file); | 1421 | if (!new) { |
1138 | if (vma->vm_flags & VM_EXECUTABLE) | 1422 | kmem_cache_free(vm_region_jar, region); |
1139 | removed_exe_file_vma(mm); | 1423 | return -ENOMEM; |
1140 | } | 1424 | } |
1141 | kfree(vma); | 1425 | |
1142 | } | 1426 | /* most fields are the same, copy all, and then fixup */ |
1427 | *new = *vma; | ||
1428 | *region = *vma->vm_region; | ||
1429 | new->vm_region = region; | ||
1430 | |||
1431 | npages = (addr - vma->vm_start) >> PAGE_SHIFT; | ||
1143 | 1432 | ||
1144 | up_write(&nommu_vma_sem); | 1433 | if (new_below) { |
1434 | region->vm_top = region->vm_end = new->vm_end = addr; | ||
1435 | } else { | ||
1436 | region->vm_start = new->vm_start = addr; | ||
1437 | region->vm_pgoff = new->vm_pgoff += npages; | ||
1438 | } | ||
1439 | |||
1440 | if (new->vm_ops && new->vm_ops->open) | ||
1441 | new->vm_ops->open(new); | ||
1442 | |||
1443 | delete_vma_from_mm(vma); | ||
1444 | down_write(&nommu_region_sem); | ||
1445 | delete_nommu_region(vma->vm_region); | ||
1446 | if (new_below) { | ||
1447 | vma->vm_region->vm_start = vma->vm_start = addr; | ||
1448 | vma->vm_region->vm_pgoff = vma->vm_pgoff += npages; | ||
1449 | } else { | ||
1450 | vma->vm_region->vm_end = vma->vm_end = addr; | ||
1451 | vma->vm_region->vm_top = addr; | ||
1145 | } | 1452 | } |
1453 | add_nommu_region(vma->vm_region); | ||
1454 | add_nommu_region(new->vm_region); | ||
1455 | up_write(&nommu_region_sem); | ||
1456 | add_vma_to_mm(mm, vma); | ||
1457 | add_vma_to_mm(mm, new); | ||
1458 | return 0; | ||
1146 | } | 1459 | } |
1147 | 1460 | ||
1148 | /* | 1461 | /* |
1149 | * release a mapping | 1462 | * shrink a VMA by removing the specified chunk from either the beginning or |
1150 | * - under NOMMU conditions the parameters must match exactly to the mapping to | 1463 | * the end |
1151 | * be removed | ||
1152 | */ | 1464 | */ |
1153 | int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | 1465 | static int shrink_vma(struct mm_struct *mm, |
1466 | struct vm_area_struct *vma, | ||
1467 | unsigned long from, unsigned long to) | ||
1154 | { | 1468 | { |
1155 | struct vm_list_struct *vml, **parent; | 1469 | struct vm_region *region; |
1156 | unsigned long end = addr + len; | ||
1157 | 1470 | ||
1158 | #ifdef DEBUG | 1471 | kenter(""); |
1159 | printk("do_munmap:\n"); | ||
1160 | #endif | ||
1161 | 1472 | ||
1162 | for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { | 1473 | /* adjust the VMA's pointers, which may reposition it in the MM's tree |
1163 | if ((*parent)->vma->vm_start > addr) | 1474 | * and list */ |
1164 | break; | 1475 | delete_vma_from_mm(vma); |
1165 | if ((*parent)->vma->vm_start == addr && | 1476 | if (from > vma->vm_start) |
1166 | ((len == 0) || ((*parent)->vma->vm_end == end))) | 1477 | vma->vm_end = from; |
1167 | goto found; | 1478 | else |
1479 | vma->vm_start = to; | ||
1480 | add_vma_to_mm(mm, vma); | ||
1481 | |||
1482 | /* cut the backing region down to size */ | ||
1483 | region = vma->vm_region; | ||
1484 | BUG_ON(atomic_read(&region->vm_usage) != 1); | ||
1485 | |||
1486 | down_write(&nommu_region_sem); | ||
1487 | delete_nommu_region(region); | ||
1488 | if (from > region->vm_start) { | ||
1489 | to = region->vm_top; | ||
1490 | region->vm_top = region->vm_end = from; | ||
1491 | } else { | ||
1492 | region->vm_start = to; | ||
1168 | } | 1493 | } |
1494 | add_nommu_region(region); | ||
1495 | up_write(&nommu_region_sem); | ||
1169 | 1496 | ||
1170 | printk("munmap of non-mmaped memory by process %d (%s): %p\n", | 1497 | free_page_series(from, to); |
1171 | current->pid, current->comm, (void *) addr); | 1498 | return 0; |
1172 | return -EINVAL; | 1499 | } |
1173 | 1500 | ||
1174 | found: | 1501 | /* |
1175 | vml = *parent; | 1502 | * release a mapping |
1503 | * - under NOMMU conditions the chunk to be unmapped must be backed by a single | ||
1504 | * VMA, though it need not cover the whole VMA | ||
1505 | */ | ||
1506 | int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | ||
1507 | { | ||
1508 | struct vm_area_struct *vma; | ||
1509 | struct rb_node *rb; | ||
1510 | unsigned long end = start + len; | ||
1511 | int ret; | ||
1176 | 1512 | ||
1177 | put_vma(mm, vml->vma); | 1513 | kenter(",%lx,%zx", start, len); |
1178 | 1514 | ||
1179 | *parent = vml->next; | 1515 | if (len == 0) |
1180 | realalloc -= kobjsize(vml); | 1516 | return -EINVAL; |
1181 | askedalloc -= sizeof(*vml); | ||
1182 | kfree(vml); | ||
1183 | 1517 | ||
1184 | update_hiwater_vm(mm); | 1518 | /* find the first potentially overlapping VMA */ |
1185 | mm->total_vm -= len >> PAGE_SHIFT; | 1519 | vma = find_vma(mm, start); |
1520 | if (!vma) { | ||
1521 | printk(KERN_WARNING | ||
1522 | "munmap of memory not mmapped by process %d (%s):" | ||
1523 | " 0x%lx-0x%lx\n", | ||
1524 | current->pid, current->comm, start, start + len - 1); | ||
1525 | return -EINVAL; | ||
1526 | } | ||
1186 | 1527 | ||
1187 | #ifdef DEBUG | 1528 | /* we're allowed to split an anonymous VMA but not a file-backed one */ |
1188 | show_process_blocks(); | 1529 | if (vma->vm_file) { |
1189 | #endif | 1530 | do { |
1531 | if (start > vma->vm_start) { | ||
1532 | kleave(" = -EINVAL [miss]"); | ||
1533 | return -EINVAL; | ||
1534 | } | ||
1535 | if (end == vma->vm_end) | ||
1536 | goto erase_whole_vma; | ||
1537 | rb = rb_next(&vma->vm_rb); | ||
1538 | vma = rb_entry(rb, struct vm_area_struct, vm_rb); | ||
1539 | } while (rb); | ||
1540 | kleave(" = -EINVAL [split file]"); | ||
1541 | return -EINVAL; | ||
1542 | } else { | ||
1543 | /* the chunk must be a subset of the VMA found */ | ||
1544 | if (start == vma->vm_start && end == vma->vm_end) | ||
1545 | goto erase_whole_vma; | ||
1546 | if (start < vma->vm_start || end > vma->vm_end) { | ||
1547 | kleave(" = -EINVAL [superset]"); | ||
1548 | return -EINVAL; | ||
1549 | } | ||
1550 | if (start & ~PAGE_MASK) { | ||
1551 | kleave(" = -EINVAL [unaligned start]"); | ||
1552 | return -EINVAL; | ||
1553 | } | ||
1554 | if (end != vma->vm_end && end & ~PAGE_MASK) { | ||
1555 | kleave(" = -EINVAL [unaligned split]"); | ||
1556 | return -EINVAL; | ||
1557 | } | ||
1558 | if (start != vma->vm_start && end != vma->vm_end) { | ||
1559 | ret = split_vma(mm, vma, start, 1); | ||
1560 | if (ret < 0) { | ||
1561 | kleave(" = %d [split]", ret); | ||
1562 | return ret; | ||
1563 | } | ||
1564 | } | ||
1565 | return shrink_vma(mm, vma, start, end); | ||
1566 | } | ||
1190 | 1567 | ||
1568 | erase_whole_vma: | ||
1569 | delete_vma_from_mm(vma); | ||
1570 | delete_vma(mm, vma); | ||
1571 | kleave(" = 0"); | ||
1191 | return 0; | 1572 | return 0; |
1192 | } | 1573 | } |
1193 | EXPORT_SYMBOL(do_munmap); | 1574 | EXPORT_SYMBOL(do_munmap); |
@@ -1204,32 +1585,26 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len) | |||
1204 | } | 1585 | } |
1205 | 1586 | ||
1206 | /* | 1587 | /* |
1207 | * Release all mappings | 1588 | * release all the mappings made in a process's VM space |
1208 | */ | 1589 | */ |
1209 | void exit_mmap(struct mm_struct * mm) | 1590 | void exit_mmap(struct mm_struct *mm) |
1210 | { | 1591 | { |
1211 | struct vm_list_struct *tmp; | 1592 | struct vm_area_struct *vma; |
1212 | |||
1213 | if (mm) { | ||
1214 | #ifdef DEBUG | ||
1215 | printk("Exit_mmap:\n"); | ||
1216 | #endif | ||
1217 | 1593 | ||
1218 | mm->total_vm = 0; | 1594 | if (!mm) |
1595 | return; | ||
1219 | 1596 | ||
1220 | while ((tmp = mm->context.vmlist)) { | 1597 | kenter(""); |
1221 | mm->context.vmlist = tmp->next; | ||
1222 | put_vma(mm, tmp->vma); | ||
1223 | 1598 | ||
1224 | realalloc -= kobjsize(tmp); | 1599 | mm->total_vm = 0; |
1225 | askedalloc -= sizeof(*tmp); | ||
1226 | kfree(tmp); | ||
1227 | } | ||
1228 | 1600 | ||
1229 | #ifdef DEBUG | 1601 | while ((vma = mm->mmap)) { |
1230 | show_process_blocks(); | 1602 | mm->mmap = vma->vm_next; |
1231 | #endif | 1603 | delete_vma_from_mm(vma); |
1604 | delete_vma(mm, vma); | ||
1232 | } | 1605 | } |
1606 | |||
1607 | kleave(""); | ||
1233 | } | 1608 | } |
1234 | 1609 | ||
1235 | unsigned long do_brk(unsigned long addr, unsigned long len) | 1610 | unsigned long do_brk(unsigned long addr, unsigned long len) |
@@ -1242,8 +1617,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
1242 | * time (controlled by the MREMAP_MAYMOVE flag and available VM space) | 1617 | * time (controlled by the MREMAP_MAYMOVE flag and available VM space) |
1243 | * | 1618 | * |
1244 | * under NOMMU conditions, we only permit changing a mapping's size, and only | 1619 | * under NOMMU conditions, we only permit changing a mapping's size, and only |
1245 | * as long as it stays within the hole allocated by the kmalloc() call in | 1620 | * as long as it stays within the region allocated by do_mmap_private() and the |
1246 | * do_mmap_pgoff() and the block is not shareable | 1621 | * block is not shareable |
1247 | * | 1622 | * |
1248 | * MREMAP_FIXED is not supported under NOMMU conditions | 1623 | * MREMAP_FIXED is not supported under NOMMU conditions |
1249 | */ | 1624 | */ |
@@ -1254,13 +1629,16 @@ unsigned long do_mremap(unsigned long addr, | |||
1254 | struct vm_area_struct *vma; | 1629 | struct vm_area_struct *vma; |
1255 | 1630 | ||
1256 | /* insanity checks first */ | 1631 | /* insanity checks first */ |
1257 | if (new_len == 0) | 1632 | if (old_len == 0 || new_len == 0) |
1258 | return (unsigned long) -EINVAL; | 1633 | return (unsigned long) -EINVAL; |
1259 | 1634 | ||
1635 | if (addr & ~PAGE_MASK) | ||
1636 | return -EINVAL; | ||
1637 | |||
1260 | if (flags & MREMAP_FIXED && new_addr != addr) | 1638 | if (flags & MREMAP_FIXED && new_addr != addr) |
1261 | return (unsigned long) -EINVAL; | 1639 | return (unsigned long) -EINVAL; |
1262 | 1640 | ||
1263 | vma = find_vma_exact(current->mm, addr); | 1641 | vma = find_vma_exact(current->mm, addr, old_len); |
1264 | if (!vma) | 1642 | if (!vma) |
1265 | return (unsigned long) -EINVAL; | 1643 | return (unsigned long) -EINVAL; |
1266 | 1644 | ||
@@ -1270,22 +1648,19 @@ unsigned long do_mremap(unsigned long addr, | |||
1270 | if (vma->vm_flags & VM_MAYSHARE) | 1648 | if (vma->vm_flags & VM_MAYSHARE) |
1271 | return (unsigned long) -EPERM; | 1649 | return (unsigned long) -EPERM; |
1272 | 1650 | ||
1273 | if (new_len > kobjsize((void *) addr)) | 1651 | if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) |
1274 | return (unsigned long) -ENOMEM; | 1652 | return (unsigned long) -ENOMEM; |
1275 | 1653 | ||
1276 | /* all checks complete - do it */ | 1654 | /* all checks complete - do it */ |
1277 | vma->vm_end = vma->vm_start + new_len; | 1655 | vma->vm_end = vma->vm_start + new_len; |
1278 | |||
1279 | askedalloc -= old_len; | ||
1280 | askedalloc += new_len; | ||
1281 | |||
1282 | return vma->vm_start; | 1656 | return vma->vm_start; |
1283 | } | 1657 | } |
1284 | EXPORT_SYMBOL(do_mremap); | 1658 | EXPORT_SYMBOL(do_mremap); |
1285 | 1659 | ||
1286 | asmlinkage unsigned long sys_mremap(unsigned long addr, | 1660 | asmlinkage |
1287 | unsigned long old_len, unsigned long new_len, | 1661 | unsigned long sys_mremap(unsigned long addr, |
1288 | unsigned long flags, unsigned long new_addr) | 1662 | unsigned long old_len, unsigned long new_len, |
1663 | unsigned long flags, unsigned long new_addr) | ||
1289 | { | 1664 | { |
1290 | unsigned long ret; | 1665 | unsigned long ret; |
1291 | 1666 | ||
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 558f9afe6e4e..40ba05061a4f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -31,7 +31,7 @@ | |||
31 | int sysctl_panic_on_oom; | 31 | int sysctl_panic_on_oom; |
32 | int sysctl_oom_kill_allocating_task; | 32 | int sysctl_oom_kill_allocating_task; |
33 | int sysctl_oom_dump_tasks; | 33 | int sysctl_oom_dump_tasks; |
34 | static DEFINE_SPINLOCK(zone_scan_mutex); | 34 | static DEFINE_SPINLOCK(zone_scan_lock); |
35 | /* #define DEBUG */ | 35 | /* #define DEBUG */ |
36 | 36 | ||
37 | /** | 37 | /** |
@@ -392,6 +392,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
392 | printk(KERN_WARNING "%s invoked oom-killer: " | 392 | printk(KERN_WARNING "%s invoked oom-killer: " |
393 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | 393 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", |
394 | current->comm, gfp_mask, order, current->oomkilladj); | 394 | current->comm, gfp_mask, order, current->oomkilladj); |
395 | task_lock(current); | ||
396 | cpuset_print_task_mems_allowed(current); | ||
397 | task_unlock(current); | ||
395 | dump_stack(); | 398 | dump_stack(); |
396 | show_mem(); | 399 | show_mem(); |
397 | if (sysctl_oom_dump_tasks) | 400 | if (sysctl_oom_dump_tasks) |
@@ -426,7 +429,6 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) | |||
426 | unsigned long points = 0; | 429 | unsigned long points = 0; |
427 | struct task_struct *p; | 430 | struct task_struct *p; |
428 | 431 | ||
429 | cgroup_lock(); | ||
430 | read_lock(&tasklist_lock); | 432 | read_lock(&tasklist_lock); |
431 | retry: | 433 | retry: |
432 | p = select_bad_process(&points, mem); | 434 | p = select_bad_process(&points, mem); |
@@ -441,7 +443,6 @@ retry: | |||
441 | goto retry; | 443 | goto retry; |
442 | out: | 444 | out: |
443 | read_unlock(&tasklist_lock); | 445 | read_unlock(&tasklist_lock); |
444 | cgroup_unlock(); | ||
445 | } | 446 | } |
446 | #endif | 447 | #endif |
447 | 448 | ||
@@ -470,7 +471,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
470 | struct zone *zone; | 471 | struct zone *zone; |
471 | int ret = 1; | 472 | int ret = 1; |
472 | 473 | ||
473 | spin_lock(&zone_scan_mutex); | 474 | spin_lock(&zone_scan_lock); |
474 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { | 475 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { |
475 | if (zone_is_oom_locked(zone)) { | 476 | if (zone_is_oom_locked(zone)) { |
476 | ret = 0; | 477 | ret = 0; |
@@ -480,7 +481,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
480 | 481 | ||
481 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { | 482 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { |
482 | /* | 483 | /* |
483 | * Lock each zone in the zonelist under zone_scan_mutex so a | 484 | * Lock each zone in the zonelist under zone_scan_lock so a |
484 | * parallel invocation of try_set_zone_oom() doesn't succeed | 485 | * parallel invocation of try_set_zone_oom() doesn't succeed |
485 | * when it shouldn't. | 486 | * when it shouldn't. |
486 | */ | 487 | */ |
@@ -488,7 +489,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
488 | } | 489 | } |
489 | 490 | ||
490 | out: | 491 | out: |
491 | spin_unlock(&zone_scan_mutex); | 492 | spin_unlock(&zone_scan_lock); |
492 | return ret; | 493 | return ret; |
493 | } | 494 | } |
494 | 495 | ||
@@ -502,11 +503,82 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
502 | struct zoneref *z; | 503 | struct zoneref *z; |
503 | struct zone *zone; | 504 | struct zone *zone; |
504 | 505 | ||
505 | spin_lock(&zone_scan_mutex); | 506 | spin_lock(&zone_scan_lock); |
506 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { | 507 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { |
507 | zone_clear_flag(zone, ZONE_OOM_LOCKED); | 508 | zone_clear_flag(zone, ZONE_OOM_LOCKED); |
508 | } | 509 | } |
509 | spin_unlock(&zone_scan_mutex); | 510 | spin_unlock(&zone_scan_lock); |
511 | } | ||
512 | |||
513 | /* | ||
514 | * Must be called with tasklist_lock held for read. | ||
515 | */ | ||
516 | static void __out_of_memory(gfp_t gfp_mask, int order) | ||
517 | { | ||
518 | if (sysctl_oom_kill_allocating_task) { | ||
519 | oom_kill_process(current, gfp_mask, order, 0, NULL, | ||
520 | "Out of memory (oom_kill_allocating_task)"); | ||
521 | |||
522 | } else { | ||
523 | unsigned long points; | ||
524 | struct task_struct *p; | ||
525 | |||
526 | retry: | ||
527 | /* | ||
528 | * Rambo mode: Shoot down a process and hope it solves whatever | ||
529 | * issues we may have. | ||
530 | */ | ||
531 | p = select_bad_process(&points, NULL); | ||
532 | |||
533 | if (PTR_ERR(p) == -1UL) | ||
534 | return; | ||
535 | |||
536 | /* Found nothing?!?! Either we hang forever, or we panic. */ | ||
537 | if (!p) { | ||
538 | read_unlock(&tasklist_lock); | ||
539 | panic("Out of memory and no killable processes...\n"); | ||
540 | } | ||
541 | |||
542 | if (oom_kill_process(p, gfp_mask, order, points, NULL, | ||
543 | "Out of memory")) | ||
544 | goto retry; | ||
545 | } | ||
546 | } | ||
547 | |||
548 | /* | ||
549 | * pagefault handler calls into here because it is out of memory but | ||
550 | * doesn't know exactly how or why. | ||
551 | */ | ||
552 | void pagefault_out_of_memory(void) | ||
553 | { | ||
554 | unsigned long freed = 0; | ||
555 | |||
556 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); | ||
557 | if (freed > 0) | ||
558 | /* Got some memory back in the last second. */ | ||
559 | return; | ||
560 | |||
561 | /* | ||
562 | * If this is from memcg, oom-killer is already invoked. | ||
563 | * and not worth to go system-wide-oom. | ||
564 | */ | ||
565 | if (mem_cgroup_oom_called(current)) | ||
566 | goto rest_and_return; | ||
567 | |||
568 | if (sysctl_panic_on_oom) | ||
569 | panic("out of memory from page fault. panic_on_oom is selected.\n"); | ||
570 | |||
571 | read_lock(&tasklist_lock); | ||
572 | __out_of_memory(0, 0); /* unknown gfp_mask and order */ | ||
573 | read_unlock(&tasklist_lock); | ||
574 | |||
575 | /* | ||
576 | * Give "p" a good chance of killing itself before we | ||
577 | * retry to allocate memory. | ||
578 | */ | ||
579 | rest_and_return: | ||
580 | if (!test_thread_flag(TIF_MEMDIE)) | ||
581 | schedule_timeout_uninterruptible(1); | ||
510 | } | 582 | } |
511 | 583 | ||
512 | /** | 584 | /** |
@@ -522,8 +594,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
522 | */ | 594 | */ |
523 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | 595 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) |
524 | { | 596 | { |
525 | struct task_struct *p; | ||
526 | unsigned long points = 0; | ||
527 | unsigned long freed = 0; | 597 | unsigned long freed = 0; |
528 | enum oom_constraint constraint; | 598 | enum oom_constraint constraint; |
529 | 599 | ||
@@ -544,7 +614,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
544 | 614 | ||
545 | switch (constraint) { | 615 | switch (constraint) { |
546 | case CONSTRAINT_MEMORY_POLICY: | 616 | case CONSTRAINT_MEMORY_POLICY: |
547 | oom_kill_process(current, gfp_mask, order, points, NULL, | 617 | oom_kill_process(current, gfp_mask, order, 0, NULL, |
548 | "No available memory (MPOL_BIND)"); | 618 | "No available memory (MPOL_BIND)"); |
549 | break; | 619 | break; |
550 | 620 | ||
@@ -553,35 +623,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
553 | panic("out of memory. panic_on_oom is selected\n"); | 623 | panic("out of memory. panic_on_oom is selected\n"); |
554 | /* Fall-through */ | 624 | /* Fall-through */ |
555 | case CONSTRAINT_CPUSET: | 625 | case CONSTRAINT_CPUSET: |
556 | if (sysctl_oom_kill_allocating_task) { | 626 | __out_of_memory(gfp_mask, order); |
557 | oom_kill_process(current, gfp_mask, order, points, NULL, | ||
558 | "Out of memory (oom_kill_allocating_task)"); | ||
559 | break; | ||
560 | } | ||
561 | retry: | ||
562 | /* | ||
563 | * Rambo mode: Shoot down a process and hope it solves whatever | ||
564 | * issues we may have. | ||
565 | */ | ||
566 | p = select_bad_process(&points, NULL); | ||
567 | |||
568 | if (PTR_ERR(p) == -1UL) | ||
569 | goto out; | ||
570 | |||
571 | /* Found nothing?!?! Either we hang forever, or we panic. */ | ||
572 | if (!p) { | ||
573 | read_unlock(&tasklist_lock); | ||
574 | panic("Out of memory and no killable processes...\n"); | ||
575 | } | ||
576 | |||
577 | if (oom_kill_process(p, gfp_mask, order, points, NULL, | ||
578 | "Out of memory")) | ||
579 | goto retry; | ||
580 | |||
581 | break; | 627 | break; |
582 | } | 628 | } |
583 | 629 | ||
584 | out: | ||
585 | read_unlock(&tasklist_lock); | 630 | read_unlock(&tasklist_lock); |
586 | 631 | ||
587 | /* | 632 | /* |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2970e35fd03f..b493db7841dc 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void) | |||
69 | int dirty_background_ratio = 5; | 69 | int dirty_background_ratio = 5; |
70 | 70 | ||
71 | /* | 71 | /* |
72 | * dirty_background_bytes starts at 0 (disabled) so that it is a function of | ||
73 | * dirty_background_ratio * the amount of dirtyable memory | ||
74 | */ | ||
75 | unsigned long dirty_background_bytes; | ||
76 | |||
77 | /* | ||
72 | * free highmem will not be subtracted from the total free memory | 78 | * free highmem will not be subtracted from the total free memory |
73 | * for calculating free ratios if vm_highmem_is_dirtyable is true | 79 | * for calculating free ratios if vm_highmem_is_dirtyable is true |
74 | */ | 80 | */ |
@@ -80,6 +86,12 @@ int vm_highmem_is_dirtyable; | |||
80 | int vm_dirty_ratio = 10; | 86 | int vm_dirty_ratio = 10; |
81 | 87 | ||
82 | /* | 88 | /* |
89 | * vm_dirty_bytes starts at 0 (disabled) so that it is a function of | ||
90 | * vm_dirty_ratio * the amount of dirtyable memory | ||
91 | */ | ||
92 | unsigned long vm_dirty_bytes; | ||
93 | |||
94 | /* | ||
83 | * The interval between `kupdate'-style writebacks, in jiffies | 95 | * The interval between `kupdate'-style writebacks, in jiffies |
84 | */ | 96 | */ |
85 | int dirty_writeback_interval = 5 * HZ; | 97 | int dirty_writeback_interval = 5 * HZ; |
@@ -135,23 +147,75 @@ static int calc_period_shift(void) | |||
135 | { | 147 | { |
136 | unsigned long dirty_total; | 148 | unsigned long dirty_total; |
137 | 149 | ||
138 | dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; | 150 | if (vm_dirty_bytes) |
151 | dirty_total = vm_dirty_bytes / PAGE_SIZE; | ||
152 | else | ||
153 | dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / | ||
154 | 100; | ||
139 | return 2 + ilog2(dirty_total - 1); | 155 | return 2 + ilog2(dirty_total - 1); |
140 | } | 156 | } |
141 | 157 | ||
142 | /* | 158 | /* |
143 | * update the period when the dirty ratio changes. | 159 | * update the period when the dirty threshold changes. |
144 | */ | 160 | */ |
161 | static void update_completion_period(void) | ||
162 | { | ||
163 | int shift = calc_period_shift(); | ||
164 | prop_change_shift(&vm_completions, shift); | ||
165 | prop_change_shift(&vm_dirties, shift); | ||
166 | } | ||
167 | |||
168 | int dirty_background_ratio_handler(struct ctl_table *table, int write, | ||
169 | struct file *filp, void __user *buffer, size_t *lenp, | ||
170 | loff_t *ppos) | ||
171 | { | ||
172 | int ret; | ||
173 | |||
174 | ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
175 | if (ret == 0 && write) | ||
176 | dirty_background_bytes = 0; | ||
177 | return ret; | ||
178 | } | ||
179 | |||
180 | int dirty_background_bytes_handler(struct ctl_table *table, int write, | ||
181 | struct file *filp, void __user *buffer, size_t *lenp, | ||
182 | loff_t *ppos) | ||
183 | { | ||
184 | int ret; | ||
185 | |||
186 | ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
187 | if (ret == 0 && write) | ||
188 | dirty_background_ratio = 0; | ||
189 | return ret; | ||
190 | } | ||
191 | |||
145 | int dirty_ratio_handler(struct ctl_table *table, int write, | 192 | int dirty_ratio_handler(struct ctl_table *table, int write, |
146 | struct file *filp, void __user *buffer, size_t *lenp, | 193 | struct file *filp, void __user *buffer, size_t *lenp, |
147 | loff_t *ppos) | 194 | loff_t *ppos) |
148 | { | 195 | { |
149 | int old_ratio = vm_dirty_ratio; | 196 | int old_ratio = vm_dirty_ratio; |
150 | int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | 197 | int ret; |
198 | |||
199 | ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
151 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { | 200 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { |
152 | int shift = calc_period_shift(); | 201 | update_completion_period(); |
153 | prop_change_shift(&vm_completions, shift); | 202 | vm_dirty_bytes = 0; |
154 | prop_change_shift(&vm_dirties, shift); | 203 | } |
204 | return ret; | ||
205 | } | ||
206 | |||
207 | |||
208 | int dirty_bytes_handler(struct ctl_table *table, int write, | ||
209 | struct file *filp, void __user *buffer, size_t *lenp, | ||
210 | loff_t *ppos) | ||
211 | { | ||
212 | unsigned long old_bytes = vm_dirty_bytes; /* same width as vm_dirty_bytes; an int snapshot would truncate */ | ||
213 | int ret; | ||
214 | |||
215 | ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
216 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { | ||
217 | update_completion_period(); | ||
218 | vm_dirty_ratio = 0; | ||
155 | } | 219 | } |
156 | return ret; | 220 | return ret; |
157 | } | 221 | } |
@@ -362,26 +426,32 @@ unsigned long determine_dirtyable_memory(void) | |||
362 | } | 426 | } |
363 | 427 | ||
364 | void | 428 | void |
365 | get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, | 429 | get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, |
366 | struct backing_dev_info *bdi) | 430 | unsigned long *pbdi_dirty, struct backing_dev_info *bdi) |
367 | { | 431 | { |
368 | int background_ratio; /* Percentages */ | 432 | unsigned long background; |
369 | int dirty_ratio; | 433 | unsigned long dirty; |
370 | long background; | ||
371 | long dirty; | ||
372 | unsigned long available_memory = determine_dirtyable_memory(); | 434 | unsigned long available_memory = determine_dirtyable_memory(); |
373 | struct task_struct *tsk; | 435 | struct task_struct *tsk; |
374 | 436 | ||
375 | dirty_ratio = vm_dirty_ratio; | 437 | if (vm_dirty_bytes) |
376 | if (dirty_ratio < 5) | 438 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); |
377 | dirty_ratio = 5; | 439 | else { |
440 | int dirty_ratio; | ||
378 | 441 | ||
379 | background_ratio = dirty_background_ratio; | 442 | dirty_ratio = vm_dirty_ratio; |
380 | if (background_ratio >= dirty_ratio) | 443 | if (dirty_ratio < 5) |
381 | background_ratio = dirty_ratio / 2; | 444 | dirty_ratio = 5; |
445 | dirty = (dirty_ratio * available_memory) / 100; | ||
446 | } | ||
447 | |||
448 | if (dirty_background_bytes) | ||
449 | background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); | ||
450 | else | ||
451 | background = (dirty_background_ratio * available_memory) / 100; | ||
382 | 452 | ||
383 | background = (background_ratio * available_memory) / 100; | 453 | if (background >= dirty) |
384 | dirty = (dirty_ratio * available_memory) / 100; | 454 | background = dirty / 2; |
385 | tsk = current; | 455 | tsk = current; |
386 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { | 456 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { |
387 | background += background / 4; | 457 | background += background / 4; |
@@ -423,9 +493,9 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
423 | { | 493 | { |
424 | long nr_reclaimable, bdi_nr_reclaimable; | 494 | long nr_reclaimable, bdi_nr_reclaimable; |
425 | long nr_writeback, bdi_nr_writeback; | 495 | long nr_writeback, bdi_nr_writeback; |
426 | long background_thresh; | 496 | unsigned long background_thresh; |
427 | long dirty_thresh; | 497 | unsigned long dirty_thresh; |
428 | long bdi_thresh; | 498 | unsigned long bdi_thresh; |
429 | unsigned long pages_written = 0; | 499 | unsigned long pages_written = 0; |
430 | unsigned long write_chunk = sync_writeback_pages(); | 500 | unsigned long write_chunk = sync_writeback_pages(); |
431 | 501 | ||
@@ -580,8 +650,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); | |||
580 | 650 | ||
581 | void throttle_vm_writeout(gfp_t gfp_mask) | 651 | void throttle_vm_writeout(gfp_t gfp_mask) |
582 | { | 652 | { |
583 | long background_thresh; | 653 | unsigned long background_thresh; |
584 | long dirty_thresh; | 654 | unsigned long dirty_thresh; |
585 | 655 | ||
586 | for ( ; ; ) { | 656 | for ( ; ; ) { |
587 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); | 657 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); |
@@ -624,8 +694,8 @@ static void background_writeout(unsigned long _min_pages) | |||
624 | }; | 694 | }; |
625 | 695 | ||
626 | for ( ; ; ) { | 696 | for ( ; ; ) { |
627 | long background_thresh; | 697 | unsigned long background_thresh; |
628 | long dirty_thresh; | 698 | unsigned long dirty_thresh; |
629 | 699 | ||
630 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); | 700 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); |
631 | if (global_page_state(NR_FILE_DIRTY) + | 701 | if (global_page_state(NR_FILE_DIRTY) + |
@@ -868,9 +938,11 @@ int write_cache_pages(struct address_space *mapping, | |||
868 | int done = 0; | 938 | int done = 0; |
869 | struct pagevec pvec; | 939 | struct pagevec pvec; |
870 | int nr_pages; | 940 | int nr_pages; |
941 | pgoff_t uninitialized_var(writeback_index); | ||
871 | pgoff_t index; | 942 | pgoff_t index; |
872 | pgoff_t end; /* Inclusive */ | 943 | pgoff_t end; /* Inclusive */ |
873 | int scanned = 0; | 944 | pgoff_t done_index; |
945 | int cycled; | ||
874 | int range_whole = 0; | 946 | int range_whole = 0; |
875 | long nr_to_write = wbc->nr_to_write; | 947 | long nr_to_write = wbc->nr_to_write; |
876 | 948 | ||
@@ -881,83 +953,134 @@ int write_cache_pages(struct address_space *mapping, | |||
881 | 953 | ||
882 | pagevec_init(&pvec, 0); | 954 | pagevec_init(&pvec, 0); |
883 | if (wbc->range_cyclic) { | 955 | if (wbc->range_cyclic) { |
884 | index = mapping->writeback_index; /* Start from prev offset */ | 956 | writeback_index = mapping->writeback_index; /* prev offset */ |
957 | index = writeback_index; | ||
958 | if (index == 0) | ||
959 | cycled = 1; | ||
960 | else | ||
961 | cycled = 0; | ||
885 | end = -1; | 962 | end = -1; |
886 | } else { | 963 | } else { |
887 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | 964 | index = wbc->range_start >> PAGE_CACHE_SHIFT; |
888 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | 965 | end = wbc->range_end >> PAGE_CACHE_SHIFT; |
889 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | 966 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
890 | range_whole = 1; | 967 | range_whole = 1; |
891 | scanned = 1; | 968 | cycled = 1; /* ignore range_cyclic tests */ |
892 | } | 969 | } |
893 | retry: | 970 | retry: |
894 | while (!done && (index <= end) && | 971 | done_index = index; |
895 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | 972 | while (!done && (index <= end)) { |
896 | PAGECACHE_TAG_DIRTY, | 973 | int i; |
897 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { | 974 | |
898 | unsigned i; | 975 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, |
976 | PAGECACHE_TAG_DIRTY, | ||
977 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); | ||
978 | if (nr_pages == 0) | ||
979 | break; | ||
899 | 980 | ||
900 | scanned = 1; | ||
901 | for (i = 0; i < nr_pages; i++) { | 981 | for (i = 0; i < nr_pages; i++) { |
902 | struct page *page = pvec.pages[i]; | 982 | struct page *page = pvec.pages[i]; |
903 | 983 | ||
904 | /* | 984 | /* |
905 | * At this point we hold neither mapping->tree_lock nor | 985 | * At this point, the page may be truncated or |
906 | * lock on the page itself: the page may be truncated or | 986 | * invalidated (changing page->mapping to NULL), or |
907 | * invalidated (changing page->mapping to NULL), or even | 987 | * even swizzled back from swapper_space to tmpfs file |
908 | * swizzled back from swapper_space to tmpfs file | 988 | * mapping. However, page->index will not change |
909 | * mapping | 989 | * because we have a reference on the page. |
910 | */ | 990 | */ |
991 | if (page->index > end) { | ||
992 | /* | ||
993 | * can't be range_cyclic (1st pass) because | ||
994 | * end == -1 in that case. | ||
995 | */ | ||
996 | done = 1; | ||
997 | break; | ||
998 | } | ||
999 | |||
1000 | done_index = page->index + 1; | ||
1001 | |||
911 | lock_page(page); | 1002 | lock_page(page); |
912 | 1003 | ||
1004 | /* | ||
1005 | * Page truncated or invalidated. We can freely skip it | ||
1006 | * then, even for data integrity operations: the page | ||
1007 | * has disappeared concurrently, so there could be no | ||
1008 | * real expectation of this data integrity operation | ||
1009 | * even if there is now a new, dirty page at the same | ||
1010 | * pagecache address. | ||
1011 | */ | ||
913 | if (unlikely(page->mapping != mapping)) { | 1012 | if (unlikely(page->mapping != mapping)) { |
1013 | continue_unlock: | ||
914 | unlock_page(page); | 1014 | unlock_page(page); |
915 | continue; | 1015 | continue; |
916 | } | 1016 | } |
917 | 1017 | ||
918 | if (!wbc->range_cyclic && page->index > end) { | 1018 | if (!PageDirty(page)) { |
919 | done = 1; | 1019 | /* someone wrote it for us */ |
920 | unlock_page(page); | 1020 | goto continue_unlock; |
921 | continue; | ||
922 | } | 1021 | } |
923 | 1022 | ||
924 | if (wbc->sync_mode != WB_SYNC_NONE) | 1023 | if (PageWriteback(page)) { |
925 | wait_on_page_writeback(page); | 1024 | if (wbc->sync_mode != WB_SYNC_NONE) |
926 | 1025 | wait_on_page_writeback(page); | |
927 | if (PageWriteback(page) || | 1026 | else |
928 | !clear_page_dirty_for_io(page)) { | 1027 | goto continue_unlock; |
929 | unlock_page(page); | ||
930 | continue; | ||
931 | } | 1028 | } |
932 | 1029 | ||
933 | ret = (*writepage)(page, wbc, data); | 1030 | BUG_ON(PageWriteback(page)); |
1031 | if (!clear_page_dirty_for_io(page)) | ||
1032 | goto continue_unlock; | ||
934 | 1033 | ||
935 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { | 1034 | ret = (*writepage)(page, wbc, data); |
936 | unlock_page(page); | 1035 | if (unlikely(ret)) { |
937 | ret = 0; | 1036 | if (ret == AOP_WRITEPAGE_ACTIVATE) { |
1037 | unlock_page(page); | ||
1038 | ret = 0; | ||
1039 | } else { | ||
1040 | /* | ||
1041 | * done_index is set past this page, | ||
1042 | * so media errors will not choke | ||
1043 | * background writeout for the entire | ||
1044 | * file. This has consequences for | ||
1045 | * range_cyclic semantics (ie. it may | ||
1046 | * not be suitable for data integrity | ||
1047 | * writeout). | ||
1048 | */ | ||
1049 | done = 1; | ||
1050 | break; | ||
1051 | } | ||
1052 | } | ||
1053 | |||
1054 | if (wbc->sync_mode == WB_SYNC_NONE) { | ||
1055 | wbc->nr_to_write--; | ||
1056 | if (wbc->nr_to_write <= 0) { | ||
1057 | done = 1; | ||
1058 | break; | ||
1059 | } | ||
938 | } | 1060 | } |
939 | if (ret || (--nr_to_write <= 0)) | ||
940 | done = 1; | ||
941 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | 1061 | if (wbc->nonblocking && bdi_write_congested(bdi)) { |
942 | wbc->encountered_congestion = 1; | 1062 | wbc->encountered_congestion = 1; |
943 | done = 1; | 1063 | done = 1; |
1064 | break; | ||
944 | } | 1065 | } |
945 | } | 1066 | } |
946 | pagevec_release(&pvec); | 1067 | pagevec_release(&pvec); |
947 | cond_resched(); | 1068 | cond_resched(); |
948 | } | 1069 | } |
949 | if (!scanned && !done) { | 1070 | if (!cycled) { |
950 | /* | 1071 | /* |
1072 | * range_cyclic: | ||
951 | * We hit the last page and there is more work to be done: wrap | 1073 | * We hit the last page and there is more work to be done: wrap |
952 | * back to the start of the file | 1074 | * back to the start of the file |
953 | */ | 1075 | */ |
954 | scanned = 1; | 1076 | cycled = 1; |
955 | index = 0; | 1077 | index = 0; |
1078 | end = writeback_index - 1; | ||
956 | goto retry; | 1079 | goto retry; |
957 | } | 1080 | } |
958 | if (!wbc->no_nrwrite_index_update) { | 1081 | if (!wbc->no_nrwrite_index_update) { |
959 | if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) | 1082 | if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) |
960 | mapping->writeback_index = index; | 1083 | mapping->writeback_index = done_index; |
961 | wbc->nr_to_write = nr_to_write; | 1084 | wbc->nr_to_write = nr_to_write; |
962 | } | 1085 | } |
963 | 1086 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d8ac01474563..5675b3073854 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -69,7 +69,7 @@ EXPORT_SYMBOL(node_states); | |||
69 | 69 | ||
70 | unsigned long totalram_pages __read_mostly; | 70 | unsigned long totalram_pages __read_mostly; |
71 | unsigned long totalreserve_pages __read_mostly; | 71 | unsigned long totalreserve_pages __read_mostly; |
72 | long nr_swap_pages; | 72 | unsigned long highest_memmap_pfn __read_mostly; |
73 | int percpu_pagelist_fraction; | 73 | int percpu_pagelist_fraction; |
74 | 74 | ||
75 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 75 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
@@ -223,19 +223,41 @@ static inline int bad_range(struct zone *zone, struct page *page) | |||
223 | 223 | ||
224 | static void bad_page(struct page *page) | 224 | static void bad_page(struct page *page) |
225 | { | 225 | { |
226 | printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG | 226 | static unsigned long resume; |
227 | "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", | 227 | static unsigned long nr_shown; |
228 | current->comm, page, (int)(2*sizeof(unsigned long)), | 228 | static unsigned long nr_unshown; |
229 | (unsigned long)page->flags, page->mapping, | 229 | |
230 | page_mapcount(page), page_count(page)); | 230 | /* |
231 | * Allow a burst of 60 reports, then keep quiet for that minute; | ||
232 | * or allow a steady drip of one report per second. | ||
233 | */ | ||
234 | if (nr_shown == 60) { | ||
235 | if (time_before(jiffies, resume)) { | ||
236 | nr_unshown++; | ||
237 | goto out; | ||
238 | } | ||
239 | if (nr_unshown) { | ||
240 | printk(KERN_ALERT | ||
241 | "BUG: Bad page state: %lu messages suppressed\n", | ||
242 | nr_unshown); | ||
243 | nr_unshown = 0; | ||
244 | } | ||
245 | nr_shown = 0; | ||
246 | } | ||
247 | if (nr_shown++ == 0) | ||
248 | resume = jiffies + 60 * HZ; | ||
249 | |||
250 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", | ||
251 | current->comm, page_to_pfn(page)); | ||
252 | printk(KERN_ALERT | ||
253 | "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", | ||
254 | page, (void *)page->flags, page_count(page), | ||
255 | page_mapcount(page), page->mapping, page->index); | ||
231 | 256 | ||
232 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" | ||
233 | KERN_EMERG "Backtrace:\n"); | ||
234 | dump_stack(); | 257 | dump_stack(); |
235 | page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; | 258 | out: |
236 | set_page_count(page, 0); | 259 | /* Leave bad fields for debug, except PageBuddy could make trouble */ |
237 | reset_page_mapcount(page); | 260 | __ClearPageBuddy(page); |
238 | page->mapping = NULL; | ||
239 | add_taint(TAINT_BAD_PAGE); | 261 | add_taint(TAINT_BAD_PAGE); |
240 | } | 262 | } |
241 | 263 | ||
@@ -292,25 +314,31 @@ void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
292 | } | 314 | } |
293 | #endif | 315 | #endif |
294 | 316 | ||
295 | static void destroy_compound_page(struct page *page, unsigned long order) | 317 | static int destroy_compound_page(struct page *page, unsigned long order) |
296 | { | 318 | { |
297 | int i; | 319 | int i; |
298 | int nr_pages = 1 << order; | 320 | int nr_pages = 1 << order; |
321 | int bad = 0; | ||
299 | 322 | ||
300 | if (unlikely(compound_order(page) != order)) | 323 | if (unlikely(compound_order(page) != order) || |
324 | unlikely(!PageHead(page))) { | ||
301 | bad_page(page); | 325 | bad_page(page); |
326 | bad++; | ||
327 | } | ||
302 | 328 | ||
303 | if (unlikely(!PageHead(page))) | ||
304 | bad_page(page); | ||
305 | __ClearPageHead(page); | 329 | __ClearPageHead(page); |
330 | |||
306 | for (i = 1; i < nr_pages; i++) { | 331 | for (i = 1; i < nr_pages; i++) { |
307 | struct page *p = page + i; | 332 | struct page *p = page + i; |
308 | 333 | ||
309 | if (unlikely(!PageTail(p) | | 334 | if (unlikely(!PageTail(p) | (p->first_page != page))) { |
310 | (p->first_page != page))) | ||
311 | bad_page(page); | 335 | bad_page(page); |
336 | bad++; | ||
337 | } | ||
312 | __ClearPageTail(p); | 338 | __ClearPageTail(p); |
313 | } | 339 | } |
340 | |||
341 | return bad; | ||
314 | } | 342 | } |
315 | 343 | ||
316 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | 344 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) |
@@ -430,7 +458,8 @@ static inline void __free_one_page(struct page *page, | |||
430 | int migratetype = get_pageblock_migratetype(page); | 458 | int migratetype = get_pageblock_migratetype(page); |
431 | 459 | ||
432 | if (unlikely(PageCompound(page))) | 460 | if (unlikely(PageCompound(page))) |
433 | destroy_compound_page(page, order); | 461 | if (unlikely(destroy_compound_page(page, order))) |
462 | return; | ||
434 | 463 | ||
435 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 464 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
436 | 465 | ||
@@ -467,18 +496,13 @@ static inline int free_pages_check(struct page *page) | |||
467 | if (unlikely(page_mapcount(page) | | 496 | if (unlikely(page_mapcount(page) | |
468 | (page->mapping != NULL) | | 497 | (page->mapping != NULL) | |
469 | (page_count(page) != 0) | | 498 | (page_count(page) != 0) | |
470 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) | 499 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { |
471 | bad_page(page); | 500 | bad_page(page); |
472 | if (PageDirty(page)) | 501 | return 1; |
473 | __ClearPageDirty(page); | 502 | } |
474 | if (PageSwapBacked(page)) | 503 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
475 | __ClearPageSwapBacked(page); | 504 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
476 | /* | 505 | return 0; |
477 | * For now, we report if PG_reserved was found set, but do not | ||
478 | * clear it, and do not free the page. But we shall soon need | ||
479 | * to do more, for when the ZERO_PAGE count wraps negative. | ||
480 | */ | ||
481 | return PageReserved(page); | ||
482 | } | 506 | } |
483 | 507 | ||
484 | /* | 508 | /* |
@@ -523,11 +547,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
523 | { | 547 | { |
524 | unsigned long flags; | 548 | unsigned long flags; |
525 | int i; | 549 | int i; |
526 | int reserved = 0; | 550 | int bad = 0; |
527 | 551 | ||
528 | for (i = 0 ; i < (1 << order) ; ++i) | 552 | for (i = 0 ; i < (1 << order) ; ++i) |
529 | reserved += free_pages_check(page + i); | 553 | bad += free_pages_check(page + i); |
530 | if (reserved) | 554 | if (bad) |
531 | return; | 555 | return; |
532 | 556 | ||
533 | if (!PageHighMem(page)) { | 557 | if (!PageHighMem(page)) { |
@@ -612,23 +636,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
612 | if (unlikely(page_mapcount(page) | | 636 | if (unlikely(page_mapcount(page) | |
613 | (page->mapping != NULL) | | 637 | (page->mapping != NULL) | |
614 | (page_count(page) != 0) | | 638 | (page_count(page) != 0) | |
615 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) | 639 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { |
616 | bad_page(page); | 640 | bad_page(page); |
617 | |||
618 | /* | ||
619 | * For now, we report if PG_reserved was found set, but do not | ||
620 | * clear it, and do not allocate the page: as a safety net. | ||
621 | */ | ||
622 | if (PageReserved(page)) | ||
623 | return 1; | 641 | return 1; |
642 | } | ||
624 | 643 | ||
625 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | | ||
626 | 1 << PG_referenced | 1 << PG_arch_1 | | ||
627 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk | ||
628 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
629 | | 1 << PG_mlocked | ||
630 | #endif | ||
631 | ); | ||
632 | set_page_private(page, 0); | 644 | set_page_private(page, 0); |
633 | set_page_refcounted(page); | 645 | set_page_refcounted(page); |
634 | 646 | ||
@@ -2609,6 +2621,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
2609 | unsigned long pfn; | 2621 | unsigned long pfn; |
2610 | struct zone *z; | 2622 | struct zone *z; |
2611 | 2623 | ||
2624 | if (highest_memmap_pfn < end_pfn - 1) | ||
2625 | highest_memmap_pfn = end_pfn - 1; | ||
2626 | |||
2612 | z = &NODE_DATA(nid)->node_zones[zone]; | 2627 | z = &NODE_DATA(nid)->node_zones[zone]; |
2613 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 2628 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
2614 | /* | 2629 | /* |
@@ -3381,10 +3396,8 @@ static void __init setup_usemap(struct pglist_data *pgdat, | |||
3381 | { | 3396 | { |
3382 | unsigned long usemapsize = usemap_size(zonesize); | 3397 | unsigned long usemapsize = usemap_size(zonesize); |
3383 | zone->pageblock_flags = NULL; | 3398 | zone->pageblock_flags = NULL; |
3384 | if (usemapsize) { | 3399 | if (usemapsize) |
3385 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); | 3400 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); |
3386 | memset(zone->pageblock_flags, 0, usemapsize); | ||
3387 | } | ||
3388 | } | 3401 | } |
3389 | #else | 3402 | #else |
3390 | static void inline setup_usemap(struct pglist_data *pgdat, | 3403 | static void inline setup_usemap(struct pglist_data *pgdat, |
@@ -3469,9 +3482,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
3469 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; | 3482 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; |
3470 | if (realsize >= memmap_pages) { | 3483 | if (realsize >= memmap_pages) { |
3471 | realsize -= memmap_pages; | 3484 | realsize -= memmap_pages; |
3472 | printk(KERN_DEBUG | 3485 | if (memmap_pages) |
3473 | " %s zone: %lu pages used for memmap\n", | 3486 | printk(KERN_DEBUG |
3474 | zone_names[j], memmap_pages); | 3487 | " %s zone: %lu pages used for memmap\n", |
3488 | zone_names[j], memmap_pages); | ||
3475 | } else | 3489 | } else |
3476 | printk(KERN_WARNING | 3490 | printk(KERN_WARNING |
3477 | " %s zone: %lu pages exceeds realsize %lu\n", | 3491 | " %s zone: %lu pages exceeds realsize %lu\n", |
@@ -3509,10 +3523,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
3509 | INIT_LIST_HEAD(&zone->lru[l].list); | 3523 | INIT_LIST_HEAD(&zone->lru[l].list); |
3510 | zone->lru[l].nr_scan = 0; | 3524 | zone->lru[l].nr_scan = 0; |
3511 | } | 3525 | } |
3512 | zone->recent_rotated[0] = 0; | 3526 | zone->reclaim_stat.recent_rotated[0] = 0; |
3513 | zone->recent_rotated[1] = 0; | 3527 | zone->reclaim_stat.recent_rotated[1] = 0; |
3514 | zone->recent_scanned[0] = 0; | 3528 | zone->reclaim_stat.recent_scanned[0] = 0; |
3515 | zone->recent_scanned[1] = 0; | 3529 | zone->reclaim_stat.recent_scanned[1] = 0; |
3516 | zap_zone_vm_stats(zone); | 3530 | zap_zone_vm_stats(zone); |
3517 | zone->flags = 0; | 3531 | zone->flags = 0; |
3518 | if (!size) | 3532 | if (!size) |
@@ -4316,7 +4330,7 @@ void setup_per_zone_pages_min(void) | |||
4316 | * 1TB 101 10GB | 4330 | * 1TB 101 10GB |
4317 | * 10TB 320 32GB | 4331 | * 10TB 320 32GB |
4318 | */ | 4332 | */ |
4319 | void setup_per_zone_inactive_ratio(void) | 4333 | static void setup_per_zone_inactive_ratio(void) |
4320 | { | 4334 | { |
4321 | struct zone *zone; | 4335 | struct zone *zone; |
4322 | 4336 | ||
@@ -4573,19 +4587,6 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4573 | return table; | 4587 | return table; |
4574 | } | 4588 | } |
4575 | 4589 | ||
4576 | #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE | ||
4577 | struct page *pfn_to_page(unsigned long pfn) | ||
4578 | { | ||
4579 | return __pfn_to_page(pfn); | ||
4580 | } | ||
4581 | unsigned long page_to_pfn(struct page *page) | ||
4582 | { | ||
4583 | return __page_to_pfn(page); | ||
4584 | } | ||
4585 | EXPORT_SYMBOL(pfn_to_page); | ||
4586 | EXPORT_SYMBOL(page_to_pfn); | ||
4587 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | ||
4588 | |||
4589 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ | 4590 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ |
4590 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, | 4591 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, |
4591 | unsigned long pfn) | 4592 | unsigned long pfn) |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index ab27ff750519..7006a11350c8 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/memory.h> | 8 | #include <linux/memory.h> |
9 | #include <linux/vmalloc.h> | 9 | #include <linux/vmalloc.h> |
10 | #include <linux/cgroup.h> | 10 | #include <linux/cgroup.h> |
11 | #include <linux/swapops.h> | ||
11 | 12 | ||
12 | static void __meminit | 13 | static void __meminit |
13 | __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) | 14 | __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) |
@@ -15,6 +16,7 @@ __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) | |||
15 | pc->flags = 0; | 16 | pc->flags = 0; |
16 | pc->mem_cgroup = NULL; | 17 | pc->mem_cgroup = NULL; |
17 | pc->page = pfn_to_page(pfn); | 18 | pc->page = pfn_to_page(pfn); |
19 | INIT_LIST_HEAD(&pc->lru); | ||
18 | } | 20 | } |
19 | static unsigned long total_usage; | 21 | static unsigned long total_usage; |
20 | 22 | ||
@@ -72,7 +74,7 @@ void __init page_cgroup_init(void) | |||
72 | 74 | ||
73 | int nid, fail; | 75 | int nid, fail; |
74 | 76 | ||
75 | if (mem_cgroup_subsys.disabled) | 77 | if (mem_cgroup_disabled()) |
76 | return; | 78 | return; |
77 | 79 | ||
78 | for_each_online_node(nid) { | 80 | for_each_online_node(nid) { |
@@ -101,15 +103,13 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) | |||
101 | } | 103 | } |
102 | 104 | ||
103 | /* __alloc_bootmem...() is protected by !slab_available() */ | 105 | /* __alloc_bootmem...() is protected by !slab_available() */ |
104 | int __init_refok init_section_page_cgroup(unsigned long pfn) | 106 | static int __init_refok init_section_page_cgroup(unsigned long pfn) |
105 | { | 107 | { |
106 | struct mem_section *section; | 108 | struct mem_section *section = __pfn_to_section(pfn); |
107 | struct page_cgroup *base, *pc; | 109 | struct page_cgroup *base, *pc; |
108 | unsigned long table_size; | 110 | unsigned long table_size; |
109 | int nid, index; | 111 | int nid, index; |
110 | 112 | ||
111 | section = __pfn_to_section(pfn); | ||
112 | |||
113 | if (!section->page_cgroup) { | 113 | if (!section->page_cgroup) { |
114 | nid = page_to_nid(pfn_to_page(pfn)); | 114 | nid = page_to_nid(pfn_to_page(pfn)); |
115 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | 115 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; |
@@ -145,7 +145,6 @@ int __init_refok init_section_page_cgroup(unsigned long pfn) | |||
145 | __init_page_cgroup(pc, pfn + index); | 145 | __init_page_cgroup(pc, pfn + index); |
146 | } | 146 | } |
147 | 147 | ||
148 | section = __pfn_to_section(pfn); | ||
149 | section->page_cgroup = base - pfn; | 148 | section->page_cgroup = base - pfn; |
150 | total_usage += table_size; | 149 | total_usage += table_size; |
151 | return 0; | 150 | return 0; |
@@ -248,7 +247,7 @@ void __init page_cgroup_init(void) | |||
248 | unsigned long pfn; | 247 | unsigned long pfn; |
249 | int fail = 0; | 248 | int fail = 0; |
250 | 249 | ||
251 | if (mem_cgroup_subsys.disabled) | 250 | if (mem_cgroup_disabled()) |
252 | return; | 251 | return; |
253 | 252 | ||
254 | for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { | 253 | for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { |
@@ -273,3 +272,199 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | |||
273 | } | 272 | } |
274 | 273 | ||
275 | #endif | 274 | #endif |
275 | |||
276 | |||
277 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
278 | |||
279 | static DEFINE_MUTEX(swap_cgroup_mutex); | ||
280 | struct swap_cgroup_ctrl { | ||
281 | struct page **map; | ||
282 | unsigned long length; | ||
283 | }; | ||
284 | |||
285 | struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; | ||
286 | |||
287 | /* | ||
288 | * This 8bytes seems big..maybe we can reduce this when we can use "id" for | ||
289 | * cgroup rather than pointer. | ||
290 | */ | ||
291 | struct swap_cgroup { | ||
292 | struct mem_cgroup *val; | ||
293 | }; | ||
294 | #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) | ||
295 | #define SC_POS_MASK (SC_PER_PAGE - 1) | ||
296 | |||
297 | /* | ||
298 | * SwapCgroup implements "lookup" and "exchange" operations. | ||
299 | * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge | ||
300 | * against SwapCache. At swap_free(), this is accessed directly from swap. | ||
301 | * | ||
302 | * This means, | ||
303 | * - we have no race in "exchange" when we're accessed via SwapCache because | ||
304 | * SwapCache(and its swp_entry) is under lock. | ||
305 | * - When called via swap_free(), there is no user of this entry and no race. | ||
306 | * Then, we don't need lock around "exchange". | ||
307 | * | ||
308 | * TODO: we can push these buffers out to HIGHMEM. | ||
309 | */ | ||
310 | |||
311 | /* | ||
312 | * allocate buffer for swap_cgroup. | ||
313 | */ | ||
314 | static int swap_cgroup_prepare(int type) | ||
315 | { | ||
316 | struct page *page; | ||
317 | struct swap_cgroup_ctrl *ctrl; | ||
318 | unsigned long idx, max; | ||
319 | |||
320 | if (!do_swap_account) | ||
321 | return 0; | ||
322 | ctrl = &swap_cgroup_ctrl[type]; | ||
323 | |||
324 | for (idx = 0; idx < ctrl->length; idx++) { | ||
325 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
326 | if (!page) | ||
327 | goto not_enough_page; | ||
328 | ctrl->map[idx] = page; | ||
329 | } | ||
330 | return 0; | ||
331 | not_enough_page: | ||
332 | max = idx; | ||
333 | for (idx = 0; idx < max; idx++) | ||
334 | __free_page(ctrl->map[idx]); | ||
335 | |||
336 | return -ENOMEM; | ||
337 | } | ||
338 | |||
339 | /** | ||
340 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | ||
341 | * @ent: swap entry to be recorded into | ||
342 | * @mem: mem_cgroup to be recorded | ||
343 | * | ||
344 | * Returns old value at success, NULL at failure. | ||
345 | * (Of course, old value can be NULL.) | ||
346 | */ | ||
347 | struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) | ||
348 | { | ||
349 | int type = swp_type(ent); | ||
350 | unsigned long offset = swp_offset(ent); | ||
351 | unsigned long idx = offset / SC_PER_PAGE; | ||
352 | unsigned long pos = offset & SC_POS_MASK; | ||
353 | struct swap_cgroup_ctrl *ctrl; | ||
354 | struct page *mappage; | ||
355 | struct swap_cgroup *sc; | ||
356 | struct mem_cgroup *old; | ||
357 | |||
358 | if (!do_swap_account) | ||
359 | return NULL; | ||
360 | |||
361 | ctrl = &swap_cgroup_ctrl[type]; | ||
362 | |||
363 | mappage = ctrl->map[idx]; | ||
364 | sc = page_address(mappage); | ||
365 | sc += pos; | ||
366 | old = sc->val; | ||
367 | sc->val = mem; | ||
368 | |||
369 | return old; | ||
370 | } | ||
371 | |||
372 | /** | ||
373 | * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry | ||
374 | * @ent: swap entry to be looked up. | ||
375 | * | ||
376 | * Returns pointer to mem_cgroup at success. NULL at failure. | ||
377 | */ | ||
378 | struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) | ||
379 | { | ||
380 | int type = swp_type(ent); | ||
381 | unsigned long offset = swp_offset(ent); | ||
382 | unsigned long idx = offset / SC_PER_PAGE; | ||
383 | unsigned long pos = offset & SC_POS_MASK; | ||
384 | struct swap_cgroup_ctrl *ctrl; | ||
385 | struct page *mappage; | ||
386 | struct swap_cgroup *sc; | ||
387 | struct mem_cgroup *ret; | ||
388 | |||
389 | if (!do_swap_account) | ||
390 | return NULL; | ||
391 | |||
392 | ctrl = &swap_cgroup_ctrl[type]; | ||
393 | mappage = ctrl->map[idx]; | ||
394 | sc = page_address(mappage); | ||
395 | sc += pos; | ||
396 | ret = sc->val; | ||
397 | return ret; | ||
398 | } | ||
399 | |||
400 | int swap_cgroup_swapon(int type, unsigned long max_pages) | ||
401 | { | ||
402 | void *array; | ||
403 | unsigned long array_size; | ||
404 | unsigned long length; | ||
405 | struct swap_cgroup_ctrl *ctrl; | ||
406 | |||
407 | if (!do_swap_account) | ||
408 | return 0; | ||
409 | |||
410 | length = ((max_pages/SC_PER_PAGE) + 1); | ||
411 | array_size = length * sizeof(void *); | ||
412 | |||
413 | array = vmalloc(array_size); | ||
414 | if (!array) | ||
415 | goto nomem; | ||
416 | |||
417 | memset(array, 0, array_size); | ||
418 | ctrl = &swap_cgroup_ctrl[type]; | ||
419 | mutex_lock(&swap_cgroup_mutex); | ||
420 | ctrl->length = length; | ||
421 | ctrl->map = array; | ||
422 | if (swap_cgroup_prepare(type)) { | ||
423 | /* memory shortage */ | ||
424 | ctrl->map = NULL; | ||
425 | ctrl->length = 0; | ||
426 | vfree(array); | ||
427 | mutex_unlock(&swap_cgroup_mutex); | ||
428 | goto nomem; | ||
429 | } | ||
430 | mutex_unlock(&swap_cgroup_mutex); | ||
431 | |||
432 | printk(KERN_INFO | ||
433 | "swap_cgroup: uses %ld bytes of vmalloc for pointer array space" | ||
434 | " and %ld bytes to hold mem_cgroup pointers on swap\n", | ||
435 | array_size, length * PAGE_SIZE); | ||
436 | printk(KERN_INFO | ||
437 | "swap_cgroup can be disabled by noswapaccount boot option.\n"); | ||
438 | |||
439 | return 0; | ||
440 | nomem: | ||
441 | printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); | ||
442 | printk(KERN_INFO | ||
443 | "swap_cgroup can be disabled by noswapaccount boot option\n"); | ||
444 | return -ENOMEM; | ||
445 | } | ||
446 | |||
447 | void swap_cgroup_swapoff(int type) | ||
448 | { | ||
449 | int i; | ||
450 | struct swap_cgroup_ctrl *ctrl; | ||
451 | |||
452 | if (!do_swap_account) | ||
453 | return; | ||
454 | |||
455 | mutex_lock(&swap_cgroup_mutex); | ||
456 | ctrl = &swap_cgroup_ctrl[type]; | ||
457 | if (ctrl->map) { | ||
458 | for (i = 0; i < ctrl->length; i++) { | ||
459 | struct page *page = ctrl->map[i]; | ||
460 | if (page) | ||
461 | __free_page(page); | ||
462 | } | ||
463 | vfree(ctrl->map); | ||
464 | ctrl->map = NULL; | ||
465 | ctrl->length = 0; | ||
466 | } | ||
467 | mutex_unlock(&swap_cgroup_mutex); | ||
468 | } | ||
469 | |||
470 | #endif | ||
diff --git a/mm/page_io.c b/mm/page_io.c index 065c4480eaf0..dc6ce0afbded 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -98,7 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
98 | struct bio *bio; | 98 | struct bio *bio; |
99 | int ret = 0, rw = WRITE; | 99 | int ret = 0, rw = WRITE; |
100 | 100 | ||
101 | if (remove_exclusive_swap_page(page)) { | 101 | if (try_to_free_swap(page)) { |
102 | unlock_page(page); | 102 | unlock_page(page); |
103 | goto out; | 103 | goto out; |
104 | } | 104 | } |
@@ -125,8 +125,8 @@ int swap_readpage(struct file *file, struct page *page) | |||
125 | struct bio *bio; | 125 | struct bio *bio; |
126 | int ret = 0; | 126 | int ret = 0; |
127 | 127 | ||
128 | BUG_ON(!PageLocked(page)); | 128 | VM_BUG_ON(!PageLocked(page)); |
129 | BUG_ON(PageUptodate(page)); | 129 | VM_BUG_ON(PageUptodate(page)); |
130 | bio = get_swap_bio(GFP_KERNEL, page_private(page), page, | 130 | bio = get_swap_bio(GFP_KERNEL, page_private(page), page, |
131 | end_swap_bio_read); | 131 | end_swap_bio_read); |
132 | if (bio == NULL) { | 132 | if (bio == NULL) { |
@@ -47,9 +47,9 @@ | |||
47 | #include <linux/rmap.h> | 47 | #include <linux/rmap.h> |
48 | #include <linux/rcupdate.h> | 48 | #include <linux/rcupdate.h> |
49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
50 | #include <linux/kallsyms.h> | ||
51 | #include <linux/memcontrol.h> | 50 | #include <linux/memcontrol.h> |
52 | #include <linux/mmu_notifier.h> | 51 | #include <linux/mmu_notifier.h> |
52 | #include <linux/migrate.h> | ||
53 | 53 | ||
54 | #include <asm/tlbflush.h> | 54 | #include <asm/tlbflush.h> |
55 | 55 | ||
@@ -191,7 +191,7 @@ void __init anon_vma_init(void) | |||
191 | * Getting a lock on a stable anon_vma from a page off the LRU is | 191 | * Getting a lock on a stable anon_vma from a page off the LRU is |
192 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. | 192 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. |
193 | */ | 193 | */ |
194 | struct anon_vma *page_lock_anon_vma(struct page *page) | 194 | static struct anon_vma *page_lock_anon_vma(struct page *page) |
195 | { | 195 | { |
196 | struct anon_vma *anon_vma; | 196 | struct anon_vma *anon_vma; |
197 | unsigned long anon_mapping; | 197 | unsigned long anon_mapping; |
@@ -211,7 +211,7 @@ out: | |||
211 | return NULL; | 211 | return NULL; |
212 | } | 212 | } |
213 | 213 | ||
214 | void page_unlock_anon_vma(struct anon_vma *anon_vma) | 214 | static void page_unlock_anon_vma(struct anon_vma *anon_vma) |
215 | { | 215 | { |
216 | spin_unlock(&anon_vma->lock); | 216 | spin_unlock(&anon_vma->lock); |
217 | rcu_read_unlock(); | 217 | rcu_read_unlock(); |
@@ -359,8 +359,17 @@ static int page_referenced_one(struct page *page, | |||
359 | goto out_unmap; | 359 | goto out_unmap; |
360 | } | 360 | } |
361 | 361 | ||
362 | if (ptep_clear_flush_young_notify(vma, address, pte)) | 362 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
363 | referenced++; | 363 | /* |
364 | * Don't treat a reference through a sequentially read | ||
365 | * mapping as such. If the page has been used in | ||
366 | * another mapping, we will catch it; if this other | ||
367 | * mapping is already gone, the unmap path will have | ||
368 | * set PG_referenced or activated the page. | ||
369 | */ | ||
370 | if (likely(!VM_SequentialReadHint(vma))) | ||
371 | referenced++; | ||
372 | } | ||
364 | 373 | ||
365 | /* Pretend the page is referenced if the task has the | 374 | /* Pretend the page is referenced if the task has the |
366 | swap token and is in the middle of a page fault. */ | 375 | swap token and is in the middle of a page fault. */ |
@@ -661,9 +670,14 @@ void page_add_anon_rmap(struct page *page, | |||
661 | void page_add_new_anon_rmap(struct page *page, | 670 | void page_add_new_anon_rmap(struct page *page, |
662 | struct vm_area_struct *vma, unsigned long address) | 671 | struct vm_area_struct *vma, unsigned long address) |
663 | { | 672 | { |
664 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 673 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
665 | atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ | 674 | SetPageSwapBacked(page); |
675 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | ||
666 | __page_set_anon_rmap(page, vma, address); | 676 | __page_set_anon_rmap(page, vma, address); |
677 | if (page_evictable(page, vma)) | ||
678 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | ||
679 | else | ||
680 | add_page_to_unevictable_list(page); | ||
667 | } | 681 | } |
668 | 682 | ||
669 | /** | 683 | /** |
@@ -693,7 +707,6 @@ void page_add_file_rmap(struct page *page) | |||
693 | */ | 707 | */ |
694 | void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) | 708 | void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) |
695 | { | 709 | { |
696 | BUG_ON(page_mapcount(page) == 0); | ||
697 | if (PageAnon(page)) | 710 | if (PageAnon(page)) |
698 | __page_check_anon_rmap(page, vma, address); | 711 | __page_check_anon_rmap(page, vma, address); |
699 | atomic_inc(&page->_mapcount); | 712 | atomic_inc(&page->_mapcount); |
@@ -703,28 +716,12 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long | |||
703 | /** | 716 | /** |
704 | * page_remove_rmap - take down pte mapping from a page | 717 | * page_remove_rmap - take down pte mapping from a page |
705 | * @page: page to remove mapping from | 718 | * @page: page to remove mapping from |
706 | * @vma: the vm area in which the mapping is removed | ||
707 | * | 719 | * |
708 | * The caller needs to hold the pte lock. | 720 | * The caller needs to hold the pte lock. |
709 | */ | 721 | */ |
710 | void page_remove_rmap(struct page *page, struct vm_area_struct *vma) | 722 | void page_remove_rmap(struct page *page) |
711 | { | 723 | { |
712 | if (atomic_add_negative(-1, &page->_mapcount)) { | 724 | if (atomic_add_negative(-1, &page->_mapcount)) { |
713 | if (unlikely(page_mapcount(page) < 0)) { | ||
714 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); | ||
715 | printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page)); | ||
716 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); | ||
717 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); | ||
718 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); | ||
719 | print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); | ||
720 | if (vma->vm_ops) { | ||
721 | print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); | ||
722 | } | ||
723 | if (vma->vm_file && vma->vm_file->f_op) | ||
724 | print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap); | ||
725 | BUG(); | ||
726 | } | ||
727 | |||
728 | /* | 725 | /* |
729 | * Now that the last pte has gone, s390 must transfer dirty | 726 | * Now that the last pte has gone, s390 must transfer dirty |
730 | * flag from storage key to struct page. We can usually skip | 727 | * flag from storage key to struct page. We can usually skip |
@@ -818,8 +815,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
818 | spin_unlock(&mmlist_lock); | 815 | spin_unlock(&mmlist_lock); |
819 | } | 816 | } |
820 | dec_mm_counter(mm, anon_rss); | 817 | dec_mm_counter(mm, anon_rss); |
821 | #ifdef CONFIG_MIGRATION | 818 | } else if (PAGE_MIGRATION) { |
822 | } else { | ||
823 | /* | 819 | /* |
824 | * Store the pfn of the page in a special migration | 820 | * Store the pfn of the page in a special migration |
825 | * pte. do_swap_page() will wait until the migration | 821 | * pte. do_swap_page() will wait until the migration |
@@ -827,23 +823,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
827 | */ | 823 | */ |
828 | BUG_ON(!migration); | 824 | BUG_ON(!migration); |
829 | entry = make_migration_entry(page, pte_write(pteval)); | 825 | entry = make_migration_entry(page, pte_write(pteval)); |
830 | #endif | ||
831 | } | 826 | } |
832 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 827 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
833 | BUG_ON(pte_file(*pte)); | 828 | BUG_ON(pte_file(*pte)); |
834 | } else | 829 | } else if (PAGE_MIGRATION && migration) { |
835 | #ifdef CONFIG_MIGRATION | ||
836 | if (migration) { | ||
837 | /* Establish migration entry for a file page */ | 830 | /* Establish migration entry for a file page */ |
838 | swp_entry_t entry; | 831 | swp_entry_t entry; |
839 | entry = make_migration_entry(page, pte_write(pteval)); | 832 | entry = make_migration_entry(page, pte_write(pteval)); |
840 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 833 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
841 | } else | 834 | } else |
842 | #endif | ||
843 | dec_mm_counter(mm, file_rss); | 835 | dec_mm_counter(mm, file_rss); |
844 | 836 | ||
845 | 837 | ||
846 | page_remove_rmap(page, vma); | 838 | page_remove_rmap(page); |
847 | page_cache_release(page); | 839 | page_cache_release(page); |
848 | 840 | ||
849 | out_unmap: | 841 | out_unmap: |
@@ -958,7 +950,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
958 | if (pte_dirty(pteval)) | 950 | if (pte_dirty(pteval)) |
959 | set_page_dirty(page); | 951 | set_page_dirty(page); |
960 | 952 | ||
961 | page_remove_rmap(page, vma); | 953 | page_remove_rmap(page); |
962 | page_cache_release(page); | 954 | page_cache_release(page); |
963 | dec_mm_counter(mm, file_rss); | 955 | dec_mm_counter(mm, file_rss); |
964 | (*mapcount)--; | 956 | (*mapcount)--; |
diff --git a/mm/shmem.c b/mm/shmem.c index f1b0d4871f3a..5d0de96c9789 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -14,31 +14,39 @@ | |||
14 | * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> | 14 | * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> |
15 | * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> | 15 | * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> |
16 | * | 16 | * |
17 | * tiny-shmem: | ||
18 | * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> | ||
19 | * | ||
17 | * This file is released under the GPL. | 20 | * This file is released under the GPL. |
18 | */ | 21 | */ |
19 | 22 | ||
23 | #include <linux/fs.h> | ||
24 | #include <linux/init.h> | ||
25 | #include <linux/vfs.h> | ||
26 | #include <linux/mount.h> | ||
27 | #include <linux/file.h> | ||
28 | #include <linux/mm.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/swap.h> | ||
31 | |||
32 | static struct vfsmount *shm_mnt; | ||
33 | |||
34 | #ifdef CONFIG_SHMEM | ||
20 | /* | 35 | /* |
21 | * This virtual memory filesystem is heavily based on the ramfs. It | 36 | * This virtual memory filesystem is heavily based on the ramfs. It |
22 | * extends ramfs by the ability to use swap and honor resource limits | 37 | * extends ramfs by the ability to use swap and honor resource limits |
23 | * which makes it a completely usable filesystem. | 38 | * which makes it a completely usable filesystem. |
24 | */ | 39 | */ |
25 | 40 | ||
26 | #include <linux/module.h> | ||
27 | #include <linux/init.h> | ||
28 | #include <linux/fs.h> | ||
29 | #include <linux/xattr.h> | 41 | #include <linux/xattr.h> |
30 | #include <linux/exportfs.h> | 42 | #include <linux/exportfs.h> |
31 | #include <linux/generic_acl.h> | 43 | #include <linux/generic_acl.h> |
32 | #include <linux/mm.h> | ||
33 | #include <linux/mman.h> | 44 | #include <linux/mman.h> |
34 | #include <linux/file.h> | ||
35 | #include <linux/swap.h> | ||
36 | #include <linux/pagemap.h> | 45 | #include <linux/pagemap.h> |
37 | #include <linux/string.h> | 46 | #include <linux/string.h> |
38 | #include <linux/slab.h> | 47 | #include <linux/slab.h> |
39 | #include <linux/backing-dev.h> | 48 | #include <linux/backing-dev.h> |
40 | #include <linux/shmem_fs.h> | 49 | #include <linux/shmem_fs.h> |
41 | #include <linux/mount.h> | ||
42 | #include <linux/writeback.h> | 50 | #include <linux/writeback.h> |
43 | #include <linux/vfs.h> | 51 | #include <linux/vfs.h> |
44 | #include <linux/blkdev.h> | 52 | #include <linux/blkdev.h> |
@@ -920,7 +928,11 @@ found: | |||
920 | error = 1; | 928 | error = 1; |
921 | if (!inode) | 929 | if (!inode) |
922 | goto out; | 930 | goto out; |
923 | /* Precharge page using GFP_KERNEL while we can wait */ | 931 | /* |
932 | * Charge page using GFP_KERNEL while we can wait. | ||
933 | * Charged back to the user(not to caller) when swap account is used. | ||
934 | * add_to_page_cache() will be called with GFP_NOWAIT. | ||
935 | */ | ||
924 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | 936 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); |
925 | if (error) | 937 | if (error) |
926 | goto out; | 938 | goto out; |
@@ -1312,15 +1324,19 @@ repeat: | |||
1312 | } else { | 1324 | } else { |
1313 | shmem_swp_unmap(entry); | 1325 | shmem_swp_unmap(entry); |
1314 | spin_unlock(&info->lock); | 1326 | spin_unlock(&info->lock); |
1315 | unlock_page(swappage); | ||
1316 | page_cache_release(swappage); | ||
1317 | if (error == -ENOMEM) { | 1327 | if (error == -ENOMEM) { |
1318 | /* allow reclaim from this memory cgroup */ | 1328 | /* allow reclaim from this memory cgroup */ |
1319 | error = mem_cgroup_shrink_usage(current->mm, | 1329 | error = mem_cgroup_shrink_usage(swappage, |
1330 | current->mm, | ||
1320 | gfp); | 1331 | gfp); |
1321 | if (error) | 1332 | if (error) { |
1333 | unlock_page(swappage); | ||
1334 | page_cache_release(swappage); | ||
1322 | goto failed; | 1335 | goto failed; |
1336 | } | ||
1323 | } | 1337 | } |
1338 | unlock_page(swappage); | ||
1339 | page_cache_release(swappage); | ||
1324 | goto repeat; | 1340 | goto repeat; |
1325 | } | 1341 | } |
1326 | } else if (sgp == SGP_READ && !filepage) { | 1342 | } else if (sgp == SGP_READ && !filepage) { |
@@ -1371,7 +1387,7 @@ repeat: | |||
1371 | 1387 | ||
1372 | /* Precharge page while we can wait, compensate after */ | 1388 | /* Precharge page while we can wait, compensate after */ |
1373 | error = mem_cgroup_cache_charge(filepage, current->mm, | 1389 | error = mem_cgroup_cache_charge(filepage, current->mm, |
1374 | gfp & ~__GFP_HIGHMEM); | 1390 | GFP_KERNEL); |
1375 | if (error) { | 1391 | if (error) { |
1376 | page_cache_release(filepage); | 1392 | page_cache_release(filepage); |
1377 | shmem_unacct_blocks(info->flags, 1); | 1393 | shmem_unacct_blocks(info->flags, 1); |
@@ -1444,7 +1460,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1444 | if (error) | 1460 | if (error) |
1445 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); | 1461 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); |
1446 | 1462 | ||
1447 | mark_page_accessed(vmf->page); | ||
1448 | return ret | VM_FAULT_LOCKED; | 1463 | return ret | VM_FAULT_LOCKED; |
1449 | } | 1464 | } |
1450 | 1465 | ||
@@ -2486,7 +2501,6 @@ static struct file_system_type tmpfs_fs_type = { | |||
2486 | .get_sb = shmem_get_sb, | 2501 | .get_sb = shmem_get_sb, |
2487 | .kill_sb = kill_litter_super, | 2502 | .kill_sb = kill_litter_super, |
2488 | }; | 2503 | }; |
2489 | static struct vfsmount *shm_mnt; | ||
2490 | 2504 | ||
2491 | static int __init init_tmpfs(void) | 2505 | static int __init init_tmpfs(void) |
2492 | { | 2506 | { |
@@ -2525,7 +2539,51 @@ out4: | |||
2525 | shm_mnt = ERR_PTR(error); | 2539 | shm_mnt = ERR_PTR(error); |
2526 | return error; | 2540 | return error; |
2527 | } | 2541 | } |
2528 | module_init(init_tmpfs) | 2542 | |
2543 | #else /* !CONFIG_SHMEM */ | ||
2544 | |||
2545 | /* | ||
2546 | * tiny-shmem: simple shmemfs and tmpfs using ramfs code | ||
2547 | * | ||
2548 | * This is intended for small system where the benefits of the full | ||
2549 | * shmem code (swap-backed and resource-limited) are outweighed by | ||
2550 | * their complexity. On systems without swap this code should be | ||
2551 | * effectively equivalent, but much lighter weight. | ||
2552 | */ | ||
2553 | |||
2554 | #include <linux/ramfs.h> | ||
2555 | |||
2556 | static struct file_system_type tmpfs_fs_type = { | ||
2557 | .name = "tmpfs", | ||
2558 | .get_sb = ramfs_get_sb, | ||
2559 | .kill_sb = kill_litter_super, | ||
2560 | }; | ||
2561 | |||
2562 | static int __init init_tmpfs(void) | ||
2563 | { | ||
2564 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); | ||
2565 | |||
2566 | shm_mnt = kern_mount(&tmpfs_fs_type); | ||
2567 | BUG_ON(IS_ERR(shm_mnt)); | ||
2568 | |||
2569 | return 0; | ||
2570 | } | ||
2571 | |||
2572 | int shmem_unuse(swp_entry_t entry, struct page *page) | ||
2573 | { | ||
2574 | return 0; | ||
2575 | } | ||
2576 | |||
2577 | #define shmem_file_operations ramfs_file_operations | ||
2578 | #define shmem_vm_ops generic_file_vm_ops | ||
2579 | #define shmem_get_inode ramfs_get_inode | ||
2580 | #define shmem_acct_size(a, b) 0 | ||
2581 | #define shmem_unacct_size(a, b) do {} while (0) | ||
2582 | #define SHMEM_MAX_BYTES LLONG_MAX | ||
2583 | |||
2584 | #endif /* CONFIG_SHMEM */ | ||
2585 | |||
2586 | /* common code */ | ||
2529 | 2587 | ||
2530 | /** | 2588 | /** |
2531 | * shmem_file_setup - get an unlinked file living in tmpfs | 2589 | * shmem_file_setup - get an unlinked file living in tmpfs |
@@ -2569,12 +2627,20 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
2569 | if (!inode) | 2627 | if (!inode) |
2570 | goto close_file; | 2628 | goto close_file; |
2571 | 2629 | ||
2630 | #ifdef CONFIG_SHMEM | ||
2572 | SHMEM_I(inode)->flags = flags & VM_ACCOUNT; | 2631 | SHMEM_I(inode)->flags = flags & VM_ACCOUNT; |
2632 | #endif | ||
2573 | d_instantiate(dentry, inode); | 2633 | d_instantiate(dentry, inode); |
2574 | inode->i_size = size; | 2634 | inode->i_size = size; |
2575 | inode->i_nlink = 0; /* It is unlinked */ | 2635 | inode->i_nlink = 0; /* It is unlinked */ |
2576 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | 2636 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, |
2577 | &shmem_file_operations); | 2637 | &shmem_file_operations); |
2638 | |||
2639 | #ifndef CONFIG_MMU | ||
2640 | error = ramfs_nommu_expand_for_mapping(inode, size); | ||
2641 | if (error) | ||
2642 | goto close_file; | ||
2643 | #endif | ||
2578 | return file; | 2644 | return file; |
2579 | 2645 | ||
2580 | close_file: | 2646 | close_file: |
@@ -2606,3 +2672,5 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
2606 | vma->vm_ops = &shmem_vm_ops; | 2672 | vma->vm_ops = &shmem_vm_ops; |
2607 | return 0; | 2673 | return 0; |
2608 | } | 2674 | } |
2675 | |||
2676 | module_init(init_tmpfs) | ||
@@ -2285,7 +2285,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2285 | * Add some empty padding so that we can catch | 2285 | * Add some empty padding so that we can catch |
2286 | * overwrites from earlier objects rather than let | 2286 | * overwrites from earlier objects rather than let |
2287 | * tracking information or the free pointer be | 2287 | * tracking information or the free pointer be |
2288 | * corrupted if an user writes before the start | 2288 | * corrupted if a user writes before the start |
2289 | * of the object. | 2289 | * of the object. |
2290 | */ | 2290 | */ |
2291 | size += sizeof(void *); | 2291 | size += sizeof(void *); |
@@ -151,6 +151,26 @@ void rotate_reclaimable_page(struct page *page) | |||
151 | } | 151 | } |
152 | } | 152 | } |
153 | 153 | ||
154 | static void update_page_reclaim_stat(struct zone *zone, struct page *page, | ||
155 | int file, int rotated) | ||
156 | { | ||
157 | struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; | ||
158 | struct zone_reclaim_stat *memcg_reclaim_stat; | ||
159 | |||
160 | memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page); | ||
161 | |||
162 | reclaim_stat->recent_scanned[file]++; | ||
163 | if (rotated) | ||
164 | reclaim_stat->recent_rotated[file]++; | ||
165 | |||
166 | if (!memcg_reclaim_stat) | ||
167 | return; | ||
168 | |||
169 | memcg_reclaim_stat->recent_scanned[file]++; | ||
170 | if (rotated) | ||
171 | memcg_reclaim_stat->recent_rotated[file]++; | ||
172 | } | ||
173 | |||
154 | /* | 174 | /* |
155 | * FIXME: speed this up? | 175 | * FIXME: speed this up? |
156 | */ | 176 | */ |
@@ -168,10 +188,8 @@ void activate_page(struct page *page) | |||
168 | lru += LRU_ACTIVE; | 188 | lru += LRU_ACTIVE; |
169 | add_page_to_lru_list(zone, page, lru); | 189 | add_page_to_lru_list(zone, page, lru); |
170 | __count_vm_event(PGACTIVATE); | 190 | __count_vm_event(PGACTIVATE); |
171 | mem_cgroup_move_lists(page, lru); | ||
172 | 191 | ||
173 | zone->recent_rotated[!!file]++; | 192 | update_page_reclaim_stat(zone, page, !!file, 1); |
174 | zone->recent_scanned[!!file]++; | ||
175 | } | 193 | } |
176 | spin_unlock_irq(&zone->lru_lock); | 194 | spin_unlock_irq(&zone->lru_lock); |
177 | } | 195 | } |
@@ -246,25 +264,6 @@ void add_page_to_unevictable_list(struct page *page) | |||
246 | spin_unlock_irq(&zone->lru_lock); | 264 | spin_unlock_irq(&zone->lru_lock); |
247 | } | 265 | } |
248 | 266 | ||
249 | /** | ||
250 | * lru_cache_add_active_or_unevictable | ||
251 | * @page: the page to be added to LRU | ||
252 | * @vma: vma in which page is mapped for determining reclaimability | ||
253 | * | ||
254 | * place @page on active or unevictable LRU list, depending on | ||
255 | * page_evictable(). Note that if the page is not evictable, | ||
256 | * it goes directly back onto it's zone's unevictable list. It does | ||
257 | * NOT use a per cpu pagevec. | ||
258 | */ | ||
259 | void lru_cache_add_active_or_unevictable(struct page *page, | ||
260 | struct vm_area_struct *vma) | ||
261 | { | ||
262 | if (page_evictable(page, vma)) | ||
263 | lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page)); | ||
264 | else | ||
265 | add_page_to_unevictable_list(page); | ||
266 | } | ||
267 | |||
268 | /* | 267 | /* |
269 | * Drain pages out of the cpu's pagevecs. | 268 | * Drain pages out of the cpu's pagevecs. |
270 | * Either "cpu" is the current CPU, and preemption has already been | 269 | * Either "cpu" is the current CPU, and preemption has already been |
@@ -398,28 +397,6 @@ void __pagevec_release(struct pagevec *pvec) | |||
398 | EXPORT_SYMBOL(__pagevec_release); | 397 | EXPORT_SYMBOL(__pagevec_release); |
399 | 398 | ||
400 | /* | 399 | /* |
401 | * pagevec_release() for pages which are known to not be on the LRU | ||
402 | * | ||
403 | * This function reinitialises the caller's pagevec. | ||
404 | */ | ||
405 | void __pagevec_release_nonlru(struct pagevec *pvec) | ||
406 | { | ||
407 | int i; | ||
408 | struct pagevec pages_to_free; | ||
409 | |||
410 | pagevec_init(&pages_to_free, pvec->cold); | ||
411 | for (i = 0; i < pagevec_count(pvec); i++) { | ||
412 | struct page *page = pvec->pages[i]; | ||
413 | |||
414 | VM_BUG_ON(PageLRU(page)); | ||
415 | if (put_page_testzero(page)) | ||
416 | pagevec_add(&pages_to_free, page); | ||
417 | } | ||
418 | pagevec_free(&pages_to_free); | ||
419 | pagevec_reinit(pvec); | ||
420 | } | ||
421 | |||
422 | /* | ||
423 | * Add the passed pages to the LRU, then drop the caller's refcount | 400 | * Add the passed pages to the LRU, then drop the caller's refcount |
424 | * on them. Reinitialises the caller's pagevec. | 401 | * on them. Reinitialises the caller's pagevec. |
425 | */ | 402 | */ |
@@ -427,12 +404,14 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) | |||
427 | { | 404 | { |
428 | int i; | 405 | int i; |
429 | struct zone *zone = NULL; | 406 | struct zone *zone = NULL; |
407 | |||
430 | VM_BUG_ON(is_unevictable_lru(lru)); | 408 | VM_BUG_ON(is_unevictable_lru(lru)); |
431 | 409 | ||
432 | for (i = 0; i < pagevec_count(pvec); i++) { | 410 | for (i = 0; i < pagevec_count(pvec); i++) { |
433 | struct page *page = pvec->pages[i]; | 411 | struct page *page = pvec->pages[i]; |
434 | struct zone *pagezone = page_zone(page); | 412 | struct zone *pagezone = page_zone(page); |
435 | int file; | 413 | int file; |
414 | int active; | ||
436 | 415 | ||
437 | if (pagezone != zone) { | 416 | if (pagezone != zone) { |
438 | if (zone) | 417 | if (zone) |
@@ -444,12 +423,11 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) | |||
444 | VM_BUG_ON(PageUnevictable(page)); | 423 | VM_BUG_ON(PageUnevictable(page)); |
445 | VM_BUG_ON(PageLRU(page)); | 424 | VM_BUG_ON(PageLRU(page)); |
446 | SetPageLRU(page); | 425 | SetPageLRU(page); |
426 | active = is_active_lru(lru); | ||
447 | file = is_file_lru(lru); | 427 | file = is_file_lru(lru); |
448 | zone->recent_scanned[file]++; | 428 | if (active) |
449 | if (is_active_lru(lru)) { | ||
450 | SetPageActive(page); | 429 | SetPageActive(page); |
451 | zone->recent_rotated[file]++; | 430 | update_page_reclaim_stat(zone, page, file, active); |
452 | } | ||
453 | add_page_to_lru_list(zone, page, lru); | 431 | add_page_to_lru_list(zone, page, lru); |
454 | } | 432 | } |
455 | if (zone) | 433 | if (zone) |
@@ -495,8 +473,7 @@ void pagevec_swap_free(struct pagevec *pvec) | |||
495 | struct page *page = pvec->pages[i]; | 473 | struct page *page = pvec->pages[i]; |
496 | 474 | ||
497 | if (PageSwapCache(page) && trylock_page(page)) { | 475 | if (PageSwapCache(page) && trylock_page(page)) { |
498 | if (PageSwapCache(page)) | 476 | try_to_free_swap(page); |
499 | remove_exclusive_swap_page_ref(page); | ||
500 | unlock_page(page); | 477 | unlock_page(page); |
501 | } | 478 | } |
502 | } | 479 | } |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 3353c9029cef..3ecea98ecb45 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/backing-dev.h> | 17 | #include <linux/backing-dev.h> |
18 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
19 | #include <linux/migrate.h> | 19 | #include <linux/migrate.h> |
20 | #include <linux/page_cgroup.h> | ||
20 | 21 | ||
21 | #include <asm/pgtable.h> | 22 | #include <asm/pgtable.h> |
22 | 23 | ||
@@ -72,10 +73,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
72 | { | 73 | { |
73 | int error; | 74 | int error; |
74 | 75 | ||
75 | BUG_ON(!PageLocked(page)); | 76 | VM_BUG_ON(!PageLocked(page)); |
76 | BUG_ON(PageSwapCache(page)); | 77 | VM_BUG_ON(PageSwapCache(page)); |
77 | BUG_ON(PagePrivate(page)); | 78 | VM_BUG_ON(!PageSwapBacked(page)); |
78 | BUG_ON(!PageSwapBacked(page)); | 79 | |
79 | error = radix_tree_preload(gfp_mask); | 80 | error = radix_tree_preload(gfp_mask); |
80 | if (!error) { | 81 | if (!error) { |
81 | page_cache_get(page); | 82 | page_cache_get(page); |
@@ -108,10 +109,11 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
108 | */ | 109 | */ |
109 | void __delete_from_swap_cache(struct page *page) | 110 | void __delete_from_swap_cache(struct page *page) |
110 | { | 111 | { |
111 | BUG_ON(!PageLocked(page)); | 112 | swp_entry_t ent = {.val = page_private(page)}; |
112 | BUG_ON(!PageSwapCache(page)); | 113 | |
113 | BUG_ON(PageWriteback(page)); | 114 | VM_BUG_ON(!PageLocked(page)); |
114 | BUG_ON(PagePrivate(page)); | 115 | VM_BUG_ON(!PageSwapCache(page)); |
116 | VM_BUG_ON(PageWriteback(page)); | ||
115 | 117 | ||
116 | radix_tree_delete(&swapper_space.page_tree, page_private(page)); | 118 | radix_tree_delete(&swapper_space.page_tree, page_private(page)); |
117 | set_page_private(page, 0); | 119 | set_page_private(page, 0); |
@@ -119,6 +121,7 @@ void __delete_from_swap_cache(struct page *page) | |||
119 | total_swapcache_pages--; | 121 | total_swapcache_pages--; |
120 | __dec_zone_page_state(page, NR_FILE_PAGES); | 122 | __dec_zone_page_state(page, NR_FILE_PAGES); |
121 | INC_CACHE_INFO(del_total); | 123 | INC_CACHE_INFO(del_total); |
124 | mem_cgroup_uncharge_swapcache(page, ent); | ||
122 | } | 125 | } |
123 | 126 | ||
124 | /** | 127 | /** |
@@ -129,13 +132,13 @@ void __delete_from_swap_cache(struct page *page) | |||
129 | * Allocate swap space for the page and add the page to the | 132 | * Allocate swap space for the page and add the page to the |
130 | * swap cache. Caller needs to hold the page lock. | 133 | * swap cache. Caller needs to hold the page lock. |
131 | */ | 134 | */ |
132 | int add_to_swap(struct page * page, gfp_t gfp_mask) | 135 | int add_to_swap(struct page *page) |
133 | { | 136 | { |
134 | swp_entry_t entry; | 137 | swp_entry_t entry; |
135 | int err; | 138 | int err; |
136 | 139 | ||
137 | BUG_ON(!PageLocked(page)); | 140 | VM_BUG_ON(!PageLocked(page)); |
138 | BUG_ON(!PageUptodate(page)); | 141 | VM_BUG_ON(!PageUptodate(page)); |
139 | 142 | ||
140 | for (;;) { | 143 | for (;;) { |
141 | entry = get_swap_page(); | 144 | entry = get_swap_page(); |
@@ -154,7 +157,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask) | |||
154 | * Add it to the swap cache and mark it dirty | 157 | * Add it to the swap cache and mark it dirty |
155 | */ | 158 | */ |
156 | err = add_to_swap_cache(page, entry, | 159 | err = add_to_swap_cache(page, entry, |
157 | gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); | 160 | __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); |
158 | 161 | ||
159 | switch (err) { | 162 | switch (err) { |
160 | case 0: /* Success */ | 163 | case 0: /* Success */ |
@@ -196,14 +199,14 @@ void delete_from_swap_cache(struct page *page) | |||
196 | * If we are the only user, then try to free up the swap cache. | 199 | * If we are the only user, then try to free up the swap cache. |
197 | * | 200 | * |
198 | * Its ok to check for PageSwapCache without the page lock | 201 | * Its ok to check for PageSwapCache without the page lock |
199 | * here because we are going to recheck again inside | 202 | * here because we are going to recheck again inside |
200 | * exclusive_swap_page() _with_ the lock. | 203 | * try_to_free_swap() _with_ the lock. |
201 | * - Marcelo | 204 | * - Marcelo |
202 | */ | 205 | */ |
203 | static inline void free_swap_cache(struct page *page) | 206 | static inline void free_swap_cache(struct page *page) |
204 | { | 207 | { |
205 | if (PageSwapCache(page) && trylock_page(page)) { | 208 | if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) { |
206 | remove_exclusive_swap_page(page); | 209 | try_to_free_swap(page); |
207 | unlock_page(page); | 210 | unlock_page(page); |
208 | } | 211 | } |
209 | } | 212 | } |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 54a9f87e5162..da422c47e2ee 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/namei.h> | 16 | #include <linux/namei.h> |
17 | #include <linux/shm.h> | 17 | #include <linux/shm.h> |
18 | #include <linux/blkdev.h> | 18 | #include <linux/blkdev.h> |
19 | #include <linux/random.h> | ||
19 | #include <linux/writeback.h> | 20 | #include <linux/writeback.h> |
20 | #include <linux/proc_fs.h> | 21 | #include <linux/proc_fs.h> |
21 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
@@ -32,9 +33,11 @@ | |||
32 | #include <asm/pgtable.h> | 33 | #include <asm/pgtable.h> |
33 | #include <asm/tlbflush.h> | 34 | #include <asm/tlbflush.h> |
34 | #include <linux/swapops.h> | 35 | #include <linux/swapops.h> |
36 | #include <linux/page_cgroup.h> | ||
35 | 37 | ||
36 | static DEFINE_SPINLOCK(swap_lock); | 38 | static DEFINE_SPINLOCK(swap_lock); |
37 | static unsigned int nr_swapfiles; | 39 | static unsigned int nr_swapfiles; |
40 | long nr_swap_pages; | ||
38 | long total_swap_pages; | 41 | long total_swap_pages; |
39 | static int swap_overflow; | 42 | static int swap_overflow; |
40 | static int least_priority; | 43 | static int least_priority; |
@@ -83,15 +86,96 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
83 | up_read(&swap_unplug_sem); | 86 | up_read(&swap_unplug_sem); |
84 | } | 87 | } |
85 | 88 | ||
89 | /* | ||
90 | * swapon tell device that all the old swap contents can be discarded, | ||
91 | * to allow the swap device to optimize its wear-levelling. | ||
92 | */ | ||
93 | static int discard_swap(struct swap_info_struct *si) | ||
94 | { | ||
95 | struct swap_extent *se; | ||
96 | int err = 0; | ||
97 | |||
98 | list_for_each_entry(se, &si->extent_list, list) { | ||
99 | sector_t start_block = se->start_block << (PAGE_SHIFT - 9); | ||
100 | sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); | ||
101 | |||
102 | if (se->start_page == 0) { | ||
103 | /* Do not discard the swap header page! */ | ||
104 | start_block += 1 << (PAGE_SHIFT - 9); | ||
105 | nr_blocks -= 1 << (PAGE_SHIFT - 9); | ||
106 | if (!nr_blocks) | ||
107 | continue; | ||
108 | } | ||
109 | |||
110 | err = blkdev_issue_discard(si->bdev, start_block, | ||
111 | nr_blocks, GFP_KERNEL); | ||
112 | if (err) | ||
113 | break; | ||
114 | |||
115 | cond_resched(); | ||
116 | } | ||
117 | return err; /* That will often be -EOPNOTSUPP */ | ||
118 | } | ||
119 | |||
120 | /* | ||
121 | * swap allocation tell device that a cluster of swap can now be discarded, | ||
122 | * to allow the swap device to optimize its wear-levelling. | ||
123 | */ | ||
124 | static void discard_swap_cluster(struct swap_info_struct *si, | ||
125 | pgoff_t start_page, pgoff_t nr_pages) | ||
126 | { | ||
127 | struct swap_extent *se = si->curr_swap_extent; | ||
128 | int found_extent = 0; | ||
129 | |||
130 | while (nr_pages) { | ||
131 | struct list_head *lh; | ||
132 | |||
133 | if (se->start_page <= start_page && | ||
134 | start_page < se->start_page + se->nr_pages) { | ||
135 | pgoff_t offset = start_page - se->start_page; | ||
136 | sector_t start_block = se->start_block + offset; | ||
137 | sector_t nr_blocks = se->nr_pages - offset; | ||
138 | |||
139 | if (nr_blocks > nr_pages) | ||
140 | nr_blocks = nr_pages; | ||
141 | start_page += nr_blocks; | ||
142 | nr_pages -= nr_blocks; | ||
143 | |||
144 | if (!found_extent++) | ||
145 | si->curr_swap_extent = se; | ||
146 | |||
147 | start_block <<= PAGE_SHIFT - 9; | ||
148 | nr_blocks <<= PAGE_SHIFT - 9; | ||
149 | if (blkdev_issue_discard(si->bdev, start_block, | ||
150 | nr_blocks, GFP_NOIO)) | ||
151 | break; | ||
152 | } | ||
153 | |||
154 | lh = se->list.next; | ||
155 | if (lh == &si->extent_list) | ||
156 | lh = lh->next; | ||
157 | se = list_entry(lh, struct swap_extent, list); | ||
158 | } | ||
159 | } | ||
160 | |||
161 | static int wait_for_discard(void *word) | ||
162 | { | ||
163 | schedule(); | ||
164 | return 0; | ||
165 | } | ||
166 | |||
86 | #define SWAPFILE_CLUSTER 256 | 167 | #define SWAPFILE_CLUSTER 256 |
87 | #define LATENCY_LIMIT 256 | 168 | #define LATENCY_LIMIT 256 |
88 | 169 | ||
89 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) | 170 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) |
90 | { | 171 | { |
91 | unsigned long offset, last_in_cluster; | 172 | unsigned long offset; |
173 | unsigned long scan_base; | ||
174 | unsigned long last_in_cluster = 0; | ||
92 | int latency_ration = LATENCY_LIMIT; | 175 | int latency_ration = LATENCY_LIMIT; |
176 | int found_free_cluster = 0; | ||
93 | 177 | ||
94 | /* | 178 | /* |
95 | * We try to cluster swap pages by allocating them sequentially | 179 | * We try to cluster swap pages by allocating them sequentially |
96 | * in swap. Once we've allocated SWAPFILE_CLUSTER pages this | 180 | * in swap. Once we've allocated SWAPFILE_CLUSTER pages this |
97 | * way, however, we resort to first-free allocation, starting | 181 | * way, however, we resort to first-free allocation, starting |
@@ -99,16 +183,42 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) | |||
99 | * all over the entire swap partition, so that we reduce | 183 | * all over the entire swap partition, so that we reduce |
100 | * overall disk seek times between swap pages. -- sct | 184 | * overall disk seek times between swap pages. -- sct |
101 | * But we do now try to find an empty cluster. -Andrea | 185 | * But we do now try to find an empty cluster. -Andrea |
186 | * And we let swap pages go all over an SSD partition. Hugh | ||
102 | */ | 187 | */ |
103 | 188 | ||
104 | si->flags += SWP_SCANNING; | 189 | si->flags += SWP_SCANNING; |
105 | if (unlikely(!si->cluster_nr)) { | 190 | scan_base = offset = si->cluster_next; |
106 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 191 | |
107 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) | 192 | if (unlikely(!si->cluster_nr--)) { |
108 | goto lowest; | 193 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { |
194 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | ||
195 | goto checks; | ||
196 | } | ||
197 | if (si->flags & SWP_DISCARDABLE) { | ||
198 | /* | ||
199 | * Start range check on racing allocations, in case | ||
200 | * they overlap the cluster we eventually decide on | ||
201 | * (we scan without swap_lock to allow preemption). | ||
202 | * It's hardly conceivable that cluster_nr could be | ||
203 | * wrapped during our scan, but don't depend on it. | ||
204 | */ | ||
205 | if (si->lowest_alloc) | ||
206 | goto checks; | ||
207 | si->lowest_alloc = si->max; | ||
208 | si->highest_alloc = 0; | ||
209 | } | ||
109 | spin_unlock(&swap_lock); | 210 | spin_unlock(&swap_lock); |
110 | 211 | ||
111 | offset = si->lowest_bit; | 212 | /* |
213 | * If seek is expensive, start searching for new cluster from | ||
214 | * start of partition, to minimize the span of allocated swap. | ||
215 | * But if seek is cheap, search from our current position, so | ||
216 | * that swap is allocated from all over the partition: if the | ||
217 | * Flash Translation Layer only remaps within limited zones, | ||
218 | * we don't want to wear out the first zone too quickly. | ||
219 | */ | ||
220 | if (!(si->flags & SWP_SOLIDSTATE)) | ||
221 | scan_base = offset = si->lowest_bit; | ||
112 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | 222 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; |
113 | 223 | ||
114 | /* Locate the first empty (unaligned) cluster */ | 224 | /* Locate the first empty (unaligned) cluster */ |
@@ -117,43 +227,124 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) | |||
117 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 227 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
118 | else if (offset == last_in_cluster) { | 228 | else if (offset == last_in_cluster) { |
119 | spin_lock(&swap_lock); | 229 | spin_lock(&swap_lock); |
120 | si->cluster_next = offset-SWAPFILE_CLUSTER+1; | 230 | offset -= SWAPFILE_CLUSTER - 1; |
121 | goto cluster; | 231 | si->cluster_next = offset; |
232 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | ||
233 | found_free_cluster = 1; | ||
234 | goto checks; | ||
122 | } | 235 | } |
123 | if (unlikely(--latency_ration < 0)) { | 236 | if (unlikely(--latency_ration < 0)) { |
124 | cond_resched(); | 237 | cond_resched(); |
125 | latency_ration = LATENCY_LIMIT; | 238 | latency_ration = LATENCY_LIMIT; |
126 | } | 239 | } |
127 | } | 240 | } |
241 | |||
242 | offset = si->lowest_bit; | ||
243 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | ||
244 | |||
245 | /* Locate the first empty (unaligned) cluster */ | ||
246 | for (; last_in_cluster < scan_base; offset++) { | ||
247 | if (si->swap_map[offset]) | ||
248 | last_in_cluster = offset + SWAPFILE_CLUSTER; | ||
249 | else if (offset == last_in_cluster) { | ||
250 | spin_lock(&swap_lock); | ||
251 | offset -= SWAPFILE_CLUSTER - 1; | ||
252 | si->cluster_next = offset; | ||
253 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | ||
254 | found_free_cluster = 1; | ||
255 | goto checks; | ||
256 | } | ||
257 | if (unlikely(--latency_ration < 0)) { | ||
258 | cond_resched(); | ||
259 | latency_ration = LATENCY_LIMIT; | ||
260 | } | ||
261 | } | ||
262 | |||
263 | offset = scan_base; | ||
128 | spin_lock(&swap_lock); | 264 | spin_lock(&swap_lock); |
129 | goto lowest; | 265 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
266 | si->lowest_alloc = 0; | ||
130 | } | 267 | } |
131 | 268 | ||
132 | si->cluster_nr--; | 269 | checks: |
133 | cluster: | 270 | if (!(si->flags & SWP_WRITEOK)) |
134 | offset = si->cluster_next; | ||
135 | if (offset > si->highest_bit) | ||
136 | lowest: offset = si->lowest_bit; | ||
137 | checks: if (!(si->flags & SWP_WRITEOK)) | ||
138 | goto no_page; | 271 | goto no_page; |
139 | if (!si->highest_bit) | 272 | if (!si->highest_bit) |
140 | goto no_page; | 273 | goto no_page; |
141 | if (!si->swap_map[offset]) { | 274 | if (offset > si->highest_bit) |
142 | if (offset == si->lowest_bit) | 275 | scan_base = offset = si->lowest_bit; |
143 | si->lowest_bit++; | 276 | if (si->swap_map[offset]) |
144 | if (offset == si->highest_bit) | 277 | goto scan; |
145 | si->highest_bit--; | 278 | |
146 | si->inuse_pages++; | 279 | if (offset == si->lowest_bit) |
147 | if (si->inuse_pages == si->pages) { | 280 | si->lowest_bit++; |
148 | si->lowest_bit = si->max; | 281 | if (offset == si->highest_bit) |
149 | si->highest_bit = 0; | 282 | si->highest_bit--; |
283 | si->inuse_pages++; | ||
284 | if (si->inuse_pages == si->pages) { | ||
285 | si->lowest_bit = si->max; | ||
286 | si->highest_bit = 0; | ||
287 | } | ||
288 | si->swap_map[offset] = 1; | ||
289 | si->cluster_next = offset + 1; | ||
290 | si->flags -= SWP_SCANNING; | ||
291 | |||
292 | if (si->lowest_alloc) { | ||
293 | /* | ||
294 | * Only set when SWP_DISCARDABLE, and there's a scan | ||
295 | * for a free cluster in progress or just completed. | ||
296 | */ | ||
297 | if (found_free_cluster) { | ||
298 | /* | ||
299 | * To optimize wear-levelling, discard the | ||
300 | * old data of the cluster, taking care not to | ||
301 | * discard any of its pages that have already | ||
302 | * been allocated by racing tasks (offset has | ||
303 | * already stepped over any at the beginning). | ||
304 | */ | ||
305 | if (offset < si->highest_alloc && | ||
306 | si->lowest_alloc <= last_in_cluster) | ||
307 | last_in_cluster = si->lowest_alloc - 1; | ||
308 | si->flags |= SWP_DISCARDING; | ||
309 | spin_unlock(&swap_lock); | ||
310 | |||
311 | if (offset < last_in_cluster) | ||
312 | discard_swap_cluster(si, offset, | ||
313 | last_in_cluster - offset + 1); | ||
314 | |||
315 | spin_lock(&swap_lock); | ||
316 | si->lowest_alloc = 0; | ||
317 | si->flags &= ~SWP_DISCARDING; | ||
318 | |||
319 | smp_mb(); /* wake_up_bit advises this */ | ||
320 | wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); | ||
321 | |||
322 | } else if (si->flags & SWP_DISCARDING) { | ||
323 | /* | ||
324 | * Delay using pages allocated by racing tasks | ||
325 | * until the whole discard has been issued. We | ||
326 | * could defer that delay until swap_writepage, | ||
327 | * but it's easier to keep this self-contained. | ||
328 | */ | ||
329 | spin_unlock(&swap_lock); | ||
330 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), | ||
331 | wait_for_discard, TASK_UNINTERRUPTIBLE); | ||
332 | spin_lock(&swap_lock); | ||
333 | } else { | ||
334 | /* | ||
335 | * Note pages allocated by racing tasks while | ||
336 | * scan for a free cluster is in progress, so | ||
337 | * that its final discard can exclude them. | ||
338 | */ | ||
339 | if (offset < si->lowest_alloc) | ||
340 | si->lowest_alloc = offset; | ||
341 | if (offset > si->highest_alloc) | ||
342 | si->highest_alloc = offset; | ||
150 | } | 343 | } |
151 | si->swap_map[offset] = 1; | ||
152 | si->cluster_next = offset + 1; | ||
153 | si->flags -= SWP_SCANNING; | ||
154 | return offset; | ||
155 | } | 344 | } |
345 | return offset; | ||
156 | 346 | ||
347 | scan: | ||
157 | spin_unlock(&swap_lock); | 348 | spin_unlock(&swap_lock); |
158 | while (++offset <= si->highest_bit) { | 349 | while (++offset <= si->highest_bit) { |
159 | if (!si->swap_map[offset]) { | 350 | if (!si->swap_map[offset]) { |
@@ -165,8 +356,18 @@ checks: if (!(si->flags & SWP_WRITEOK)) | |||
165 | latency_ration = LATENCY_LIMIT; | 356 | latency_ration = LATENCY_LIMIT; |
166 | } | 357 | } |
167 | } | 358 | } |
359 | offset = si->lowest_bit; | ||
360 | while (++offset < scan_base) { | ||
361 | if (!si->swap_map[offset]) { | ||
362 | spin_lock(&swap_lock); | ||
363 | goto checks; | ||
364 | } | ||
365 | if (unlikely(--latency_ration < 0)) { | ||
366 | cond_resched(); | ||
367 | latency_ration = LATENCY_LIMIT; | ||
368 | } | ||
369 | } | ||
168 | spin_lock(&swap_lock); | 370 | spin_lock(&swap_lock); |
169 | goto lowest; | ||
170 | 371 | ||
171 | no_page: | 372 | no_page: |
172 | si->flags -= SWP_SCANNING; | 373 | si->flags -= SWP_SCANNING; |
@@ -268,10 +469,11 @@ bad_nofile: | |||
268 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); | 469 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); |
269 | out: | 470 | out: |
270 | return NULL; | 471 | return NULL; |
271 | } | 472 | } |
272 | 473 | ||
273 | static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) | 474 | static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) |
274 | { | 475 | { |
476 | unsigned long offset = swp_offset(ent); | ||
275 | int count = p->swap_map[offset]; | 477 | int count = p->swap_map[offset]; |
276 | 478 | ||
277 | if (count < SWAP_MAP_MAX) { | 479 | if (count < SWAP_MAP_MAX) { |
@@ -286,6 +488,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) | |||
286 | swap_list.next = p - swap_info; | 488 | swap_list.next = p - swap_info; |
287 | nr_swap_pages++; | 489 | nr_swap_pages++; |
288 | p->inuse_pages--; | 490 | p->inuse_pages--; |
491 | mem_cgroup_uncharge_swap(ent); | ||
289 | } | 492 | } |
290 | } | 493 | } |
291 | return count; | 494 | return count; |
@@ -301,7 +504,7 @@ void swap_free(swp_entry_t entry) | |||
301 | 504 | ||
302 | p = swap_info_get(entry); | 505 | p = swap_info_get(entry); |
303 | if (p) { | 506 | if (p) { |
304 | swap_entry_free(p, swp_offset(entry)); | 507 | swap_entry_free(p, entry); |
305 | spin_unlock(&swap_lock); | 508 | spin_unlock(&swap_lock); |
306 | } | 509 | } |
307 | } | 510 | } |
@@ -326,101 +529,62 @@ static inline int page_swapcount(struct page *page) | |||
326 | } | 529 | } |
327 | 530 | ||
328 | /* | 531 | /* |
329 | * We can use this swap cache entry directly | 532 | * We can write to an anon page without COW if there are no other references |
330 | * if there are no other references to it. | 533 | * to it. And as a side-effect, free up its swap: because the old content |
534 | * on disk will never be read, and seeking back there to write new content | ||
535 | * later would only waste time away from clustering. | ||
331 | */ | 536 | */ |
332 | int can_share_swap_page(struct page *page) | 537 | int reuse_swap_page(struct page *page) |
333 | { | 538 | { |
334 | int count; | 539 | int count; |
335 | 540 | ||
336 | BUG_ON(!PageLocked(page)); | 541 | VM_BUG_ON(!PageLocked(page)); |
337 | count = page_mapcount(page); | 542 | count = page_mapcount(page); |
338 | if (count <= 1 && PageSwapCache(page)) | 543 | if (count <= 1 && PageSwapCache(page)) { |
339 | count += page_swapcount(page); | 544 | count += page_swapcount(page); |
545 | if (count == 1 && !PageWriteback(page)) { | ||
546 | delete_from_swap_cache(page); | ||
547 | SetPageDirty(page); | ||
548 | } | ||
549 | } | ||
340 | return count == 1; | 550 | return count == 1; |
341 | } | 551 | } |
342 | 552 | ||
343 | /* | 553 | /* |
344 | * Work out if there are any other processes sharing this | 554 | * If swap is getting full, or if there are no more mappings of this page, |
345 | * swap cache page. Free it if you can. Return success. | 555 | * then try_to_free_swap is called to free its swap space. |
346 | */ | 556 | */ |
347 | static int remove_exclusive_swap_page_count(struct page *page, int count) | 557 | int try_to_free_swap(struct page *page) |
348 | { | 558 | { |
349 | int retval; | 559 | VM_BUG_ON(!PageLocked(page)); |
350 | struct swap_info_struct * p; | ||
351 | swp_entry_t entry; | ||
352 | |||
353 | BUG_ON(PagePrivate(page)); | ||
354 | BUG_ON(!PageLocked(page)); | ||
355 | 560 | ||
356 | if (!PageSwapCache(page)) | 561 | if (!PageSwapCache(page)) |
357 | return 0; | 562 | return 0; |
358 | if (PageWriteback(page)) | 563 | if (PageWriteback(page)) |
359 | return 0; | 564 | return 0; |
360 | if (page_count(page) != count) /* us + cache + ptes */ | 565 | if (page_swapcount(page)) |
361 | return 0; | 566 | return 0; |
362 | 567 | ||
363 | entry.val = page_private(page); | 568 | delete_from_swap_cache(page); |
364 | p = swap_info_get(entry); | 569 | SetPageDirty(page); |
365 | if (!p) | 570 | return 1; |
366 | return 0; | ||
367 | |||
368 | /* Is the only swap cache user the cache itself? */ | ||
369 | retval = 0; | ||
370 | if (p->swap_map[swp_offset(entry)] == 1) { | ||
371 | /* Recheck the page count with the swapcache lock held.. */ | ||
372 | spin_lock_irq(&swapper_space.tree_lock); | ||
373 | if ((page_count(page) == count) && !PageWriteback(page)) { | ||
374 | __delete_from_swap_cache(page); | ||
375 | SetPageDirty(page); | ||
376 | retval = 1; | ||
377 | } | ||
378 | spin_unlock_irq(&swapper_space.tree_lock); | ||
379 | } | ||
380 | spin_unlock(&swap_lock); | ||
381 | |||
382 | if (retval) { | ||
383 | swap_free(entry); | ||
384 | page_cache_release(page); | ||
385 | } | ||
386 | |||
387 | return retval; | ||
388 | } | ||
389 | |||
390 | /* | ||
391 | * Most of the time the page should have two references: one for the | ||
392 | * process and one for the swap cache. | ||
393 | */ | ||
394 | int remove_exclusive_swap_page(struct page *page) | ||
395 | { | ||
396 | return remove_exclusive_swap_page_count(page, 2); | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * The pageout code holds an extra reference to the page. That raises | ||
401 | * the reference count to test for to 2 for a page that is only in the | ||
402 | * swap cache plus 1 for each process that maps the page. | ||
403 | */ | ||
404 | int remove_exclusive_swap_page_ref(struct page *page) | ||
405 | { | ||
406 | return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page)); | ||
407 | } | 571 | } |
408 | 572 | ||
409 | /* | 573 | /* |
410 | * Free the swap entry like above, but also try to | 574 | * Free the swap entry like above, but also try to |
411 | * free the page cache entry if it is the last user. | 575 | * free the page cache entry if it is the last user. |
412 | */ | 576 | */ |
413 | void free_swap_and_cache(swp_entry_t entry) | 577 | int free_swap_and_cache(swp_entry_t entry) |
414 | { | 578 | { |
415 | struct swap_info_struct * p; | 579 | struct swap_info_struct *p; |
416 | struct page *page = NULL; | 580 | struct page *page = NULL; |
417 | 581 | ||
418 | if (is_migration_entry(entry)) | 582 | if (is_migration_entry(entry)) |
419 | return; | 583 | return 1; |
420 | 584 | ||
421 | p = swap_info_get(entry); | 585 | p = swap_info_get(entry); |
422 | if (p) { | 586 | if (p) { |
423 | if (swap_entry_free(p, swp_offset(entry)) == 1) { | 587 | if (swap_entry_free(p, entry) == 1) { |
424 | page = find_get_page(&swapper_space, entry.val); | 588 | page = find_get_page(&swapper_space, entry.val); |
425 | if (page && !trylock_page(page)) { | 589 | if (page && !trylock_page(page)) { |
426 | page_cache_release(page); | 590 | page_cache_release(page); |
@@ -430,20 +594,19 @@ void free_swap_and_cache(swp_entry_t entry) | |||
430 | spin_unlock(&swap_lock); | 594 | spin_unlock(&swap_lock); |
431 | } | 595 | } |
432 | if (page) { | 596 | if (page) { |
433 | int one_user; | 597 | /* |
434 | 598 | * Not mapped elsewhere, or swap space full? Free it! | |
435 | BUG_ON(PagePrivate(page)); | 599 | * Also recheck PageSwapCache now page is locked (above). |
436 | one_user = (page_count(page) == 2); | 600 | */ |
437 | /* Only cache user (+us), or swap space full? Free it! */ | ||
438 | /* Also recheck PageSwapCache after page is locked (above) */ | ||
439 | if (PageSwapCache(page) && !PageWriteback(page) && | 601 | if (PageSwapCache(page) && !PageWriteback(page) && |
440 | (one_user || vm_swap_full())) { | 602 | (!page_mapped(page) || vm_swap_full())) { |
441 | delete_from_swap_cache(page); | 603 | delete_from_swap_cache(page); |
442 | SetPageDirty(page); | 604 | SetPageDirty(page); |
443 | } | 605 | } |
444 | unlock_page(page); | 606 | unlock_page(page); |
445 | page_cache_release(page); | 607 | page_cache_release(page); |
446 | } | 608 | } |
609 | return p != NULL; | ||
447 | } | 610 | } |
448 | 611 | ||
449 | #ifdef CONFIG_HIBERNATION | 612 | #ifdef CONFIG_HIBERNATION |
@@ -530,17 +693,18 @@ unsigned int count_swap_pages(int type, int free) | |||
530 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | 693 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, |
531 | unsigned long addr, swp_entry_t entry, struct page *page) | 694 | unsigned long addr, swp_entry_t entry, struct page *page) |
532 | { | 695 | { |
696 | struct mem_cgroup *ptr = NULL; | ||
533 | spinlock_t *ptl; | 697 | spinlock_t *ptl; |
534 | pte_t *pte; | 698 | pte_t *pte; |
535 | int ret = 1; | 699 | int ret = 1; |
536 | 700 | ||
537 | if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) | 701 | if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) |
538 | ret = -ENOMEM; | 702 | ret = -ENOMEM; |
539 | 703 | ||
540 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 704 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
541 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { | 705 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { |
542 | if (ret > 0) | 706 | if (ret > 0) |
543 | mem_cgroup_uncharge_page(page); | 707 | mem_cgroup_cancel_charge_swapin(ptr); |
544 | ret = 0; | 708 | ret = 0; |
545 | goto out; | 709 | goto out; |
546 | } | 710 | } |
@@ -550,6 +714,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
550 | set_pte_at(vma->vm_mm, addr, pte, | 714 | set_pte_at(vma->vm_mm, addr, pte, |
551 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 715 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
552 | page_add_anon_rmap(page, vma, addr); | 716 | page_add_anon_rmap(page, vma, addr); |
717 | mem_cgroup_commit_charge_swapin(page, ptr); | ||
553 | swap_free(entry); | 718 | swap_free(entry); |
554 | /* | 719 | /* |
555 | * Move the page to the active list so it is not | 720 | * Move the page to the active list so it is not |
@@ -776,10 +941,10 @@ static int try_to_unuse(unsigned int type) | |||
776 | break; | 941 | break; |
777 | } | 942 | } |
778 | 943 | ||
779 | /* | 944 | /* |
780 | * Get a page for the entry, using the existing swap | 945 | * Get a page for the entry, using the existing swap |
781 | * cache page if there is one. Otherwise, get a clean | 946 | * cache page if there is one. Otherwise, get a clean |
782 | * page and read the swap into it. | 947 | * page and read the swap into it. |
783 | */ | 948 | */ |
784 | swap_map = &si->swap_map[i]; | 949 | swap_map = &si->swap_map[i]; |
785 | entry = swp_entry(type, i); | 950 | entry = swp_entry(type, i); |
@@ -930,7 +1095,16 @@ static int try_to_unuse(unsigned int type) | |||
930 | lock_page(page); | 1095 | lock_page(page); |
931 | wait_on_page_writeback(page); | 1096 | wait_on_page_writeback(page); |
932 | } | 1097 | } |
933 | if (PageSwapCache(page)) | 1098 | |
1099 | /* | ||
1100 | * It is conceivable that a racing task removed this page from | ||
1101 | * swap cache just before we acquired the page lock at the top, | ||
1102 | * or while we dropped it in unuse_mm(). The page might even | ||
1103 | * be back in swap cache on another swap area: that we must not | ||
1104 | * delete, since it may not have been written out to swap yet. | ||
1105 | */ | ||
1106 | if (PageSwapCache(page) && | ||
1107 | likely(page_private(page) == entry.val)) | ||
934 | delete_from_swap_cache(page); | 1108 | delete_from_swap_cache(page); |
935 | 1109 | ||
936 | /* | 1110 | /* |
@@ -1203,26 +1377,6 @@ out: | |||
1203 | return ret; | 1377 | return ret; |
1204 | } | 1378 | } |
1205 | 1379 | ||
1206 | #if 0 /* We don't need this yet */ | ||
1207 | #include <linux/backing-dev.h> | ||
1208 | int page_queue_congested(struct page *page) | ||
1209 | { | ||
1210 | struct backing_dev_info *bdi; | ||
1211 | |||
1212 | BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ | ||
1213 | |||
1214 | if (PageSwapCache(page)) { | ||
1215 | swp_entry_t entry = { .val = page_private(page) }; | ||
1216 | struct swap_info_struct *sis; | ||
1217 | |||
1218 | sis = get_swap_info_struct(swp_type(entry)); | ||
1219 | bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info; | ||
1220 | } else | ||
1221 | bdi = page->mapping->backing_dev_info; | ||
1222 | return bdi_write_congested(bdi); | ||
1223 | } | ||
1224 | #endif | ||
1225 | |||
1226 | asmlinkage long sys_swapoff(const char __user * specialfile) | 1380 | asmlinkage long sys_swapoff(const char __user * specialfile) |
1227 | { | 1381 | { |
1228 | struct swap_info_struct * p = NULL; | 1382 | struct swap_info_struct * p = NULL; |
@@ -1233,7 +1387,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) | |||
1233 | char * pathname; | 1387 | char * pathname; |
1234 | int i, type, prev; | 1388 | int i, type, prev; |
1235 | int err; | 1389 | int err; |
1236 | 1390 | ||
1237 | if (!capable(CAP_SYS_ADMIN)) | 1391 | if (!capable(CAP_SYS_ADMIN)) |
1238 | return -EPERM; | 1392 | return -EPERM; |
1239 | 1393 | ||
@@ -1253,7 +1407,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) | |||
1253 | spin_lock(&swap_lock); | 1407 | spin_lock(&swap_lock); |
1254 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { | 1408 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { |
1255 | p = swap_info + type; | 1409 | p = swap_info + type; |
1256 | if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { | 1410 | if (p->flags & SWP_WRITEOK) { |
1257 | if (p->swap_file->f_mapping == mapping) | 1411 | if (p->swap_file->f_mapping == mapping) |
1258 | break; | 1412 | break; |
1259 | } | 1413 | } |
@@ -1343,6 +1497,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile) | |||
1343 | spin_unlock(&swap_lock); | 1497 | spin_unlock(&swap_lock); |
1344 | mutex_unlock(&swapon_mutex); | 1498 | mutex_unlock(&swapon_mutex); |
1345 | vfree(swap_map); | 1499 | vfree(swap_map); |
1500 | /* Destroy swap account information */ | ||
1501 | swap_cgroup_swapoff(type); | ||
1502 | |||
1346 | inode = mapping->host; | 1503 | inode = mapping->host; |
1347 | if (S_ISBLK(inode->i_mode)) { | 1504 | if (S_ISBLK(inode->i_mode)) { |
1348 | struct block_device *bdev = I_BDEV(inode); | 1505 | struct block_device *bdev = I_BDEV(inode); |
@@ -1426,12 +1583,12 @@ static int swap_show(struct seq_file *swap, void *v) | |||
1426 | file = ptr->swap_file; | 1583 | file = ptr->swap_file; |
1427 | len = seq_path(swap, &file->f_path, " \t\n\\"); | 1584 | len = seq_path(swap, &file->f_path, " \t\n\\"); |
1428 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", | 1585 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
1429 | len < 40 ? 40 - len : 1, " ", | 1586 | len < 40 ? 40 - len : 1, " ", |
1430 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? | 1587 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? |
1431 | "partition" : "file\t", | 1588 | "partition" : "file\t", |
1432 | ptr->pages << (PAGE_SHIFT - 10), | 1589 | ptr->pages << (PAGE_SHIFT - 10), |
1433 | ptr->inuse_pages << (PAGE_SHIFT - 10), | 1590 | ptr->inuse_pages << (PAGE_SHIFT - 10), |
1434 | ptr->prio); | 1591 | ptr->prio); |
1435 | return 0; | 1592 | return 0; |
1436 | } | 1593 | } |
1437 | 1594 | ||
@@ -1487,12 +1644,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1487 | int i, prev; | 1644 | int i, prev; |
1488 | int error; | 1645 | int error; |
1489 | union swap_header *swap_header = NULL; | 1646 | union swap_header *swap_header = NULL; |
1490 | int swap_header_version; | ||
1491 | unsigned int nr_good_pages = 0; | 1647 | unsigned int nr_good_pages = 0; |
1492 | int nr_extents = 0; | 1648 | int nr_extents = 0; |
1493 | sector_t span; | 1649 | sector_t span; |
1494 | unsigned long maxpages = 1; | 1650 | unsigned long maxpages = 1; |
1495 | int swapfilesize; | 1651 | unsigned long swapfilepages; |
1496 | unsigned short *swap_map = NULL; | 1652 | unsigned short *swap_map = NULL; |
1497 | struct page *page = NULL; | 1653 | struct page *page = NULL; |
1498 | struct inode *inode = NULL; | 1654 | struct inode *inode = NULL; |
@@ -1570,7 +1726,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1570 | goto bad_swap; | 1726 | goto bad_swap; |
1571 | } | 1727 | } |
1572 | 1728 | ||
1573 | swapfilesize = i_size_read(inode) >> PAGE_SHIFT; | 1729 | swapfilepages = i_size_read(inode) >> PAGE_SHIFT; |
1574 | 1730 | ||
1575 | /* | 1731 | /* |
1576 | * Read the swap header. | 1732 | * Read the swap header. |
@@ -1584,102 +1740,92 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1584 | error = PTR_ERR(page); | 1740 | error = PTR_ERR(page); |
1585 | goto bad_swap; | 1741 | goto bad_swap; |
1586 | } | 1742 | } |
1587 | kmap(page); | 1743 | swap_header = kmap(page); |
1588 | swap_header = page_address(page); | ||
1589 | 1744 | ||
1590 | if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) | 1745 | if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { |
1591 | swap_header_version = 1; | ||
1592 | else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) | ||
1593 | swap_header_version = 2; | ||
1594 | else { | ||
1595 | printk(KERN_ERR "Unable to find swap-space signature\n"); | 1746 | printk(KERN_ERR "Unable to find swap-space signature\n"); |
1596 | error = -EINVAL; | 1747 | error = -EINVAL; |
1597 | goto bad_swap; | 1748 | goto bad_swap; |
1598 | } | 1749 | } |
1599 | 1750 | ||
1600 | switch (swap_header_version) { | 1751 | /* swap partition endianess hack... */ |
1601 | case 1: | 1752 | if (swab32(swap_header->info.version) == 1) { |
1602 | printk(KERN_ERR "version 0 swap is no longer supported. " | 1753 | swab32s(&swap_header->info.version); |
1603 | "Use mkswap -v1 %s\n", name); | 1754 | swab32s(&swap_header->info.last_page); |
1755 | swab32s(&swap_header->info.nr_badpages); | ||
1756 | for (i = 0; i < swap_header->info.nr_badpages; i++) | ||
1757 | swab32s(&swap_header->info.badpages[i]); | ||
1758 | } | ||
1759 | /* Check the swap header's sub-version */ | ||
1760 | if (swap_header->info.version != 1) { | ||
1761 | printk(KERN_WARNING | ||
1762 | "Unable to handle swap header version %d\n", | ||
1763 | swap_header->info.version); | ||
1604 | error = -EINVAL; | 1764 | error = -EINVAL; |
1605 | goto bad_swap; | 1765 | goto bad_swap; |
1606 | case 2: | 1766 | } |
1607 | /* swap partition endianess hack... */ | ||
1608 | if (swab32(swap_header->info.version) == 1) { | ||
1609 | swab32s(&swap_header->info.version); | ||
1610 | swab32s(&swap_header->info.last_page); | ||
1611 | swab32s(&swap_header->info.nr_badpages); | ||
1612 | for (i = 0; i < swap_header->info.nr_badpages; i++) | ||
1613 | swab32s(&swap_header->info.badpages[i]); | ||
1614 | } | ||
1615 | /* Check the swap header's sub-version and the size of | ||
1616 | the swap file and bad block lists */ | ||
1617 | if (swap_header->info.version != 1) { | ||
1618 | printk(KERN_WARNING | ||
1619 | "Unable to handle swap header version %d\n", | ||
1620 | swap_header->info.version); | ||
1621 | error = -EINVAL; | ||
1622 | goto bad_swap; | ||
1623 | } | ||
1624 | 1767 | ||
1625 | p->lowest_bit = 1; | 1768 | p->lowest_bit = 1; |
1626 | p->cluster_next = 1; | 1769 | p->cluster_next = 1; |
1627 | 1770 | ||
1628 | /* | 1771 | /* |
1629 | * Find out how many pages are allowed for a single swap | 1772 | * Find out how many pages are allowed for a single swap |
1630 | * device. There are two limiting factors: 1) the number of | 1773 | * device. There are two limiting factors: 1) the number of |
1631 | * bits for the swap offset in the swp_entry_t type and | 1774 | * bits for the swap offset in the swp_entry_t type and |
1632 | * 2) the number of bits in a swap pte as defined by | 1775 | * 2) the number of bits in a swap pte as defined by |
1633 | * the different architectures. In order to find the | 1776 | * the different architectures. In order to find the |
1634 | * largest possible bit mask a swap entry with swap type 0 | 1777 | * largest possible bit mask a swap entry with swap type 0 |
1635 | * and swap offset ~0UL is created, encoded to a swap pte, | 1778 | * and swap offset ~0UL is created, encoded to a swap pte, |
1636 | * decoded to a swp_entry_t again and finally the swap | 1779 | * decoded to a swp_entry_t again and finally the swap |
1637 | * offset is extracted. This will mask all the bits from | 1780 | * offset is extracted. This will mask all the bits from |
1638 | * the initial ~0UL mask that can't be encoded in either | 1781 | * the initial ~0UL mask that can't be encoded in either |
1639 | * the swp_entry_t or the architecture definition of a | 1782 | * the swp_entry_t or the architecture definition of a |
1640 | * swap pte. | 1783 | * swap pte. |
1641 | */ | 1784 | */ |
1642 | maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; | 1785 | maxpages = swp_offset(pte_to_swp_entry( |
1643 | if (maxpages > swap_header->info.last_page) | 1786 | swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; |
1644 | maxpages = swap_header->info.last_page; | 1787 | if (maxpages > swap_header->info.last_page) |
1645 | p->highest_bit = maxpages - 1; | 1788 | maxpages = swap_header->info.last_page; |
1789 | p->highest_bit = maxpages - 1; | ||
1646 | 1790 | ||
1647 | error = -EINVAL; | 1791 | error = -EINVAL; |
1648 | if (!maxpages) | 1792 | if (!maxpages) |
1649 | goto bad_swap; | 1793 | goto bad_swap; |
1650 | if (swapfilesize && maxpages > swapfilesize) { | 1794 | if (swapfilepages && maxpages > swapfilepages) { |
1651 | printk(KERN_WARNING | 1795 | printk(KERN_WARNING |
1652 | "Swap area shorter than signature indicates\n"); | 1796 | "Swap area shorter than signature indicates\n"); |
1653 | goto bad_swap; | 1797 | goto bad_swap; |
1654 | } | 1798 | } |
1655 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) | 1799 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) |
1656 | goto bad_swap; | 1800 | goto bad_swap; |
1657 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) | 1801 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) |
1658 | goto bad_swap; | 1802 | goto bad_swap; |
1659 | 1803 | ||
1660 | /* OK, set up the swap map and apply the bad block list */ | 1804 | /* OK, set up the swap map and apply the bad block list */ |
1661 | swap_map = vmalloc(maxpages * sizeof(short)); | 1805 | swap_map = vmalloc(maxpages * sizeof(short)); |
1662 | if (!swap_map) { | 1806 | if (!swap_map) { |
1663 | error = -ENOMEM; | 1807 | error = -ENOMEM; |
1664 | goto bad_swap; | 1808 | goto bad_swap; |
1665 | } | 1809 | } |
1666 | 1810 | ||
1667 | error = 0; | 1811 | memset(swap_map, 0, maxpages * sizeof(short)); |
1668 | memset(swap_map, 0, maxpages * sizeof(short)); | 1812 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
1669 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1813 | int page_nr = swap_header->info.badpages[i]; |
1670 | int page_nr = swap_header->info.badpages[i]; | 1814 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { |
1671 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) | 1815 | error = -EINVAL; |
1672 | error = -EINVAL; | ||
1673 | else | ||
1674 | swap_map[page_nr] = SWAP_MAP_BAD; | ||
1675 | } | ||
1676 | nr_good_pages = swap_header->info.last_page - | ||
1677 | swap_header->info.nr_badpages - | ||
1678 | 1 /* header page */; | ||
1679 | if (error) | ||
1680 | goto bad_swap; | 1816 | goto bad_swap; |
1817 | } | ||
1818 | swap_map[page_nr] = SWAP_MAP_BAD; | ||
1681 | } | 1819 | } |
1682 | 1820 | ||
1821 | error = swap_cgroup_swapon(type, maxpages); | ||
1822 | if (error) | ||
1823 | goto bad_swap; | ||
1824 | |||
1825 | nr_good_pages = swap_header->info.last_page - | ||
1826 | swap_header->info.nr_badpages - | ||
1827 | 1 /* header page */; | ||
1828 | |||
1683 | if (nr_good_pages) { | 1829 | if (nr_good_pages) { |
1684 | swap_map[0] = SWAP_MAP_BAD; | 1830 | swap_map[0] = SWAP_MAP_BAD; |
1685 | p->max = maxpages; | 1831 | p->max = maxpages; |
@@ -1697,6 +1843,13 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1697 | goto bad_swap; | 1843 | goto bad_swap; |
1698 | } | 1844 | } |
1699 | 1845 | ||
1846 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | ||
1847 | p->flags |= SWP_SOLIDSTATE; | ||
1848 | p->cluster_next = 1 + (random32() % p->highest_bit); | ||
1849 | } | ||
1850 | if (discard_swap(p) == 0) | ||
1851 | p->flags |= SWP_DISCARDABLE; | ||
1852 | |||
1700 | mutex_lock(&swapon_mutex); | 1853 | mutex_lock(&swapon_mutex); |
1701 | spin_lock(&swap_lock); | 1854 | spin_lock(&swap_lock); |
1702 | if (swap_flags & SWAP_FLAG_PREFER) | 1855 | if (swap_flags & SWAP_FLAG_PREFER) |
@@ -1705,14 +1858,16 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1705 | else | 1858 | else |
1706 | p->prio = --least_priority; | 1859 | p->prio = --least_priority; |
1707 | p->swap_map = swap_map; | 1860 | p->swap_map = swap_map; |
1708 | p->flags = SWP_ACTIVE; | 1861 | p->flags |= SWP_WRITEOK; |
1709 | nr_swap_pages += nr_good_pages; | 1862 | nr_swap_pages += nr_good_pages; |
1710 | total_swap_pages += nr_good_pages; | 1863 | total_swap_pages += nr_good_pages; |
1711 | 1864 | ||
1712 | printk(KERN_INFO "Adding %uk swap on %s. " | 1865 | printk(KERN_INFO "Adding %uk swap on %s. " |
1713 | "Priority:%d extents:%d across:%lluk\n", | 1866 | "Priority:%d extents:%d across:%lluk %s%s\n", |
1714 | nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, | 1867 | nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, |
1715 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); | 1868 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
1869 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", | ||
1870 | (p->flags & SWP_DISCARDABLE) ? "D" : ""); | ||
1716 | 1871 | ||
1717 | /* insert swap space into swap_list: */ | 1872 | /* insert swap space into swap_list: */ |
1718 | prev = -1; | 1873 | prev = -1; |
@@ -1738,6 +1893,7 @@ bad_swap: | |||
1738 | bd_release(bdev); | 1893 | bd_release(bdev); |
1739 | } | 1894 | } |
1740 | destroy_swap_extents(p); | 1895 | destroy_swap_extents(p); |
1896 | swap_cgroup_swapoff(type); | ||
1741 | bad_swap_2: | 1897 | bad_swap_2: |
1742 | spin_lock(&swap_lock); | 1898 | spin_lock(&swap_lock); |
1743 | p->swap_file = NULL; | 1899 | p->swap_file = NULL; |
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c deleted file mode 100644 index 3e67d575ee6e..000000000000 --- a/mm/tiny-shmem.c +++ /dev/null | |||
@@ -1,134 +0,0 @@ | |||
1 | /* | ||
2 | * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code | ||
3 | * | ||
4 | * Matt Mackall <mpm@selenic.com> January, 2004 | ||
5 | * derived from mm/shmem.c and fs/ramfs/inode.c | ||
6 | * | ||
7 | * This is intended for small system where the benefits of the full | ||
8 | * shmem code (swap-backed and resource-limited) are outweighed by | ||
9 | * their complexity. On systems without swap this code should be | ||
10 | * effectively equivalent, but much lighter weight. | ||
11 | */ | ||
12 | |||
13 | #include <linux/fs.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/vfs.h> | ||
16 | #include <linux/mount.h> | ||
17 | #include <linux/file.h> | ||
18 | #include <linux/mm.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/swap.h> | ||
21 | #include <linux/ramfs.h> | ||
22 | |||
23 | static struct file_system_type tmpfs_fs_type = { | ||
24 | .name = "tmpfs", | ||
25 | .get_sb = ramfs_get_sb, | ||
26 | .kill_sb = kill_litter_super, | ||
27 | }; | ||
28 | |||
29 | static struct vfsmount *shm_mnt; | ||
30 | |||
31 | static int __init init_tmpfs(void) | ||
32 | { | ||
33 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); | ||
34 | |||
35 | shm_mnt = kern_mount(&tmpfs_fs_type); | ||
36 | BUG_ON(IS_ERR(shm_mnt)); | ||
37 | |||
38 | return 0; | ||
39 | } | ||
40 | module_init(init_tmpfs) | ||
41 | |||
42 | /** | ||
43 | * shmem_file_setup - get an unlinked file living in tmpfs | ||
44 | * @name: name for dentry (to be seen in /proc/<pid>/maps) | ||
45 | * @size: size to be set for the file | ||
46 | * @flags: vm_flags | ||
47 | */ | ||
48 | struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | ||
49 | { | ||
50 | int error; | ||
51 | struct file *file; | ||
52 | struct inode *inode; | ||
53 | struct dentry *dentry, *root; | ||
54 | struct qstr this; | ||
55 | |||
56 | if (IS_ERR(shm_mnt)) | ||
57 | return (void *)shm_mnt; | ||
58 | |||
59 | error = -ENOMEM; | ||
60 | this.name = name; | ||
61 | this.len = strlen(name); | ||
62 | this.hash = 0; /* will go */ | ||
63 | root = shm_mnt->mnt_root; | ||
64 | dentry = d_alloc(root, &this); | ||
65 | if (!dentry) | ||
66 | goto put_memory; | ||
67 | |||
68 | error = -ENFILE; | ||
69 | file = get_empty_filp(); | ||
70 | if (!file) | ||
71 | goto put_dentry; | ||
72 | |||
73 | error = -ENOSPC; | ||
74 | inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); | ||
75 | if (!inode) | ||
76 | goto close_file; | ||
77 | |||
78 | d_instantiate(dentry, inode); | ||
79 | inode->i_size = size; | ||
80 | inode->i_nlink = 0; /* It is unlinked */ | ||
81 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
82 | &ramfs_file_operations); | ||
83 | |||
84 | #ifndef CONFIG_MMU | ||
85 | error = ramfs_nommu_expand_for_mapping(inode, size); | ||
86 | if (error) | ||
87 | goto close_file; | ||
88 | #endif | ||
89 | return file; | ||
90 | |||
91 | close_file: | ||
92 | put_filp(file); | ||
93 | put_dentry: | ||
94 | dput(dentry); | ||
95 | put_memory: | ||
96 | return ERR_PTR(error); | ||
97 | } | ||
98 | EXPORT_SYMBOL_GPL(shmem_file_setup); | ||
99 | |||
100 | /** | ||
101 | * shmem_zero_setup - setup a shared anonymous mapping | ||
102 | * @vma: the vma to be mmapped is prepared by do_mmap_pgoff | ||
103 | */ | ||
104 | int shmem_zero_setup(struct vm_area_struct *vma) | ||
105 | { | ||
106 | struct file *file; | ||
107 | loff_t size = vma->vm_end - vma->vm_start; | ||
108 | |||
109 | file = shmem_file_setup("dev/zero", size, vma->vm_flags); | ||
110 | if (IS_ERR(file)) | ||
111 | return PTR_ERR(file); | ||
112 | |||
113 | if (vma->vm_file) | ||
114 | fput(vma->vm_file); | ||
115 | vma->vm_file = file; | ||
116 | vma->vm_ops = &generic_file_vm_ops; | ||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | int shmem_unuse(swp_entry_t entry, struct page *page) | ||
121 | { | ||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | #ifndef CONFIG_MMU | ||
126 | unsigned long shmem_get_unmapped_area(struct file *file, | ||
127 | unsigned long addr, | ||
128 | unsigned long len, | ||
129 | unsigned long pgoff, | ||
130 | unsigned long flags) | ||
131 | { | ||
132 | return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags); | ||
133 | } | ||
134 | #endif | ||
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7465f22fec0c..c5db9a7264d9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/highmem.h> | 14 | #include <linux/highmem.h> |
15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
16 | #include <linux/spinlock.h> | 16 | #include <linux/spinlock.h> |
17 | #include <linux/mutex.h> | ||
17 | #include <linux/interrupt.h> | 18 | #include <linux/interrupt.h> |
18 | #include <linux/proc_fs.h> | 19 | #include <linux/proc_fs.h> |
19 | #include <linux/seq_file.h> | 20 | #include <linux/seq_file.h> |
@@ -381,8 +382,9 @@ found: | |||
381 | goto retry; | 382 | goto retry; |
382 | } | 383 | } |
383 | if (printk_ratelimit()) | 384 | if (printk_ratelimit()) |
384 | printk(KERN_WARNING "vmap allocation failed: " | 385 | printk(KERN_WARNING |
385 | "use vmalloc=<size> to increase size.\n"); | 386 | "vmap allocation for size %lu failed: " |
387 | "use vmalloc=<size> to increase size.\n", size); | ||
386 | return ERR_PTR(-EBUSY); | 388 | return ERR_PTR(-EBUSY); |
387 | } | 389 | } |
388 | 390 | ||
@@ -432,6 +434,27 @@ static void unmap_vmap_area(struct vmap_area *va) | |||
432 | vunmap_page_range(va->va_start, va->va_end); | 434 | vunmap_page_range(va->va_start, va->va_end); |
433 | } | 435 | } |
434 | 436 | ||
437 | static void vmap_debug_free_range(unsigned long start, unsigned long end) | ||
438 | { | ||
439 | /* | ||
440 | * Unmap page tables and force a TLB flush immediately if | ||
441 | * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free | ||
442 | * bugs similarly to those in linear kernel virtual address | ||
443 | * space after a page has been freed. | ||
444 | * | ||
445 | * All the lazy freeing logic is still retained, in order to | ||
446 | * minimise intrusiveness of this debugging feature. | ||
447 | * | ||
448 | * This is going to be *slow* (linear kernel virtual address | ||
449 | * debugging doesn't do a broadcast TLB flush so it is a lot | ||
450 | * faster). | ||
451 | */ | ||
452 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
453 | vunmap_page_range(start, end); | ||
454 | flush_tlb_kernel_range(start, end); | ||
455 | #endif | ||
456 | } | ||
457 | |||
435 | /* | 458 | /* |
436 | * lazy_max_pages is the maximum amount of virtual address space we gather up | 459 | * lazy_max_pages is the maximum amount of virtual address space we gather up |
437 | * before attempting to purge with a TLB flush. | 460 | * before attempting to purge with a TLB flush. |
@@ -472,7 +495,7 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); | |||
472 | static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | 495 | static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, |
473 | int sync, int force_flush) | 496 | int sync, int force_flush) |
474 | { | 497 | { |
475 | static DEFINE_SPINLOCK(purge_lock); | 498 | static DEFINE_MUTEX(purge_lock); |
476 | LIST_HEAD(valist); | 499 | LIST_HEAD(valist); |
477 | struct vmap_area *va; | 500 | struct vmap_area *va; |
478 | int nr = 0; | 501 | int nr = 0; |
@@ -483,10 +506,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | |||
483 | * the case that isn't actually used at the moment anyway. | 506 | * the case that isn't actually used at the moment anyway. |
484 | */ | 507 | */ |
485 | if (!sync && !force_flush) { | 508 | if (!sync && !force_flush) { |
486 | if (!spin_trylock(&purge_lock)) | 509 | if (!mutex_trylock(&purge_lock)) |
487 | return; | 510 | return; |
488 | } else | 511 | } else |
489 | spin_lock(&purge_lock); | 512 | mutex_lock(&purge_lock); |
490 | 513 | ||
491 | rcu_read_lock(); | 514 | rcu_read_lock(); |
492 | list_for_each_entry_rcu(va, &vmap_area_list, list) { | 515 | list_for_each_entry_rcu(va, &vmap_area_list, list) { |
@@ -518,7 +541,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | |||
518 | __free_vmap_area(va); | 541 | __free_vmap_area(va); |
519 | spin_unlock(&vmap_area_lock); | 542 | spin_unlock(&vmap_area_lock); |
520 | } | 543 | } |
521 | spin_unlock(&purge_lock); | 544 | mutex_unlock(&purge_lock); |
522 | } | 545 | } |
523 | 546 | ||
524 | /* | 547 | /* |
@@ -912,6 +935,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) | |||
912 | BUG_ON(addr & (PAGE_SIZE-1)); | 935 | BUG_ON(addr & (PAGE_SIZE-1)); |
913 | 936 | ||
914 | debug_check_no_locks_freed(mem, size); | 937 | debug_check_no_locks_freed(mem, size); |
938 | vmap_debug_free_range(addr, addr+size); | ||
915 | 939 | ||
916 | if (likely(count <= VMAP_MAX_ALLOC)) | 940 | if (likely(count <= VMAP_MAX_ALLOC)) |
917 | vb_free(mem, size); | 941 | vb_free(mem, size); |
@@ -1128,6 +1152,8 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
1128 | if (va && va->flags & VM_VM_AREA) { | 1152 | if (va && va->flags & VM_VM_AREA) { |
1129 | struct vm_struct *vm = va->private; | 1153 | struct vm_struct *vm = va->private; |
1130 | struct vm_struct *tmp, **p; | 1154 | struct vm_struct *tmp, **p; |
1155 | |||
1156 | vmap_debug_free_range(va->va_start, va->va_end); | ||
1131 | free_unmap_vmap_area(va); | 1157 | free_unmap_vmap_area(va); |
1132 | vm->size -= PAGE_SIZE; | 1158 | vm->size -= PAGE_SIZE; |
1133 | 1159 | ||
@@ -1375,7 +1401,8 @@ void *vmalloc_user(unsigned long size) | |||
1375 | struct vm_struct *area; | 1401 | struct vm_struct *area; |
1376 | void *ret; | 1402 | void *ret; |
1377 | 1403 | ||
1378 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); | 1404 | ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, |
1405 | PAGE_KERNEL, -1, __builtin_return_address(0)); | ||
1379 | if (ret) { | 1406 | if (ret) { |
1380 | area = find_vm_area(ret); | 1407 | area = find_vm_area(ret); |
1381 | area->flags |= VM_USERMAP; | 1408 | area->flags |= VM_USERMAP; |
@@ -1420,7 +1447,8 @@ EXPORT_SYMBOL(vmalloc_node); | |||
1420 | 1447 | ||
1421 | void *vmalloc_exec(unsigned long size) | 1448 | void *vmalloc_exec(unsigned long size) |
1422 | { | 1449 | { |
1423 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); | 1450 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, |
1451 | -1, __builtin_return_address(0)); | ||
1424 | } | 1452 | } |
1425 | 1453 | ||
1426 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) | 1454 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) |
@@ -1440,7 +1468,8 @@ void *vmalloc_exec(unsigned long size) | |||
1440 | */ | 1468 | */ |
1441 | void *vmalloc_32(unsigned long size) | 1469 | void *vmalloc_32(unsigned long size) |
1442 | { | 1470 | { |
1443 | return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL); | 1471 | return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, |
1472 | -1, __builtin_return_address(0)); | ||
1444 | } | 1473 | } |
1445 | EXPORT_SYMBOL(vmalloc_32); | 1474 | EXPORT_SYMBOL(vmalloc_32); |
1446 | 1475 | ||
@@ -1456,7 +1485,8 @@ void *vmalloc_32_user(unsigned long size) | |||
1456 | struct vm_struct *area; | 1485 | struct vm_struct *area; |
1457 | void *ret; | 1486 | void *ret; |
1458 | 1487 | ||
1459 | ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); | 1488 | ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, |
1489 | -1, __builtin_return_address(0)); | ||
1460 | if (ret) { | 1490 | if (ret) { |
1461 | area = find_vm_area(ret); | 1491 | area = find_vm_area(ret); |
1462 | area->flags |= VM_USERMAP; | 1492 | area->flags |= VM_USERMAP; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index d196f46c8808..9a27c44aa327 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -52,6 +52,9 @@ struct scan_control { | |||
52 | /* Incremented by the number of inactive pages that were scanned */ | 52 | /* Incremented by the number of inactive pages that were scanned */ |
53 | unsigned long nr_scanned; | 53 | unsigned long nr_scanned; |
54 | 54 | ||
55 | /* Number of pages freed so far during a call to shrink_zones() */ | ||
56 | unsigned long nr_reclaimed; | ||
57 | |||
55 | /* This context's GFP mask */ | 58 | /* This context's GFP mask */ |
56 | gfp_t gfp_mask; | 59 | gfp_t gfp_mask; |
57 | 60 | ||
@@ -122,11 +125,30 @@ static LIST_HEAD(shrinker_list); | |||
122 | static DECLARE_RWSEM(shrinker_rwsem); | 125 | static DECLARE_RWSEM(shrinker_rwsem); |
123 | 126 | ||
124 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 127 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
125 | #define scan_global_lru(sc) (!(sc)->mem_cgroup) | 128 | #define scanning_global_lru(sc) (!(sc)->mem_cgroup) |
126 | #else | 129 | #else |
127 | #define scan_global_lru(sc) (1) | 130 | #define scanning_global_lru(sc) (1) |
128 | #endif | 131 | #endif |
129 | 132 | ||
133 | static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, | ||
134 | struct scan_control *sc) | ||
135 | { | ||
136 | if (!scanning_global_lru(sc)) | ||
137 | return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); | ||
138 | |||
139 | return &zone->reclaim_stat; | ||
140 | } | ||
141 | |||
142 | static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc, | ||
143 | enum lru_list lru) | ||
144 | { | ||
145 | if (!scanning_global_lru(sc)) | ||
146 | return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); | ||
147 | |||
148 | return zone_page_state(zone, NR_LRU_BASE + lru); | ||
149 | } | ||
150 | |||
151 | |||
130 | /* | 152 | /* |
131 | * Add a shrinker callback to be called from the vm | 153 | * Add a shrinker callback to be called from the vm |
132 | */ | 154 | */ |
@@ -509,7 +531,6 @@ redo: | |||
509 | lru = LRU_UNEVICTABLE; | 531 | lru = LRU_UNEVICTABLE; |
510 | add_page_to_unevictable_list(page); | 532 | add_page_to_unevictable_list(page); |
511 | } | 533 | } |
512 | mem_cgroup_move_lists(page, lru); | ||
513 | 534 | ||
514 | /* | 535 | /* |
515 | * page's status can change while we move it among lru. If an evictable | 536 | * page's status can change while we move it among lru. If an evictable |
@@ -544,7 +565,6 @@ void putback_lru_page(struct page *page) | |||
544 | 565 | ||
545 | lru = !!TestClearPageActive(page) + page_is_file_cache(page); | 566 | lru = !!TestClearPageActive(page) + page_is_file_cache(page); |
546 | lru_cache_add_lru(page, lru); | 567 | lru_cache_add_lru(page, lru); |
547 | mem_cgroup_move_lists(page, lru); | ||
548 | put_page(page); | 568 | put_page(page); |
549 | } | 569 | } |
550 | #endif /* CONFIG_UNEVICTABLE_LRU */ | 570 | #endif /* CONFIG_UNEVICTABLE_LRU */ |
@@ -617,7 +637,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
617 | referenced && page_mapping_inuse(page)) | 637 | referenced && page_mapping_inuse(page)) |
618 | goto activate_locked; | 638 | goto activate_locked; |
619 | 639 | ||
620 | #ifdef CONFIG_SWAP | ||
621 | /* | 640 | /* |
622 | * Anonymous process memory has backing store? | 641 | * Anonymous process memory has backing store? |
623 | * Try to allocate it some swap space here. | 642 | * Try to allocate it some swap space here. |
@@ -625,20 +644,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
625 | if (PageAnon(page) && !PageSwapCache(page)) { | 644 | if (PageAnon(page) && !PageSwapCache(page)) { |
626 | if (!(sc->gfp_mask & __GFP_IO)) | 645 | if (!(sc->gfp_mask & __GFP_IO)) |
627 | goto keep_locked; | 646 | goto keep_locked; |
628 | switch (try_to_munlock(page)) { | 647 | if (!add_to_swap(page)) |
629 | case SWAP_FAIL: /* shouldn't happen */ | ||
630 | case SWAP_AGAIN: | ||
631 | goto keep_locked; | ||
632 | case SWAP_MLOCK: | ||
633 | goto cull_mlocked; | ||
634 | case SWAP_SUCCESS: | ||
635 | ; /* fall thru'; add to swap cache */ | ||
636 | } | ||
637 | if (!add_to_swap(page, GFP_ATOMIC)) | ||
638 | goto activate_locked; | 648 | goto activate_locked; |
639 | may_enter_fs = 1; | 649 | may_enter_fs = 1; |
640 | } | 650 | } |
641 | #endif /* CONFIG_SWAP */ | ||
642 | 651 | ||
643 | mapping = page_mapping(page); | 652 | mapping = page_mapping(page); |
644 | 653 | ||
@@ -752,6 +761,8 @@ free_it: | |||
752 | continue; | 761 | continue; |
753 | 762 | ||
754 | cull_mlocked: | 763 | cull_mlocked: |
764 | if (PageSwapCache(page)) | ||
765 | try_to_free_swap(page); | ||
755 | unlock_page(page); | 766 | unlock_page(page); |
756 | putback_lru_page(page); | 767 | putback_lru_page(page); |
757 | continue; | 768 | continue; |
@@ -759,7 +770,7 @@ cull_mlocked: | |||
759 | activate_locked: | 770 | activate_locked: |
760 | /* Not a candidate for swapping, so reclaim swap space. */ | 771 | /* Not a candidate for swapping, so reclaim swap space. */ |
761 | if (PageSwapCache(page) && vm_swap_full()) | 772 | if (PageSwapCache(page) && vm_swap_full()) |
762 | remove_exclusive_swap_page_ref(page); | 773 | try_to_free_swap(page); |
763 | VM_BUG_ON(PageActive(page)); | 774 | VM_BUG_ON(PageActive(page)); |
764 | SetPageActive(page); | 775 | SetPageActive(page); |
765 | pgactivate++; | 776 | pgactivate++; |
@@ -819,6 +830,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
819 | return ret; | 830 | return ret; |
820 | 831 | ||
821 | ret = -EBUSY; | 832 | ret = -EBUSY; |
833 | |||
822 | if (likely(get_page_unless_zero(page))) { | 834 | if (likely(get_page_unless_zero(page))) { |
823 | /* | 835 | /* |
824 | * Be careful not to clear PageLRU until after we're | 836 | * Be careful not to clear PageLRU until after we're |
@@ -827,6 +839,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
827 | */ | 839 | */ |
828 | ClearPageLRU(page); | 840 | ClearPageLRU(page); |
829 | ret = 0; | 841 | ret = 0; |
842 | mem_cgroup_del_lru(page); | ||
830 | } | 843 | } |
831 | 844 | ||
832 | return ret; | 845 | return ret; |
@@ -1035,6 +1048,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1035 | struct pagevec pvec; | 1048 | struct pagevec pvec; |
1036 | unsigned long nr_scanned = 0; | 1049 | unsigned long nr_scanned = 0; |
1037 | unsigned long nr_reclaimed = 0; | 1050 | unsigned long nr_reclaimed = 0; |
1051 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | ||
1038 | 1052 | ||
1039 | pagevec_init(&pvec, 1); | 1053 | pagevec_init(&pvec, 1); |
1040 | 1054 | ||
@@ -1076,13 +1090,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1076 | __mod_zone_page_state(zone, NR_INACTIVE_ANON, | 1090 | __mod_zone_page_state(zone, NR_INACTIVE_ANON, |
1077 | -count[LRU_INACTIVE_ANON]); | 1091 | -count[LRU_INACTIVE_ANON]); |
1078 | 1092 | ||
1079 | if (scan_global_lru(sc)) { | 1093 | if (scanning_global_lru(sc)) |
1080 | zone->pages_scanned += nr_scan; | 1094 | zone->pages_scanned += nr_scan; |
1081 | zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; | 1095 | |
1082 | zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; | 1096 | reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; |
1083 | zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; | 1097 | reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; |
1084 | zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; | 1098 | reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE]; |
1085 | } | 1099 | reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE]; |
1100 | |||
1086 | spin_unlock_irq(&zone->lru_lock); | 1101 | spin_unlock_irq(&zone->lru_lock); |
1087 | 1102 | ||
1088 | nr_scanned += nr_scan; | 1103 | nr_scanned += nr_scan; |
@@ -1114,7 +1129,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1114 | if (current_is_kswapd()) { | 1129 | if (current_is_kswapd()) { |
1115 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); | 1130 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); |
1116 | __count_vm_events(KSWAPD_STEAL, nr_freed); | 1131 | __count_vm_events(KSWAPD_STEAL, nr_freed); |
1117 | } else if (scan_global_lru(sc)) | 1132 | } else if (scanning_global_lru(sc)) |
1118 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); | 1133 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); |
1119 | 1134 | ||
1120 | __count_zone_vm_events(PGSTEAL, zone, nr_freed); | 1135 | __count_zone_vm_events(PGSTEAL, zone, nr_freed); |
@@ -1140,10 +1155,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1140 | SetPageLRU(page); | 1155 | SetPageLRU(page); |
1141 | lru = page_lru(page); | 1156 | lru = page_lru(page); |
1142 | add_page_to_lru_list(zone, page, lru); | 1157 | add_page_to_lru_list(zone, page, lru); |
1143 | mem_cgroup_move_lists(page, lru); | 1158 | if (PageActive(page)) { |
1144 | if (PageActive(page) && scan_global_lru(sc)) { | ||
1145 | int file = !!page_is_file_cache(page); | 1159 | int file = !!page_is_file_cache(page); |
1146 | zone->recent_rotated[file]++; | 1160 | reclaim_stat->recent_rotated[file]++; |
1147 | } | 1161 | } |
1148 | if (!pagevec_add(&pvec, page)) { | 1162 | if (!pagevec_add(&pvec, page)) { |
1149 | spin_unlock_irq(&zone->lru_lock); | 1163 | spin_unlock_irq(&zone->lru_lock); |
@@ -1173,11 +1187,6 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) | |||
1173 | zone->prev_priority = priority; | 1187 | zone->prev_priority = priority; |
1174 | } | 1188 | } |
1175 | 1189 | ||
1176 | static inline int zone_is_near_oom(struct zone *zone) | ||
1177 | { | ||
1178 | return zone->pages_scanned >= (zone_lru_pages(zone) * 3); | ||
1179 | } | ||
1180 | |||
1181 | /* | 1190 | /* |
1182 | * This moves pages from the active list to the inactive list. | 1191 | * This moves pages from the active list to the inactive list. |
1183 | * | 1192 | * |
@@ -1208,6 +1217,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1208 | struct page *page; | 1217 | struct page *page; |
1209 | struct pagevec pvec; | 1218 | struct pagevec pvec; |
1210 | enum lru_list lru; | 1219 | enum lru_list lru; |
1220 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | ||
1211 | 1221 | ||
1212 | lru_add_drain(); | 1222 | lru_add_drain(); |
1213 | spin_lock_irq(&zone->lru_lock); | 1223 | spin_lock_irq(&zone->lru_lock); |
@@ -1218,10 +1228,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1218 | * zone->pages_scanned is used for detect zone's oom | 1228 | * zone->pages_scanned is used for detect zone's oom |
1219 | * mem_cgroup remembers nr_scan by itself. | 1229 | * mem_cgroup remembers nr_scan by itself. |
1220 | */ | 1230 | */ |
1221 | if (scan_global_lru(sc)) { | 1231 | if (scanning_global_lru(sc)) { |
1222 | zone->pages_scanned += pgscanned; | 1232 | zone->pages_scanned += pgscanned; |
1223 | zone->recent_scanned[!!file] += pgmoved; | ||
1224 | } | 1233 | } |
1234 | reclaim_stat->recent_scanned[!!file] += pgmoved; | ||
1225 | 1235 | ||
1226 | if (file) | 1236 | if (file) |
1227 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); | 1237 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); |
@@ -1248,6 +1258,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1248 | list_add(&page->lru, &l_inactive); | 1258 | list_add(&page->lru, &l_inactive); |
1249 | } | 1259 | } |
1250 | 1260 | ||
1261 | /* | ||
1262 | * Move the pages to the [file or anon] inactive list. | ||
1263 | */ | ||
1264 | pagevec_init(&pvec, 1); | ||
1265 | pgmoved = 0; | ||
1266 | lru = LRU_BASE + file * LRU_FILE; | ||
1267 | |||
1251 | spin_lock_irq(&zone->lru_lock); | 1268 | spin_lock_irq(&zone->lru_lock); |
1252 | /* | 1269 | /* |
1253 | * Count referenced pages from currently used mappings as | 1270 | * Count referenced pages from currently used mappings as |
@@ -1255,15 +1272,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1255 | * This helps balance scan pressure between file and anonymous | 1272 | * This helps balance scan pressure between file and anonymous |
1256 | * pages in get_scan_ratio. | 1273 | * pages in get_scan_ratio. |
1257 | */ | 1274 | */ |
1258 | zone->recent_rotated[!!file] += pgmoved; | 1275 | reclaim_stat->recent_rotated[!!file] += pgmoved; |
1259 | 1276 | ||
1260 | /* | ||
1261 | * Move the pages to the [file or anon] inactive list. | ||
1262 | */ | ||
1263 | pagevec_init(&pvec, 1); | ||
1264 | |||
1265 | pgmoved = 0; | ||
1266 | lru = LRU_BASE + file * LRU_FILE; | ||
1267 | while (!list_empty(&l_inactive)) { | 1277 | while (!list_empty(&l_inactive)) { |
1268 | page = lru_to_page(&l_inactive); | 1278 | page = lru_to_page(&l_inactive); |
1269 | prefetchw_prev_lru_page(page, &l_inactive, flags); | 1279 | prefetchw_prev_lru_page(page, &l_inactive, flags); |
@@ -1273,7 +1283,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1273 | ClearPageActive(page); | 1283 | ClearPageActive(page); |
1274 | 1284 | ||
1275 | list_move(&page->lru, &zone->lru[lru].list); | 1285 | list_move(&page->lru, &zone->lru[lru].list); |
1276 | mem_cgroup_move_lists(page, lru); | 1286 | mem_cgroup_add_lru_list(page, lru); |
1277 | pgmoved++; | 1287 | pgmoved++; |
1278 | if (!pagevec_add(&pvec, page)) { | 1288 | if (!pagevec_add(&pvec, page)) { |
1279 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | 1289 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); |
@@ -1302,6 +1312,38 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1302 | pagevec_release(&pvec); | 1312 | pagevec_release(&pvec); |
1303 | } | 1313 | } |
1304 | 1314 | ||
1315 | static int inactive_anon_is_low_global(struct zone *zone) | ||
1316 | { | ||
1317 | unsigned long active, inactive; | ||
1318 | |||
1319 | active = zone_page_state(zone, NR_ACTIVE_ANON); | ||
1320 | inactive = zone_page_state(zone, NR_INACTIVE_ANON); | ||
1321 | |||
1322 | if (inactive * zone->inactive_ratio < active) | ||
1323 | return 1; | ||
1324 | |||
1325 | return 0; | ||
1326 | } | ||
1327 | |||
1328 | /** | ||
1329 | * inactive_anon_is_low - check if anonymous pages need to be deactivated | ||
1330 | * @zone: zone to check | ||
1331 | * @sc: scan control of this context | ||
1332 | * | ||
1333 | * Returns true if the zone does not have enough inactive anon pages, | ||
1334 | * meaning some active anon pages need to be deactivated. | ||
1335 | */ | ||
1336 | static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) | ||
1337 | { | ||
1338 | int low; | ||
1339 | |||
1340 | if (scanning_global_lru(sc)) | ||
1341 | low = inactive_anon_is_low_global(zone); | ||
1342 | else | ||
1343 | low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); | ||
1344 | return low; | ||
1345 | } | ||
1346 | |||
1305 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1347 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
1306 | struct zone *zone, struct scan_control *sc, int priority) | 1348 | struct zone *zone, struct scan_control *sc, int priority) |
1307 | { | 1349 | { |
@@ -1312,8 +1354,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | |||
1312 | return 0; | 1354 | return 0; |
1313 | } | 1355 | } |
1314 | 1356 | ||
1315 | if (lru == LRU_ACTIVE_ANON && | 1357 | if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { |
1316 | (!scan_global_lru(sc) || inactive_anon_is_low(zone))) { | ||
1317 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | 1358 | shrink_active_list(nr_to_scan, zone, sc, priority, file); |
1318 | return 0; | 1359 | return 0; |
1319 | } | 1360 | } |
@@ -1335,12 +1376,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1335 | unsigned long anon, file, free; | 1376 | unsigned long anon, file, free; |
1336 | unsigned long anon_prio, file_prio; | 1377 | unsigned long anon_prio, file_prio; |
1337 | unsigned long ap, fp; | 1378 | unsigned long ap, fp; |
1338 | 1379 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | |
1339 | anon = zone_page_state(zone, NR_ACTIVE_ANON) + | ||
1340 | zone_page_state(zone, NR_INACTIVE_ANON); | ||
1341 | file = zone_page_state(zone, NR_ACTIVE_FILE) + | ||
1342 | zone_page_state(zone, NR_INACTIVE_FILE); | ||
1343 | free = zone_page_state(zone, NR_FREE_PAGES); | ||
1344 | 1380 | ||
1345 | /* If we have no swap space, do not bother scanning anon pages. */ | 1381 | /* If we have no swap space, do not bother scanning anon pages. */ |
1346 | if (nr_swap_pages <= 0) { | 1382 | if (nr_swap_pages <= 0) { |
@@ -1349,11 +1385,20 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1349 | return; | 1385 | return; |
1350 | } | 1386 | } |
1351 | 1387 | ||
1352 | /* If we have very few page cache pages, force-scan anon pages. */ | 1388 | anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + |
1353 | if (unlikely(file + free <= zone->pages_high)) { | 1389 | zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); |
1354 | percent[0] = 100; | 1390 | file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + |
1355 | percent[1] = 0; | 1391 | zone_nr_pages(zone, sc, LRU_INACTIVE_FILE); |
1356 | return; | 1392 | |
1393 | if (scanning_global_lru(sc)) { | ||
1394 | free = zone_page_state(zone, NR_FREE_PAGES); | ||
1395 | /* If we have very few page cache pages, | ||
1396 | force-scan anon pages. */ | ||
1397 | if (unlikely(file + free <= zone->pages_high)) { | ||
1398 | percent[0] = 100; | ||
1399 | percent[1] = 0; | ||
1400 | return; | ||
1401 | } | ||
1357 | } | 1402 | } |
1358 | 1403 | ||
1359 | /* | 1404 | /* |
@@ -1367,17 +1412,17 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1367 | * | 1412 | * |
1368 | * anon in [0], file in [1] | 1413 | * anon in [0], file in [1] |
1369 | */ | 1414 | */ |
1370 | if (unlikely(zone->recent_scanned[0] > anon / 4)) { | 1415 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { |
1371 | spin_lock_irq(&zone->lru_lock); | 1416 | spin_lock_irq(&zone->lru_lock); |
1372 | zone->recent_scanned[0] /= 2; | 1417 | reclaim_stat->recent_scanned[0] /= 2; |
1373 | zone->recent_rotated[0] /= 2; | 1418 | reclaim_stat->recent_rotated[0] /= 2; |
1374 | spin_unlock_irq(&zone->lru_lock); | 1419 | spin_unlock_irq(&zone->lru_lock); |
1375 | } | 1420 | } |
1376 | 1421 | ||
1377 | if (unlikely(zone->recent_scanned[1] > file / 4)) { | 1422 | if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { |
1378 | spin_lock_irq(&zone->lru_lock); | 1423 | spin_lock_irq(&zone->lru_lock); |
1379 | zone->recent_scanned[1] /= 2; | 1424 | reclaim_stat->recent_scanned[1] /= 2; |
1380 | zone->recent_rotated[1] /= 2; | 1425 | reclaim_stat->recent_rotated[1] /= 2; |
1381 | spin_unlock_irq(&zone->lru_lock); | 1426 | spin_unlock_irq(&zone->lru_lock); |
1382 | } | 1427 | } |
1383 | 1428 | ||
@@ -1393,11 +1438,11 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1393 | * proportional to the fraction of recently scanned pages on | 1438 | * proportional to the fraction of recently scanned pages on |
1394 | * each list that were recently referenced and in active use. | 1439 | * each list that were recently referenced and in active use. |
1395 | */ | 1440 | */ |
1396 | ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); | 1441 | ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); |
1397 | ap /= zone->recent_rotated[0] + 1; | 1442 | ap /= reclaim_stat->recent_rotated[0] + 1; |
1398 | 1443 | ||
1399 | fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); | 1444 | fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); |
1400 | fp /= zone->recent_rotated[1] + 1; | 1445 | fp /= reclaim_stat->recent_rotated[1] + 1; |
1401 | 1446 | ||
1402 | /* Normalize to percentages */ | 1447 | /* Normalize to percentages */ |
1403 | percent[0] = 100 * ap / (ap + fp + 1); | 1448 | percent[0] = 100 * ap / (ap + fp + 1); |
@@ -1408,69 +1453,72 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1408 | /* | 1453 | /* |
1409 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1454 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
1410 | */ | 1455 | */ |
1411 | static unsigned long shrink_zone(int priority, struct zone *zone, | 1456 | static void shrink_zone(int priority, struct zone *zone, |
1412 | struct scan_control *sc) | 1457 | struct scan_control *sc) |
1413 | { | 1458 | { |
1414 | unsigned long nr[NR_LRU_LISTS]; | 1459 | unsigned long nr[NR_LRU_LISTS]; |
1415 | unsigned long nr_to_scan; | 1460 | unsigned long nr_to_scan; |
1416 | unsigned long nr_reclaimed = 0; | ||
1417 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ | 1461 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ |
1418 | enum lru_list l; | 1462 | enum lru_list l; |
1463 | unsigned long nr_reclaimed = sc->nr_reclaimed; | ||
1464 | unsigned long swap_cluster_max = sc->swap_cluster_max; | ||
1419 | 1465 | ||
1420 | get_scan_ratio(zone, sc, percent); | 1466 | get_scan_ratio(zone, sc, percent); |
1421 | 1467 | ||
1422 | for_each_evictable_lru(l) { | 1468 | for_each_evictable_lru(l) { |
1423 | if (scan_global_lru(sc)) { | 1469 | int file = is_file_lru(l); |
1424 | int file = is_file_lru(l); | 1470 | int scan; |
1425 | int scan; | 1471 | |
1426 | 1472 | scan = zone_page_state(zone, NR_LRU_BASE + l); | |
1427 | scan = zone_page_state(zone, NR_LRU_BASE + l); | 1473 | if (priority) { |
1428 | if (priority) { | 1474 | scan >>= priority; |
1429 | scan >>= priority; | 1475 | scan = (scan * percent[file]) / 100; |
1430 | scan = (scan * percent[file]) / 100; | 1476 | } |
1431 | } | 1477 | if (scanning_global_lru(sc)) { |
1432 | zone->lru[l].nr_scan += scan; | 1478 | zone->lru[l].nr_scan += scan; |
1433 | nr[l] = zone->lru[l].nr_scan; | 1479 | nr[l] = zone->lru[l].nr_scan; |
1434 | if (nr[l] >= sc->swap_cluster_max) | 1480 | if (nr[l] >= swap_cluster_max) |
1435 | zone->lru[l].nr_scan = 0; | 1481 | zone->lru[l].nr_scan = 0; |
1436 | else | 1482 | else |
1437 | nr[l] = 0; | 1483 | nr[l] = 0; |
1438 | } else { | 1484 | } else |
1439 | /* | 1485 | nr[l] = scan; |
1440 | * This reclaim occurs not because zone memory shortage | ||
1441 | * but because memory controller hits its limit. | ||
1442 | * Don't modify zone reclaim related data. | ||
1443 | */ | ||
1444 | nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone, | ||
1445 | priority, l); | ||
1446 | } | ||
1447 | } | 1486 | } |
1448 | 1487 | ||
1449 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1488 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
1450 | nr[LRU_INACTIVE_FILE]) { | 1489 | nr[LRU_INACTIVE_FILE]) { |
1451 | for_each_evictable_lru(l) { | 1490 | for_each_evictable_lru(l) { |
1452 | if (nr[l]) { | 1491 | if (nr[l]) { |
1453 | nr_to_scan = min(nr[l], | 1492 | nr_to_scan = min(nr[l], swap_cluster_max); |
1454 | (unsigned long)sc->swap_cluster_max); | ||
1455 | nr[l] -= nr_to_scan; | 1493 | nr[l] -= nr_to_scan; |
1456 | 1494 | ||
1457 | nr_reclaimed += shrink_list(l, nr_to_scan, | 1495 | nr_reclaimed += shrink_list(l, nr_to_scan, |
1458 | zone, sc, priority); | 1496 | zone, sc, priority); |
1459 | } | 1497 | } |
1460 | } | 1498 | } |
1499 | /* | ||
1500 | * On large memory systems, scan >> priority can become | ||
1501 | * really large. This is fine for the starting priority; | ||
1502 | * we want to put equal scanning pressure on each zone. | ||
1503 | * However, if the VM has a harder time of freeing pages, | ||
1504 | * with multiple processes reclaiming pages, the total | ||
1505 | * freeing target can get unreasonably large. | ||
1506 | */ | ||
1507 | if (nr_reclaimed > swap_cluster_max && | ||
1508 | priority < DEF_PRIORITY && !current_is_kswapd()) | ||
1509 | break; | ||
1461 | } | 1510 | } |
1462 | 1511 | ||
1512 | sc->nr_reclaimed = nr_reclaimed; | ||
1513 | |||
1463 | /* | 1514 | /* |
1464 | * Even if we did not try to evict anon pages at all, we want to | 1515 | * Even if we did not try to evict anon pages at all, we want to |
1465 | * rebalance the anon lru active/inactive ratio. | 1516 | * rebalance the anon lru active/inactive ratio. |
1466 | */ | 1517 | */ |
1467 | if (!scan_global_lru(sc) || inactive_anon_is_low(zone)) | 1518 | if (inactive_anon_is_low(zone, sc)) |
1468 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | ||
1469 | else if (!scan_global_lru(sc)) | ||
1470 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 1519 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); |
1471 | 1520 | ||
1472 | throttle_vm_writeout(sc->gfp_mask); | 1521 | throttle_vm_writeout(sc->gfp_mask); |
1473 | return nr_reclaimed; | ||
1474 | } | 1522 | } |
1475 | 1523 | ||
1476 | /* | 1524 | /* |
@@ -1484,16 +1532,13 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
1484 | * b) The zones may be over pages_high but they must go *over* pages_high to | 1532 | * b) The zones may be over pages_high but they must go *over* pages_high to |
1485 | * satisfy the `incremental min' zone defense algorithm. | 1533 | * satisfy the `incremental min' zone defense algorithm. |
1486 | * | 1534 | * |
1487 | * Returns the number of reclaimed pages. | ||
1488 | * | ||
1489 | * If a zone is deemed to be full of pinned pages then just give it a light | 1535 | * If a zone is deemed to be full of pinned pages then just give it a light |
1490 | * scan then give up on it. | 1536 | * scan then give up on it. |
1491 | */ | 1537 | */ |
1492 | static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | 1538 | static void shrink_zones(int priority, struct zonelist *zonelist, |
1493 | struct scan_control *sc) | 1539 | struct scan_control *sc) |
1494 | { | 1540 | { |
1495 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); | 1541 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); |
1496 | unsigned long nr_reclaimed = 0; | ||
1497 | struct zoneref *z; | 1542 | struct zoneref *z; |
1498 | struct zone *zone; | 1543 | struct zone *zone; |
1499 | 1544 | ||
@@ -1505,7 +1550,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | |||
1505 | * Take care memory controller reclaiming has small influence | 1550 | * Take care memory controller reclaiming has small influence |
1506 | * to global LRU. | 1551 | * to global LRU. |
1507 | */ | 1552 | */ |
1508 | if (scan_global_lru(sc)) { | 1553 | if (scanning_global_lru(sc)) { |
1509 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1554 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
1510 | continue; | 1555 | continue; |
1511 | note_zone_scanning_priority(zone, priority); | 1556 | note_zone_scanning_priority(zone, priority); |
@@ -1524,10 +1569,8 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | |||
1524 | priority); | 1569 | priority); |
1525 | } | 1570 | } |
1526 | 1571 | ||
1527 | nr_reclaimed += shrink_zone(priority, zone, sc); | 1572 | shrink_zone(priority, zone, sc); |
1528 | } | 1573 | } |
1529 | |||
1530 | return nr_reclaimed; | ||
1531 | } | 1574 | } |
1532 | 1575 | ||
1533 | /* | 1576 | /* |
@@ -1552,7 +1595,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1552 | int priority; | 1595 | int priority; |
1553 | unsigned long ret = 0; | 1596 | unsigned long ret = 0; |
1554 | unsigned long total_scanned = 0; | 1597 | unsigned long total_scanned = 0; |
1555 | unsigned long nr_reclaimed = 0; | ||
1556 | struct reclaim_state *reclaim_state = current->reclaim_state; | 1598 | struct reclaim_state *reclaim_state = current->reclaim_state; |
1557 | unsigned long lru_pages = 0; | 1599 | unsigned long lru_pages = 0; |
1558 | struct zoneref *z; | 1600 | struct zoneref *z; |
@@ -1561,12 +1603,12 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1561 | 1603 | ||
1562 | delayacct_freepages_start(); | 1604 | delayacct_freepages_start(); |
1563 | 1605 | ||
1564 | if (scan_global_lru(sc)) | 1606 | if (scanning_global_lru(sc)) |
1565 | count_vm_event(ALLOCSTALL); | 1607 | count_vm_event(ALLOCSTALL); |
1566 | /* | 1608 | /* |
1567 | * mem_cgroup will not do shrink_slab. | 1609 | * mem_cgroup will not do shrink_slab. |
1568 | */ | 1610 | */ |
1569 | if (scan_global_lru(sc)) { | 1611 | if (scanning_global_lru(sc)) { |
1570 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1612 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1571 | 1613 | ||
1572 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1614 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
@@ -1580,21 +1622,21 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1580 | sc->nr_scanned = 0; | 1622 | sc->nr_scanned = 0; |
1581 | if (!priority) | 1623 | if (!priority) |
1582 | disable_swap_token(); | 1624 | disable_swap_token(); |
1583 | nr_reclaimed += shrink_zones(priority, zonelist, sc); | 1625 | shrink_zones(priority, zonelist, sc); |
1584 | /* | 1626 | /* |
1585 | * Don't shrink slabs when reclaiming memory from | 1627 | * Don't shrink slabs when reclaiming memory from |
1586 | * over limit cgroups | 1628 | * over limit cgroups |
1587 | */ | 1629 | */ |
1588 | if (scan_global_lru(sc)) { | 1630 | if (scanning_global_lru(sc)) { |
1589 | shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); | 1631 | shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); |
1590 | if (reclaim_state) { | 1632 | if (reclaim_state) { |
1591 | nr_reclaimed += reclaim_state->reclaimed_slab; | 1633 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; |
1592 | reclaim_state->reclaimed_slab = 0; | 1634 | reclaim_state->reclaimed_slab = 0; |
1593 | } | 1635 | } |
1594 | } | 1636 | } |
1595 | total_scanned += sc->nr_scanned; | 1637 | total_scanned += sc->nr_scanned; |
1596 | if (nr_reclaimed >= sc->swap_cluster_max) { | 1638 | if (sc->nr_reclaimed >= sc->swap_cluster_max) { |
1597 | ret = nr_reclaimed; | 1639 | ret = sc->nr_reclaimed; |
1598 | goto out; | 1640 | goto out; |
1599 | } | 1641 | } |
1600 | 1642 | ||
@@ -1616,8 +1658,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1616 | congestion_wait(WRITE, HZ/10); | 1658 | congestion_wait(WRITE, HZ/10); |
1617 | } | 1659 | } |
1618 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 1660 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
1619 | if (!sc->all_unreclaimable && scan_global_lru(sc)) | 1661 | if (!sc->all_unreclaimable && scanning_global_lru(sc)) |
1620 | ret = nr_reclaimed; | 1662 | ret = sc->nr_reclaimed; |
1621 | out: | 1663 | out: |
1622 | /* | 1664 | /* |
1623 | * Now that we've scanned all the zones at this priority level, note | 1665 | * Now that we've scanned all the zones at this priority level, note |
@@ -1629,7 +1671,7 @@ out: | |||
1629 | if (priority < 0) | 1671 | if (priority < 0) |
1630 | priority = 0; | 1672 | priority = 0; |
1631 | 1673 | ||
1632 | if (scan_global_lru(sc)) { | 1674 | if (scanning_global_lru(sc)) { |
1633 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1675 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1634 | 1676 | ||
1635 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1677 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
@@ -1665,19 +1707,24 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
1665 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 1707 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
1666 | 1708 | ||
1667 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | 1709 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, |
1668 | gfp_t gfp_mask) | 1710 | gfp_t gfp_mask, |
1711 | bool noswap, | ||
1712 | unsigned int swappiness) | ||
1669 | { | 1713 | { |
1670 | struct scan_control sc = { | 1714 | struct scan_control sc = { |
1671 | .may_writepage = !laptop_mode, | 1715 | .may_writepage = !laptop_mode, |
1672 | .may_swap = 1, | 1716 | .may_swap = 1, |
1673 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1717 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
1674 | .swappiness = vm_swappiness, | 1718 | .swappiness = swappiness, |
1675 | .order = 0, | 1719 | .order = 0, |
1676 | .mem_cgroup = mem_cont, | 1720 | .mem_cgroup = mem_cont, |
1677 | .isolate_pages = mem_cgroup_isolate_pages, | 1721 | .isolate_pages = mem_cgroup_isolate_pages, |
1678 | }; | 1722 | }; |
1679 | struct zonelist *zonelist; | 1723 | struct zonelist *zonelist; |
1680 | 1724 | ||
1725 | if (noswap) | ||
1726 | sc.may_swap = 0; | ||
1727 | |||
1681 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 1728 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
1682 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 1729 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
1683 | zonelist = NODE_DATA(numa_node_id())->node_zonelists; | 1730 | zonelist = NODE_DATA(numa_node_id())->node_zonelists; |
@@ -1712,7 +1759,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
1712 | int priority; | 1759 | int priority; |
1713 | int i; | 1760 | int i; |
1714 | unsigned long total_scanned; | 1761 | unsigned long total_scanned; |
1715 | unsigned long nr_reclaimed; | ||
1716 | struct reclaim_state *reclaim_state = current->reclaim_state; | 1762 | struct reclaim_state *reclaim_state = current->reclaim_state; |
1717 | struct scan_control sc = { | 1763 | struct scan_control sc = { |
1718 | .gfp_mask = GFP_KERNEL, | 1764 | .gfp_mask = GFP_KERNEL, |
@@ -1731,7 +1777,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
1731 | 1777 | ||
1732 | loop_again: | 1778 | loop_again: |
1733 | total_scanned = 0; | 1779 | total_scanned = 0; |
1734 | nr_reclaimed = 0; | 1780 | sc.nr_reclaimed = 0; |
1735 | sc.may_writepage = !laptop_mode; | 1781 | sc.may_writepage = !laptop_mode; |
1736 | count_vm_event(PAGEOUTRUN); | 1782 | count_vm_event(PAGEOUTRUN); |
1737 | 1783 | ||
@@ -1766,7 +1812,7 @@ loop_again: | |||
1766 | * Do some background aging of the anon list, to give | 1812 | * Do some background aging of the anon list, to give |
1767 | * pages a chance to be referenced before reclaiming. | 1813 | * pages a chance to be referenced before reclaiming. |
1768 | */ | 1814 | */ |
1769 | if (inactive_anon_is_low(zone)) | 1815 | if (inactive_anon_is_low(zone, &sc)) |
1770 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 1816 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
1771 | &sc, priority, 0); | 1817 | &sc, priority, 0); |
1772 | 1818 | ||
@@ -1817,11 +1863,11 @@ loop_again: | |||
1817 | */ | 1863 | */ |
1818 | if (!zone_watermark_ok(zone, order, 8*zone->pages_high, | 1864 | if (!zone_watermark_ok(zone, order, 8*zone->pages_high, |
1819 | end_zone, 0)) | 1865 | end_zone, 0)) |
1820 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 1866 | shrink_zone(priority, zone, &sc); |
1821 | reclaim_state->reclaimed_slab = 0; | 1867 | reclaim_state->reclaimed_slab = 0; |
1822 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1868 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
1823 | lru_pages); | 1869 | lru_pages); |
1824 | nr_reclaimed += reclaim_state->reclaimed_slab; | 1870 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
1825 | total_scanned += sc.nr_scanned; | 1871 | total_scanned += sc.nr_scanned; |
1826 | if (zone_is_all_unreclaimable(zone)) | 1872 | if (zone_is_all_unreclaimable(zone)) |
1827 | continue; | 1873 | continue; |
@@ -1835,7 +1881,7 @@ loop_again: | |||
1835 | * even in laptop mode | 1881 | * even in laptop mode |
1836 | */ | 1882 | */ |
1837 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 1883 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && |
1838 | total_scanned > nr_reclaimed + nr_reclaimed / 2) | 1884 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
1839 | sc.may_writepage = 1; | 1885 | sc.may_writepage = 1; |
1840 | } | 1886 | } |
1841 | if (all_zones_ok) | 1887 | if (all_zones_ok) |
@@ -1853,7 +1899,7 @@ loop_again: | |||
1853 | * matches the direct reclaim path behaviour in terms of impact | 1899 | * matches the direct reclaim path behaviour in terms of impact |
1854 | * on zone->*_priority. | 1900 | * on zone->*_priority. |
1855 | */ | 1901 | */ |
1856 | if (nr_reclaimed >= SWAP_CLUSTER_MAX) | 1902 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) |
1857 | break; | 1903 | break; |
1858 | } | 1904 | } |
1859 | out: | 1905 | out: |
@@ -1872,10 +1918,27 @@ out: | |||
1872 | 1918 | ||
1873 | try_to_freeze(); | 1919 | try_to_freeze(); |
1874 | 1920 | ||
1921 | /* | ||
1922 | * Fragmentation may mean that the system cannot be | ||
1923 | * rebalanced for high-order allocations in all zones. | ||
1924 | * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, | ||
1925 | * it means the zones have been fully scanned and are still | ||
1926 | * not balanced. For high-order allocations, there is | ||
1927 | * little point trying all over again as kswapd may | ||
1928 | * infinite loop. | ||
1929 | * | ||
1930 | * Instead, recheck all watermarks at order-0 as they | ||
1931 | * are the most important. If watermarks are ok, kswapd will go | ||
1932 | * back to sleep. High-order users can still perform direct | ||
1933 | * reclaim if they wish. | ||
1934 | */ | ||
1935 | if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) | ||
1936 | order = sc.order = 0; | ||
1937 | |||
1875 | goto loop_again; | 1938 | goto loop_again; |
1876 | } | 1939 | } |
1877 | 1940 | ||
1878 | return nr_reclaimed; | 1941 | return sc.nr_reclaimed; |
1879 | } | 1942 | } |
1880 | 1943 | ||
1881 | /* | 1944 | /* |
@@ -2227,7 +2290,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2227 | struct task_struct *p = current; | 2290 | struct task_struct *p = current; |
2228 | struct reclaim_state reclaim_state; | 2291 | struct reclaim_state reclaim_state; |
2229 | int priority; | 2292 | int priority; |
2230 | unsigned long nr_reclaimed = 0; | ||
2231 | struct scan_control sc = { | 2293 | struct scan_control sc = { |
2232 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 2294 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
2233 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 2295 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
@@ -2260,9 +2322,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2260 | priority = ZONE_RECLAIM_PRIORITY; | 2322 | priority = ZONE_RECLAIM_PRIORITY; |
2261 | do { | 2323 | do { |
2262 | note_zone_scanning_priority(zone, priority); | 2324 | note_zone_scanning_priority(zone, priority); |
2263 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 2325 | shrink_zone(priority, zone, &sc); |
2264 | priority--; | 2326 | priority--; |
2265 | } while (priority >= 0 && nr_reclaimed < nr_pages); | 2327 | } while (priority >= 0 && sc.nr_reclaimed < nr_pages); |
2266 | } | 2328 | } |
2267 | 2329 | ||
2268 | slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); | 2330 | slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); |
@@ -2286,13 +2348,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2286 | * Update nr_reclaimed by the number of slab pages we | 2348 | * Update nr_reclaimed by the number of slab pages we |
2287 | * reclaimed from this zone. | 2349 | * reclaimed from this zone. |
2288 | */ | 2350 | */ |
2289 | nr_reclaimed += slab_reclaimable - | 2351 | sc.nr_reclaimed += slab_reclaimable - |
2290 | zone_page_state(zone, NR_SLAB_RECLAIMABLE); | 2352 | zone_page_state(zone, NR_SLAB_RECLAIMABLE); |
2291 | } | 2353 | } |
2292 | 2354 | ||
2293 | p->reclaim_state = NULL; | 2355 | p->reclaim_state = NULL; |
2294 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 2356 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
2295 | return nr_reclaimed >= nr_pages; | 2357 | return sc.nr_reclaimed >= nr_pages; |
2296 | } | 2358 | } |
2297 | 2359 | ||
2298 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 2360 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
@@ -2393,6 +2455,7 @@ retry: | |||
2393 | 2455 | ||
2394 | __dec_zone_state(zone, NR_UNEVICTABLE); | 2456 | __dec_zone_state(zone, NR_UNEVICTABLE); |
2395 | list_move(&page->lru, &zone->lru[l].list); | 2457 | list_move(&page->lru, &zone->lru[l].list); |
2458 | mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); | ||
2396 | __inc_zone_state(zone, NR_INACTIVE_ANON + l); | 2459 | __inc_zone_state(zone, NR_INACTIVE_ANON + l); |
2397 | __count_vm_event(UNEVICTABLE_PGRESCUED); | 2460 | __count_vm_event(UNEVICTABLE_PGRESCUED); |
2398 | } else { | 2461 | } else { |
@@ -2401,6 +2464,7 @@ retry: | |||
2401 | */ | 2464 | */ |
2402 | SetPageUnevictable(page); | 2465 | SetPageUnevictable(page); |
2403 | list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); | 2466 | list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); |
2467 | mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); | ||
2404 | if (page_evictable(page, NULL)) | 2468 | if (page_evictable(page, NULL)) |
2405 | goto retry; | 2469 | goto retry; |
2406 | } | 2470 | } |
@@ -2472,7 +2536,7 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) | |||
2472 | * back onto @zone's unevictable list. | 2536 | * back onto @zone's unevictable list. |
2473 | */ | 2537 | */ |
2474 | #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ | 2538 | #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ |
2475 | void scan_zone_unevictable_pages(struct zone *zone) | 2539 | static void scan_zone_unevictable_pages(struct zone *zone) |
2476 | { | 2540 | { |
2477 | struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; | 2541 | struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; |
2478 | unsigned long scan; | 2542 | unsigned long scan; |
@@ -2514,7 +2578,7 @@ void scan_zone_unevictable_pages(struct zone *zone) | |||
2514 | * that has possibly/probably made some previously unevictable pages | 2578 | * that has possibly/probably made some previously unevictable pages |
2515 | * evictable. | 2579 | * evictable. |
2516 | */ | 2580 | */ |
2517 | void scan_all_zones_unevictable_pages(void) | 2581 | static void scan_all_zones_unevictable_pages(void) |
2518 | { | 2582 | { |
2519 | struct zone *zone; | 2583 | struct zone *zone; |
2520 | 2584 | ||