aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/fadvise.c18
-rw-r--r--mm/filemap.c9
-rw-r--r--mm/fremap.c6
-rw-r--r--mm/hugetlb.c28
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memcontrol.c155
-rw-r--r--mm/memory.c2
-rw-r--r--mm/mempolicy.c24
-rw-r--r--mm/migrate.c10
-rw-r--r--mm/mincore.c4
-rw-r--r--mm/mlock.c63
-rw-r--r--mm/mmap.c113
-rw-r--r--mm/mprotect.c9
-rw-r--r--mm/mremap.c6
-rw-r--r--mm/msync.c2
-rw-r--r--mm/nommu.c35
-rw-r--r--mm/page-writeback.c35
-rw-r--r--mm/page_alloc.c27
-rw-r--r--mm/page_cgroup.c3
-rw-r--r--mm/page_io.c2
-rw-r--r--mm/rmap.c3
-rw-r--r--mm/shmem.c43
-rw-r--r--mm/slab.c1
-rw-r--r--mm/slob.c44
-rw-r--r--mm/slub.c75
-rw-r--r--mm/swapfile.c13
-rw-r--r--mm/util.c20
-rw-r--r--mm/vmalloc.c40
-rw-r--r--mm/vmscan.c32
29 files changed, 512 insertions, 312 deletions
diff --git a/mm/fadvise.c b/mm/fadvise.c
index a1da969bd980..54a0f8040afa 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -24,7 +24,7 @@
24 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could 24 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
25 * deactivate the pages and clear PG_Referenced. 25 * deactivate the pages and clear PG_Referenced.
26 */ 26 */
27asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) 27SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
28{ 28{
29 struct file *file = fget(fd); 29 struct file *file = fget(fd);
30 struct address_space *mapping; 30 struct address_space *mapping;
@@ -126,12 +126,26 @@ out:
126 fput(file); 126 fput(file);
127 return ret; 127 return ret;
128} 128}
129#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
130asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice)
131{
132 return SYSC_fadvise64_64((int) fd, offset, len, (int) advice);
133}
134SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64);
135#endif
129 136
130#ifdef __ARCH_WANT_SYS_FADVISE64 137#ifdef __ARCH_WANT_SYS_FADVISE64
131 138
132asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice) 139SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice)
133{ 140{
134 return sys_fadvise64_64(fd, offset, len, advice); 141 return sys_fadvise64_64(fd, offset, len, advice);
135} 142}
143#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
144asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice)
145{
146 return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice);
147}
148SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64);
149#endif
136 150
137#endif 151#endif
diff --git a/mm/filemap.c b/mm/filemap.c
index ceba0bd03662..23acefe51808 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1374,7 +1374,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
1374 return 0; 1374 return 0;
1375} 1375}
1376 1376
1377asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) 1377SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
1378{ 1378{
1379 ssize_t ret; 1379 ssize_t ret;
1380 struct file *file; 1380 struct file *file;
@@ -1393,6 +1393,13 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1393 } 1393 }
1394 return ret; 1394 return ret;
1395} 1395}
1396#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
1397asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
1398{
1399 return SYSC_readahead((int) fd, offset, (size_t) count);
1400}
1401SYSCALL_ALIAS(sys_readahead, SyS_readahead);
1402#endif
1396 1403
1397#ifdef CONFIG_MMU 1404#ifdef CONFIG_MMU
1398/** 1405/**
diff --git a/mm/fremap.c b/mm/fremap.c
index 62d5bbda921a..b6ec85abbb39 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -120,8 +120,8 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
120 * and the vma's default protection is used. Arbitrary protections 120 * and the vma's default protection is used. Arbitrary protections
121 * might be implemented in the future. 121 * might be implemented in the future.
122 */ 122 */
123asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, 123SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
124 unsigned long prot, unsigned long pgoff, unsigned long flags) 124 unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
125{ 125{
126 struct mm_struct *mm = current->mm; 126 struct mm_struct *mm = current->mm;
127 struct address_space *mapping; 127 struct address_space *mapping;
@@ -198,7 +198,7 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
198 flags &= MAP_NONBLOCK; 198 flags &= MAP_NONBLOCK;
199 get_file(file); 199 get_file(file);
200 addr = mmap_region(file, start, size, 200 addr = mmap_region(file, start, size,
201 flags, vma->vm_flags, pgoff, 1); 201 flags, vma->vm_flags, pgoff);
202 fput(file); 202 fput(file);
203 if (IS_ERR_VALUE(addr)) { 203 if (IS_ERR_VALUE(addr)) {
204 err = addr; 204 err = addr;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 618e98304080..107da3d809a8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2269,12 +2269,18 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2269 2269
2270int hugetlb_reserve_pages(struct inode *inode, 2270int hugetlb_reserve_pages(struct inode *inode,
2271 long from, long to, 2271 long from, long to,
2272 struct vm_area_struct *vma) 2272 struct vm_area_struct *vma,
2273 int acctflag)
2273{ 2274{
2274 long ret, chg; 2275 long ret, chg;
2275 struct hstate *h = hstate_inode(inode); 2276 struct hstate *h = hstate_inode(inode);
2276 2277
2277 if (vma && vma->vm_flags & VM_NORESERVE) 2278 /*
2279 * Only apply hugepage reservation if asked. At fault time, an
2280 * attempt will be made for VM_NORESERVE to allocate a page
2281 * and filesystem quota without using reserves
2282 */
2283 if (acctflag & VM_NORESERVE)
2278 return 0; 2284 return 0;
2279 2285
2280 /* 2286 /*
@@ -2299,13 +2305,31 @@ int hugetlb_reserve_pages(struct inode *inode,
2299 if (chg < 0) 2305 if (chg < 0)
2300 return chg; 2306 return chg;
2301 2307
2308 /* There must be enough filesystem quota for the mapping */
2302 if (hugetlb_get_quota(inode->i_mapping, chg)) 2309 if (hugetlb_get_quota(inode->i_mapping, chg))
2303 return -ENOSPC; 2310 return -ENOSPC;
2311
2312 /*
2313 * Check enough hugepages are available for the reservation.
2314 * Hand back the quota if there are not
2315 */
2304 ret = hugetlb_acct_memory(h, chg); 2316 ret = hugetlb_acct_memory(h, chg);
2305 if (ret < 0) { 2317 if (ret < 0) {
2306 hugetlb_put_quota(inode->i_mapping, chg); 2318 hugetlb_put_quota(inode->i_mapping, chg);
2307 return ret; 2319 return ret;
2308 } 2320 }
2321
2322 /*
2323 * Account for the reservations made. Shared mappings record regions
2324 * that have reservations as they are shared by multiple VMAs.
2325 * When the last VMA disappears, the region map says how much
2326 * the reservation was and the page cache tells how much of
2327 * the reservation was consumed. Private mappings are per-VMA and
2328 * only the consumed reservations are tracked. When the VMA
2329 * disappears, the original reservation is the VMA size and the
2330 * consumed reservations are stored in the map. Hence, nothing
2331 * else has to be done for private mappings here
2332 */
2309 if (!vma || vma->vm_flags & VM_SHARED) 2333 if (!vma || vma->vm_flags & VM_SHARED)
2310 region_add(&inode->i_mapping->private_list, from, to); 2334 region_add(&inode->i_mapping->private_list, from, to);
2311 return 0; 2335 return 0;
diff --git a/mm/madvise.c b/mm/madvise.c
index f9349c18a1b5..b9ce574827c8 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -281,7 +281,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
281 * -EBADF - map exists, but area maps something that isn't a file. 281 * -EBADF - map exists, but area maps something that isn't a file.
282 * -EAGAIN - a kernel resource was temporarily unavailable. 282 * -EAGAIN - a kernel resource was temporarily unavailable.
283 */ 283 */
284asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) 284SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
285{ 285{
286 unsigned long end, tmp; 286 unsigned long end, tmp;
287 struct vm_area_struct * vma, *prev; 287 struct vm_area_struct * vma, *prev;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e2996b80601f..8e4be9cb2a6a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -202,6 +202,7 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
202 202
203static void mem_cgroup_get(struct mem_cgroup *mem); 203static void mem_cgroup_get(struct mem_cgroup *mem);
204static void mem_cgroup_put(struct mem_cgroup *mem); 204static void mem_cgroup_put(struct mem_cgroup *mem);
205static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
205 206
206static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 207static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
207 struct page_cgroup *pc, 208 struct page_cgroup *pc,
@@ -358,6 +359,10 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
358 return; 359 return;
359 360
360 pc = lookup_page_cgroup(page); 361 pc = lookup_page_cgroup(page);
362 /*
363 * Used bit is set without atomic ops but after smp_wmb().
364 * For making pc->mem_cgroup visible, insert smp_rmb() here.
365 */
361 smp_rmb(); 366 smp_rmb();
362 /* unused page is not rotated. */ 367 /* unused page is not rotated. */
363 if (!PageCgroupUsed(pc)) 368 if (!PageCgroupUsed(pc))
@@ -374,7 +379,10 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
374 if (mem_cgroup_disabled()) 379 if (mem_cgroup_disabled())
375 return; 380 return;
376 pc = lookup_page_cgroup(page); 381 pc = lookup_page_cgroup(page);
377 /* barrier to sync with "charge" */ 382 /*
383 * Used bit is set without atomic ops but after smp_wmb().
384 * For making pc->mem_cgroup visible, insert smp_rmb() here.
385 */
378 smp_rmb(); 386 smp_rmb();
379 if (!PageCgroupUsed(pc)) 387 if (!PageCgroupUsed(pc))
380 return; 388 return;
@@ -559,6 +567,14 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
559 return NULL; 567 return NULL;
560 568
561 pc = lookup_page_cgroup(page); 569 pc = lookup_page_cgroup(page);
570 /*
571 * Used bit is set without atomic ops but after smp_wmb().
572 * For making pc->mem_cgroup visible, insert smp_rmb() here.
573 */
574 smp_rmb();
575 if (!PageCgroupUsed(pc))
576 return NULL;
577
562 mz = page_cgroup_zoneinfo(pc); 578 mz = page_cgroup_zoneinfo(pc);
563 if (!mz) 579 if (!mz)
564 return NULL; 580 return NULL;
@@ -618,7 +634,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
618 * called with hierarchy_mutex held 634 * called with hierarchy_mutex held
619 */ 635 */
620static struct mem_cgroup * 636static struct mem_cgroup *
621mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) 637__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
622{ 638{
623 struct cgroup *cgroup, *curr_cgroup, *root_cgroup; 639 struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
624 640
@@ -629,19 +645,16 @@ mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
629 /* 645 /*
630 * Walk down to children 646 * Walk down to children
631 */ 647 */
632 mem_cgroup_put(curr);
633 cgroup = list_entry(curr_cgroup->children.next, 648 cgroup = list_entry(curr_cgroup->children.next,
634 struct cgroup, sibling); 649 struct cgroup, sibling);
635 curr = mem_cgroup_from_cont(cgroup); 650 curr = mem_cgroup_from_cont(cgroup);
636 mem_cgroup_get(curr);
637 goto done; 651 goto done;
638 } 652 }
639 653
640visit_parent: 654visit_parent:
641 if (curr_cgroup == root_cgroup) { 655 if (curr_cgroup == root_cgroup) {
642 mem_cgroup_put(curr); 656 /* caller handles NULL case */
643 curr = root_mem; 657 curr = NULL;
644 mem_cgroup_get(curr);
645 goto done; 658 goto done;
646 } 659 }
647 660
@@ -649,11 +662,9 @@ visit_parent:
649 * Goto next sibling 662 * Goto next sibling
650 */ 663 */
651 if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) { 664 if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
652 mem_cgroup_put(curr);
653 cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup, 665 cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
654 sibling); 666 sibling);
655 curr = mem_cgroup_from_cont(cgroup); 667 curr = mem_cgroup_from_cont(cgroup);
656 mem_cgroup_get(curr);
657 goto done; 668 goto done;
658 } 669 }
659 670
@@ -664,7 +675,6 @@ visit_parent:
664 goto visit_parent; 675 goto visit_parent;
665 676
666done: 677done:
667 root_mem->last_scanned_child = curr;
668 return curr; 678 return curr;
669} 679}
670 680
@@ -674,40 +684,46 @@ done:
674 * that to reclaim free pages from. 684 * that to reclaim free pages from.
675 */ 685 */
676static struct mem_cgroup * 686static struct mem_cgroup *
677mem_cgroup_get_first_node(struct mem_cgroup *root_mem) 687mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
678{ 688{
679 struct cgroup *cgroup; 689 struct cgroup *cgroup;
680 struct mem_cgroup *ret; 690 struct mem_cgroup *orig, *next;
681 bool obsolete; 691 bool obsolete;
682 692
683 obsolete = mem_cgroup_is_obsolete(root_mem->last_scanned_child);
684
685 /* 693 /*
686 * Scan all children under the mem_cgroup mem 694 * Scan all children under the mem_cgroup mem
687 */ 695 */
688 mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); 696 mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
697
698 orig = root_mem->last_scanned_child;
699 obsolete = mem_cgroup_is_obsolete(orig);
700
689 if (list_empty(&root_mem->css.cgroup->children)) { 701 if (list_empty(&root_mem->css.cgroup->children)) {
690 ret = root_mem; 702 /*
703 * root_mem might have children before and last_scanned_child
704 * may point to one of them. We put it later.
705 */
706 if (orig)
707 VM_BUG_ON(!obsolete);
708 next = NULL;
691 goto done; 709 goto done;
692 } 710 }
693 711
694 if (!root_mem->last_scanned_child || obsolete) { 712 if (!orig || obsolete) {
695
696 if (obsolete && root_mem->last_scanned_child)
697 mem_cgroup_put(root_mem->last_scanned_child);
698
699 cgroup = list_first_entry(&root_mem->css.cgroup->children, 713 cgroup = list_first_entry(&root_mem->css.cgroup->children,
700 struct cgroup, sibling); 714 struct cgroup, sibling);
701 ret = mem_cgroup_from_cont(cgroup); 715 next = mem_cgroup_from_cont(cgroup);
702 mem_cgroup_get(ret);
703 } else 716 } else
704 ret = mem_cgroup_get_next_node(root_mem->last_scanned_child, 717 next = __mem_cgroup_get_next_node(orig, root_mem);
705 root_mem);
706 718
707done: 719done:
708 root_mem->last_scanned_child = ret; 720 if (next)
721 mem_cgroup_get(next);
722 root_mem->last_scanned_child = next;
723 if (orig)
724 mem_cgroup_put(orig);
709 mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); 725 mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
710 return ret; 726 return (next) ? next : root_mem;
711} 727}
712 728
713static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 729static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
@@ -758,28 +774,25 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
758 * but there might be left over accounting, even after children 774 * but there might be left over accounting, even after children
759 * have left. 775 * have left.
760 */ 776 */
761 ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, 777 ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
762 get_swappiness(root_mem)); 778 get_swappiness(root_mem));
763 if (mem_cgroup_check_under_limit(root_mem)) 779 if (mem_cgroup_check_under_limit(root_mem))
764 return 0; 780 return 1; /* indicate reclaim has succeeded */
765 if (!root_mem->use_hierarchy) 781 if (!root_mem->use_hierarchy)
766 return ret; 782 return ret;
767 783
768 next_mem = mem_cgroup_get_first_node(root_mem); 784 next_mem = mem_cgroup_get_next_node(root_mem);
769 785
770 while (next_mem != root_mem) { 786 while (next_mem != root_mem) {
771 if (mem_cgroup_is_obsolete(next_mem)) { 787 if (mem_cgroup_is_obsolete(next_mem)) {
772 mem_cgroup_put(next_mem); 788 next_mem = mem_cgroup_get_next_node(root_mem);
773 next_mem = mem_cgroup_get_first_node(root_mem);
774 continue; 789 continue;
775 } 790 }
776 ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, 791 ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
777 get_swappiness(next_mem)); 792 get_swappiness(next_mem));
778 if (mem_cgroup_check_under_limit(root_mem)) 793 if (mem_cgroup_check_under_limit(root_mem))
779 return 0; 794 return 1; /* indicate reclaim has succeeded */
780 mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); 795 next_mem = mem_cgroup_get_next_node(root_mem);
781 next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
782 mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
783 } 796 }
784 return ret; 797 return ret;
785} 798}
@@ -863,6 +876,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
863 876
864 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, 877 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
865 noswap); 878 noswap);
879 if (ret)
880 continue;
866 881
867 /* 882 /*
868 * try_to_free_mem_cgroup_pages() might not give us a full 883 * try_to_free_mem_cgroup_pages() might not give us a full
@@ -979,14 +994,15 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
979 if (pc->mem_cgroup != from) 994 if (pc->mem_cgroup != from)
980 goto out; 995 goto out;
981 996
982 css_put(&from->css);
983 res_counter_uncharge(&from->res, PAGE_SIZE); 997 res_counter_uncharge(&from->res, PAGE_SIZE);
984 mem_cgroup_charge_statistics(from, pc, false); 998 mem_cgroup_charge_statistics(from, pc, false);
985 if (do_swap_account) 999 if (do_swap_account)
986 res_counter_uncharge(&from->memsw, PAGE_SIZE); 1000 res_counter_uncharge(&from->memsw, PAGE_SIZE);
1001 css_put(&from->css);
1002
1003 css_get(&to->css);
987 pc->mem_cgroup = to; 1004 pc->mem_cgroup = to;
988 mem_cgroup_charge_statistics(to, pc, true); 1005 mem_cgroup_charge_statistics(to, pc, true);
989 css_get(&to->css);
990 ret = 0; 1006 ret = 0;
991out: 1007out:
992 unlock_page_cgroup(pc); 1008 unlock_page_cgroup(pc);
@@ -1019,8 +1035,10 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
1019 if (ret || !parent) 1035 if (ret || !parent)
1020 return ret; 1036 return ret;
1021 1037
1022 if (!get_page_unless_zero(page)) 1038 if (!get_page_unless_zero(page)) {
1023 return -EBUSY; 1039 ret = -EBUSY;
1040 goto uncharge;
1041 }
1024 1042
1025 ret = isolate_lru_page(page); 1043 ret = isolate_lru_page(page);
1026 1044
@@ -1029,19 +1047,23 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
1029 1047
1030 ret = mem_cgroup_move_account(pc, child, parent); 1048 ret = mem_cgroup_move_account(pc, child, parent);
1031 1049
1032 /* drop extra refcnt by try_charge() (move_account increment one) */
1033 css_put(&parent->css);
1034 putback_lru_page(page); 1050 putback_lru_page(page);
1035 if (!ret) { 1051 if (!ret) {
1036 put_page(page); 1052 put_page(page);
1053 /* drop extra refcnt by try_charge() */
1054 css_put(&parent->css);
1037 return 0; 1055 return 0;
1038 } 1056 }
1039 /* uncharge if move fails */ 1057
1040cancel: 1058cancel:
1059 put_page(page);
1060uncharge:
1061 /* drop extra refcnt by try_charge() */
1062 css_put(&parent->css);
1063 /* uncharge if move fails */
1041 res_counter_uncharge(&parent->res, PAGE_SIZE); 1064 res_counter_uncharge(&parent->res, PAGE_SIZE);
1042 if (do_swap_account) 1065 if (do_swap_account)
1043 res_counter_uncharge(&parent->memsw, PAGE_SIZE); 1066 res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1044 put_page(page);
1045 return ret; 1067 return ret;
1046} 1068}
1047 1069
@@ -1663,7 +1685,7 @@ move_account:
1663 /* This is for making all *used* pages to be on LRU. */ 1685 /* This is for making all *used* pages to be on LRU. */
1664 lru_add_drain_all(); 1686 lru_add_drain_all();
1665 ret = 0; 1687 ret = 0;
1666 for_each_node_state(node, N_POSSIBLE) { 1688 for_each_node_state(node, N_HIGH_MEMORY) {
1667 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 1689 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
1668 enum lru_list l; 1690 enum lru_list l;
1669 for_each_lru(l) { 1691 for_each_lru(l) {
@@ -1971,6 +1993,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
1971{ 1993{
1972 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 1994 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
1973 struct mem_cgroup *parent; 1995 struct mem_cgroup *parent;
1996
1974 if (val > 100) 1997 if (val > 100)
1975 return -EINVAL; 1998 return -EINVAL;
1976 1999
@@ -1978,15 +2001,22 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
1978 return -EINVAL; 2001 return -EINVAL;
1979 2002
1980 parent = mem_cgroup_from_cont(cgrp->parent); 2003 parent = mem_cgroup_from_cont(cgrp->parent);
2004
2005 cgroup_lock();
2006
1981 /* If under hierarchy, only empty-root can set this value */ 2007 /* If under hierarchy, only empty-root can set this value */
1982 if ((parent->use_hierarchy) || 2008 if ((parent->use_hierarchy) ||
1983 (memcg->use_hierarchy && !list_empty(&cgrp->children))) 2009 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
2010 cgroup_unlock();
1984 return -EINVAL; 2011 return -EINVAL;
2012 }
1985 2013
1986 spin_lock(&memcg->reclaim_param_lock); 2014 spin_lock(&memcg->reclaim_param_lock);
1987 memcg->swappiness = val; 2015 memcg->swappiness = val;
1988 spin_unlock(&memcg->reclaim_param_lock); 2016 spin_unlock(&memcg->reclaim_param_lock);
1989 2017
2018 cgroup_unlock();
2019
1990 return 0; 2020 return 0;
1991} 2021}
1992 2022
@@ -2164,10 +2194,23 @@ static void mem_cgroup_get(struct mem_cgroup *mem)
2164 2194
2165static void mem_cgroup_put(struct mem_cgroup *mem) 2195static void mem_cgroup_put(struct mem_cgroup *mem)
2166{ 2196{
2167 if (atomic_dec_and_test(&mem->refcnt)) 2197 if (atomic_dec_and_test(&mem->refcnt)) {
2198 struct mem_cgroup *parent = parent_mem_cgroup(mem);
2168 __mem_cgroup_free(mem); 2199 __mem_cgroup_free(mem);
2200 if (parent)
2201 mem_cgroup_put(parent);
2202 }
2169} 2203}
2170 2204
2205/*
2206 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
2207 */
2208static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
2209{
2210 if (!mem->res.parent)
2211 return NULL;
2212 return mem_cgroup_from_res_counter(mem->res.parent, res);
2213}
2171 2214
2172#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2215#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2173static void __init enable_swap_cgroup(void) 2216static void __init enable_swap_cgroup(void)
@@ -2181,7 +2224,7 @@ static void __init enable_swap_cgroup(void)
2181} 2224}
2182#endif 2225#endif
2183 2226
2184static struct cgroup_subsys_state * 2227static struct cgroup_subsys_state * __ref
2185mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 2228mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2186{ 2229{
2187 struct mem_cgroup *mem, *parent; 2230 struct mem_cgroup *mem, *parent;
@@ -2206,6 +2249,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2206 if (parent && parent->use_hierarchy) { 2249 if (parent && parent->use_hierarchy) {
2207 res_counter_init(&mem->res, &parent->res); 2250 res_counter_init(&mem->res, &parent->res);
2208 res_counter_init(&mem->memsw, &parent->memsw); 2251 res_counter_init(&mem->memsw, &parent->memsw);
2252 /*
2253 * We increment refcnt of the parent to ensure that we can
2254 * safely access it on res_counter_charge/uncharge.
2255 * This refcnt will be decremented when freeing this
2256 * mem_cgroup(see mem_cgroup_put).
2257 */
2258 mem_cgroup_get(parent);
2209 } else { 2259 } else {
2210 res_counter_init(&mem->res, NULL); 2260 res_counter_init(&mem->res, NULL);
2211 res_counter_init(&mem->memsw, NULL); 2261 res_counter_init(&mem->memsw, NULL);
@@ -2232,7 +2282,14 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
2232static void mem_cgroup_destroy(struct cgroup_subsys *ss, 2282static void mem_cgroup_destroy(struct cgroup_subsys *ss,
2233 struct cgroup *cont) 2283 struct cgroup *cont)
2234{ 2284{
2235 mem_cgroup_put(mem_cgroup_from_cont(cont)); 2285 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2286 struct mem_cgroup *last_scanned_child = mem->last_scanned_child;
2287
2288 if (last_scanned_child) {
2289 VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
2290 mem_cgroup_put(last_scanned_child);
2291 }
2292 mem_cgroup_put(mem);
2236} 2293}
2237 2294
2238static int mem_cgroup_populate(struct cgroup_subsys *ss, 2295static int mem_cgroup_populate(struct cgroup_subsys *ss,
diff --git a/mm/memory.c b/mm/memory.c
index 22bfa7a47a0b..baa999e87cd2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1999,7 +1999,7 @@ gotten:
1999 * Don't let another task, with possibly unlocked vma, 1999 * Don't let another task, with possibly unlocked vma,
2000 * keep the mlocked page. 2000 * keep the mlocked page.
2001 */ 2001 */
2002 if (vma->vm_flags & VM_LOCKED) { 2002 if ((vma->vm_flags & VM_LOCKED) && old_page) {
2003 lock_page(old_page); /* for LRU manipulation */ 2003 lock_page(old_page); /* for LRU manipulation */
2004 clear_page_mlock(old_page); 2004 clear_page_mlock(old_page);
2005 unlock_page(old_page); 2005 unlock_page(old_page);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e412ffa8e52e..3eb4a6fdc043 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1068,10 +1068,9 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1068 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; 1068 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1069} 1069}
1070 1070
1071asmlinkage long sys_mbind(unsigned long start, unsigned long len, 1071SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1072 unsigned long mode, 1072 unsigned long, mode, unsigned long __user *, nmask,
1073 unsigned long __user *nmask, unsigned long maxnode, 1073 unsigned long, maxnode, unsigned, flags)
1074 unsigned flags)
1075{ 1074{
1076 nodemask_t nodes; 1075 nodemask_t nodes;
1077 int err; 1076 int err;
@@ -1091,8 +1090,8 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
1091} 1090}
1092 1091
1093/* Set the process memory policy */ 1092/* Set the process memory policy */
1094asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, 1093SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1095 unsigned long maxnode) 1094 unsigned long, maxnode)
1096{ 1095{
1097 int err; 1096 int err;
1098 nodemask_t nodes; 1097 nodemask_t nodes;
@@ -1110,9 +1109,9 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
1110 return do_set_mempolicy(mode, flags, &nodes); 1109 return do_set_mempolicy(mode, flags, &nodes);
1111} 1110}
1112 1111
1113asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, 1112SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1114 const unsigned long __user *old_nodes, 1113 const unsigned long __user *, old_nodes,
1115 const unsigned long __user *new_nodes) 1114 const unsigned long __user *, new_nodes)
1116{ 1115{
1117 const struct cred *cred = current_cred(), *tcred; 1116 const struct cred *cred = current_cred(), *tcred;
1118 struct mm_struct *mm; 1117 struct mm_struct *mm;
@@ -1185,10 +1184,9 @@ out:
1185 1184
1186 1185
1187/* Retrieve NUMA policy */ 1186/* Retrieve NUMA policy */
1188asmlinkage long sys_get_mempolicy(int __user *policy, 1187SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1189 unsigned long __user *nmask, 1188 unsigned long __user *, nmask, unsigned long, maxnode,
1190 unsigned long maxnode, 1189 unsigned long, addr, unsigned long, flags)
1191 unsigned long addr, unsigned long flags)
1192{ 1190{
1193 int err; 1191 int err;
1194 int uninitialized_var(pval); 1192 int uninitialized_var(pval);
diff --git a/mm/migrate.c b/mm/migrate.c
index a30ea5fcf9f1..a9eff3f092f6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1055,10 +1055,10 @@ out:
1055 * Move a list of pages in the address space of the currently executing 1055 * Move a list of pages in the address space of the currently executing
1056 * process. 1056 * process.
1057 */ 1057 */
1058asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, 1058SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1059 const void __user * __user *pages, 1059 const void __user * __user *, pages,
1060 const int __user *nodes, 1060 const int __user *, nodes,
1061 int __user *status, int flags) 1061 int __user *, status, int, flags)
1062{ 1062{
1063 const struct cred *cred = current_cred(), *tcred; 1063 const struct cred *cred = current_cred(), *tcred;
1064 struct task_struct *task; 1064 struct task_struct *task;
@@ -1129,7 +1129,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1129 struct vm_area_struct *vma; 1129 struct vm_area_struct *vma;
1130 int err = 0; 1130 int err = 0;
1131 1131
1132 for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) { 1132 for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
1133 if (vma->vm_ops && vma->vm_ops->migrate) { 1133 if (vma->vm_ops && vma->vm_ops->migrate) {
1134 err = vma->vm_ops->migrate(vma, to, from, flags); 1134 err = vma->vm_ops->migrate(vma, to, from, flags);
1135 if (err) 1135 if (err)
diff --git a/mm/mincore.c b/mm/mincore.c
index 5178800bc129..8cb508f84ea4 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -177,8 +177,8 @@ none_mapped:
177 * mapped 177 * mapped
178 * -EAGAIN - A kernel resource was temporarily unavailable. 178 * -EAGAIN - A kernel resource was temporarily unavailable.
179 */ 179 */
180asmlinkage long sys_mincore(unsigned long start, size_t len, 180SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
181 unsigned char __user * vec) 181 unsigned char __user *, vec)
182{ 182{
183 long retval; 183 long retval;
184 unsigned long pages; 184 unsigned long pages;
diff --git a/mm/mlock.c b/mm/mlock.c
index e125156c664e..cbe9e0581b75 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -294,14 +294,10 @@ static inline int __mlock_posix_error_return(long retval)
294 * 294 *
295 * return number of pages [> 0] to be removed from locked_vm on success 295 * return number of pages [> 0] to be removed from locked_vm on success
296 * of "special" vmas. 296 * of "special" vmas.
297 *
298 * return negative error if vma spanning @start-@range disappears while
299 * mmap semaphore is dropped. Unlikely?
300 */ 297 */
301long mlock_vma_pages_range(struct vm_area_struct *vma, 298long mlock_vma_pages_range(struct vm_area_struct *vma,
302 unsigned long start, unsigned long end) 299 unsigned long start, unsigned long end)
303{ 300{
304 struct mm_struct *mm = vma->vm_mm;
305 int nr_pages = (end - start) / PAGE_SIZE; 301 int nr_pages = (end - start) / PAGE_SIZE;
306 BUG_ON(!(vma->vm_flags & VM_LOCKED)); 302 BUG_ON(!(vma->vm_flags & VM_LOCKED));
307 303
@@ -314,20 +310,11 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
314 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || 310 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
315 is_vm_hugetlb_page(vma) || 311 is_vm_hugetlb_page(vma) ||
316 vma == get_gate_vma(current))) { 312 vma == get_gate_vma(current))) {
317 long error;
318 downgrade_write(&mm->mmap_sem);
319 313
320 error = __mlock_vma_pages_range(vma, start, end, 1); 314 __mlock_vma_pages_range(vma, start, end, 1);
321 315
322 up_read(&mm->mmap_sem); 316 /* Hide errors from mmap() and other callers */
323 /* vma can change or disappear */ 317 return 0;
324 down_write(&mm->mmap_sem);
325 vma = find_vma(mm, start);
326 /* non-NULL vma must contain @start, but need to check @end */
327 if (!vma || end > vma->vm_end)
328 return -ENOMEM;
329
330 return 0; /* hide other errors from mmap(), et al */
331 } 318 }
332 319
333 /* 320 /*
@@ -438,41 +425,14 @@ success:
438 vma->vm_flags = newflags; 425 vma->vm_flags = newflags;
439 426
440 if (lock) { 427 if (lock) {
441 /*
442 * mmap_sem is currently held for write. Downgrade the write
443 * lock to a read lock so that other faults, mmap scans, ...
444 * while we fault in all pages.
445 */
446 downgrade_write(&mm->mmap_sem);
447
448 ret = __mlock_vma_pages_range(vma, start, end, 1); 428 ret = __mlock_vma_pages_range(vma, start, end, 1);
449 429
450 /* 430 if (ret > 0) {
451 * Need to reacquire mmap sem in write mode, as our callers
452 * expect this. We have no support for atomically upgrading
453 * a sem to write, so we need to check for ranges while sem
454 * is unlocked.
455 */
456 up_read(&mm->mmap_sem);
457 /* vma can change or disappear */
458 down_write(&mm->mmap_sem);
459 *prev = find_vma(mm, start);
460 /* non-NULL *prev must contain @start, but need to check @end */
461 if (!(*prev) || end > (*prev)->vm_end)
462 ret = -ENOMEM;
463 else if (ret > 0) {
464 mm->locked_vm -= ret; 431 mm->locked_vm -= ret;
465 ret = 0; 432 ret = 0;
466 } else 433 } else
467 ret = __mlock_posix_error_return(ret); /* translate if needed */ 434 ret = __mlock_posix_error_return(ret); /* translate if needed */
468 } else { 435 } else {
469 /*
470 * TODO: for unlocking, pages will already be resident, so
471 * we don't need to wait for allocations/reclaim/pagein, ...
472 * However, unlocking a very large region can still take a
473 * while. Should we downgrade the semaphore for both lock
474 * AND unlock ?
475 */
476 __mlock_vma_pages_range(vma, start, end, 0); 436 __mlock_vma_pages_range(vma, start, end, 0);
477 } 437 }
478 438
@@ -530,7 +490,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
530 return error; 490 return error;
531} 491}
532 492
533asmlinkage long sys_mlock(unsigned long start, size_t len) 493SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
534{ 494{
535 unsigned long locked; 495 unsigned long locked;
536 unsigned long lock_limit; 496 unsigned long lock_limit;
@@ -558,7 +518,7 @@ asmlinkage long sys_mlock(unsigned long start, size_t len)
558 return error; 518 return error;
559} 519}
560 520
561asmlinkage long sys_munlock(unsigned long start, size_t len) 521SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
562{ 522{
563 int ret; 523 int ret;
564 524
@@ -595,7 +555,7 @@ out:
595 return 0; 555 return 0;
596} 556}
597 557
598asmlinkage long sys_mlockall(int flags) 558SYSCALL_DEFINE1(mlockall, int, flags)
599{ 559{
600 unsigned long lock_limit; 560 unsigned long lock_limit;
601 int ret = -EINVAL; 561 int ret = -EINVAL;
@@ -623,7 +583,7 @@ out:
623 return ret; 583 return ret;
624} 584}
625 585
626asmlinkage long sys_munlockall(void) 586SYSCALL_DEFINE0(munlockall)
627{ 587{
628 int ret; 588 int ret;
629 589
@@ -700,7 +660,7 @@ void *alloc_locked_buffer(size_t size)
700 return buffer; 660 return buffer;
701} 661}
702 662
703void free_locked_buffer(void *buffer, size_t size) 663void release_locked_buffer(void *buffer, size_t size)
704{ 664{
705 unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; 665 unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
706 666
@@ -710,6 +670,11 @@ void free_locked_buffer(void *buffer, size_t size)
710 current->mm->locked_vm -= pgsz; 670 current->mm->locked_vm -= pgsz;
711 671
712 up_write(&current->mm->mmap_sem); 672 up_write(&current->mm->mmap_sem);
673}
674
675void free_locked_buffer(void *buffer, size_t size)
676{
677 release_locked_buffer(buffer, size);
713 678
714 kfree(buffer); 679 kfree(buffer);
715} 680}
diff --git a/mm/mmap.c b/mm/mmap.c
index 749623196cb9..00ced3ee49a8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -245,7 +245,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
245 return next; 245 return next;
246} 246}
247 247
248asmlinkage unsigned long sys_brk(unsigned long brk) 248SYSCALL_DEFINE1(brk, unsigned long, brk)
249{ 249{
250 unsigned long rlim, retval; 250 unsigned long rlim, retval;
251 unsigned long newbrk, oldbrk; 251 unsigned long newbrk, oldbrk;
@@ -658,6 +658,9 @@ again: remove_next = 1 + (end > next->vm_end);
658 validate_mm(mm); 658 validate_mm(mm);
659} 659}
660 660
661/* Flags that can be inherited from an existing mapping when merging */
662#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR)
663
661/* 664/*
662 * If the vma has a ->close operation then the driver probably needs to release 665 * If the vma has a ->close operation then the driver probably needs to release
663 * per-vma resources, so we don't attempt to merge those. 666 * per-vma resources, so we don't attempt to merge those.
@@ -665,7 +668,7 @@ again: remove_next = 1 + (end > next->vm_end);
665static inline int is_mergeable_vma(struct vm_area_struct *vma, 668static inline int is_mergeable_vma(struct vm_area_struct *vma,
666 struct file *file, unsigned long vm_flags) 669 struct file *file, unsigned long vm_flags)
667{ 670{
668 if (vma->vm_flags != vm_flags) 671 if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS)
669 return 0; 672 return 0;
670 if (vma->vm_file != file) 673 if (vma->vm_file != file)
671 return 0; 674 return 0;
@@ -915,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
915 struct inode *inode; 918 struct inode *inode;
916 unsigned int vm_flags; 919 unsigned int vm_flags;
917 int error; 920 int error;
918 int accountable = 1;
919 unsigned long reqprot = prot; 921 unsigned long reqprot = prot;
920 922
921 /* 923 /*
@@ -1016,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1016 return -EPERM; 1018 return -EPERM;
1017 vm_flags &= ~VM_MAYEXEC; 1019 vm_flags &= ~VM_MAYEXEC;
1018 } 1020 }
1019 if (is_file_hugepages(file))
1020 accountable = 0;
1021 1021
1022 if (!file->f_op || !file->f_op->mmap) 1022 if (!file->f_op || !file->f_op->mmap)
1023 return -ENODEV; 1023 return -ENODEV;
@@ -1050,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1050 if (error) 1050 if (error)
1051 return error; 1051 return error;
1052 1052
1053 return mmap_region(file, addr, len, flags, vm_flags, pgoff, 1053 return mmap_region(file, addr, len, flags, vm_flags, pgoff);
1054 accountable);
1055} 1054}
1056EXPORT_SYMBOL(do_mmap_pgoff); 1055EXPORT_SYMBOL(do_mmap_pgoff);
1057 1056
@@ -1087,10 +1086,25 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1087 mapping_cap_account_dirty(vma->vm_file->f_mapping); 1086 mapping_cap_account_dirty(vma->vm_file->f_mapping);
1088} 1087}
1089 1088
1089/*
1090 * We account for memory if it's a private writeable mapping,
1091 * not hugepages and VM_NORESERVE wasn't set.
1092 */
1093static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
1094{
1095 /*
1096 * hugetlb has its own accounting separate from the core VM
1097 * VM_HUGETLB may not be set yet so we cannot check for that flag.
1098 */
1099 if (file && is_file_hugepages(file))
1100 return 0;
1101
1102 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
1103}
1104
1090unsigned long mmap_region(struct file *file, unsigned long addr, 1105unsigned long mmap_region(struct file *file, unsigned long addr,
1091 unsigned long len, unsigned long flags, 1106 unsigned long len, unsigned long flags,
1092 unsigned int vm_flags, unsigned long pgoff, 1107 unsigned int vm_flags, unsigned long pgoff)
1093 int accountable)
1094{ 1108{
1095 struct mm_struct *mm = current->mm; 1109 struct mm_struct *mm = current->mm;
1096 struct vm_area_struct *vma, *prev; 1110 struct vm_area_struct *vma, *prev;
@@ -1114,38 +1128,38 @@ munmap_back:
1114 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 1128 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1115 return -ENOMEM; 1129 return -ENOMEM;
1116 1130
1117 if (flags & MAP_NORESERVE) 1131 /*
1118 vm_flags |= VM_NORESERVE; 1132 * Set 'VM_NORESERVE' if we should not account for the
1133 * memory use of this mapping.
1134 */
1135 if ((flags & MAP_NORESERVE)) {
1136 /* We honor MAP_NORESERVE if allowed to overcommit */
1137 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1138 vm_flags |= VM_NORESERVE;
1119 1139
1120 if (accountable && (!(flags & MAP_NORESERVE) || 1140 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1121 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { 1141 if (file && is_file_hugepages(file))
1122 if (vm_flags & VM_SHARED) { 1142 vm_flags |= VM_NORESERVE;
1123 /* Check memory availability in shmem_file_setup? */
1124 vm_flags |= VM_ACCOUNT;
1125 } else if (vm_flags & VM_WRITE) {
1126 /*
1127 * Private writable mapping: check memory availability
1128 */
1129 charged = len >> PAGE_SHIFT;
1130 if (security_vm_enough_memory(charged))
1131 return -ENOMEM;
1132 vm_flags |= VM_ACCOUNT;
1133 }
1134 } 1143 }
1135 1144
1136 /* 1145 /*
1137 * Can we just expand an old private anonymous mapping? 1146 * Private writable mapping: check memory availability
1138 * The VM_SHARED test is necessary because shmem_zero_setup
1139 * will create the file object for a shared anonymous map below.
1140 */ 1147 */
1141 if (!file && !(vm_flags & VM_SHARED)) { 1148 if (accountable_mapping(file, vm_flags)) {
1142 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, 1149 charged = len >> PAGE_SHIFT;
1143 NULL, NULL, pgoff, NULL); 1150 if (security_vm_enough_memory(charged))
1144 if (vma) 1151 return -ENOMEM;
1145 goto out; 1152 vm_flags |= VM_ACCOUNT;
1146 } 1153 }
1147 1154
1148 /* 1155 /*
1156 * Can we just expand an old mapping?
1157 */
1158 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
1159 if (vma)
1160 goto out;
1161
1162 /*
1149 * Determine the object being mapped and call the appropriate 1163 * Determine the object being mapped and call the appropriate
1150 * specific mapper. the address has already been validated, but 1164 * specific mapper. the address has already been validated, but
1151 * not unmapped, but the maps are removed from the list. 1165 * not unmapped, but the maps are removed from the list.
@@ -1186,14 +1200,6 @@ munmap_back:
1186 goto free_vma; 1200 goto free_vma;
1187 } 1201 }
1188 1202
1189 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
1190 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
1191 * that memory reservation must be checked; but that reservation
1192 * belongs to shared memory object, not to vma: so now clear it.
1193 */
1194 if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
1195 vma->vm_flags &= ~VM_ACCOUNT;
1196
1197 /* Can addr have changed?? 1203 /* Can addr have changed??
1198 * 1204 *
1199 * Answer: Yes, several device drivers can do it in their 1205 * Answer: Yes, several device drivers can do it in their
@@ -1206,17 +1212,8 @@ munmap_back:
1206 if (vma_wants_writenotify(vma)) 1212 if (vma_wants_writenotify(vma))
1207 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1213 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1208 1214
1209 if (file && vma_merge(mm, prev, addr, vma->vm_end, 1215 vma_link(mm, vma, prev, rb_link, rb_parent);
1210 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { 1216 file = vma->vm_file;
1211 mpol_put(vma_policy(vma));
1212 kmem_cache_free(vm_area_cachep, vma);
1213 fput(file);
1214 if (vm_flags & VM_EXECUTABLE)
1215 removed_exe_file_vma(mm);
1216 } else {
1217 vma_link(mm, vma, prev, rb_link, rb_parent);
1218 file = vma->vm_file;
1219 }
1220 1217
1221 /* Once vma denies write, undo our temporary denial count */ 1218 /* Once vma denies write, undo our temporary denial count */
1222 if (correct_wcount) 1219 if (correct_wcount)
@@ -1948,7 +1945,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1948 1945
1949EXPORT_SYMBOL(do_munmap); 1946EXPORT_SYMBOL(do_munmap);
1950 1947
1951asmlinkage long sys_munmap(unsigned long addr, size_t len) 1948SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
1952{ 1949{
1953 int ret; 1950 int ret;
1954 struct mm_struct *mm = current->mm; 1951 struct mm_struct *mm = current->mm;
@@ -2087,12 +2084,8 @@ void exit_mmap(struct mm_struct *mm)
2087 unsigned long end; 2084 unsigned long end;
2088 2085
2089 /* mm's last user has gone, and its about to be pulled down */ 2086 /* mm's last user has gone, and its about to be pulled down */
2090 arch_exit_mmap(mm);
2091 mmu_notifier_release(mm); 2087 mmu_notifier_release(mm);
2092 2088
2093 if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */
2094 return;
2095
2096 if (mm->locked_vm) { 2089 if (mm->locked_vm) {
2097 vma = mm->mmap; 2090 vma = mm->mmap;
2098 while (vma) { 2091 while (vma) {
@@ -2101,7 +2094,13 @@ void exit_mmap(struct mm_struct *mm)
2101 vma = vma->vm_next; 2094 vma = vma->vm_next;
2102 } 2095 }
2103 } 2096 }
2097
2098 arch_exit_mmap(mm);
2099
2104 vma = mm->mmap; 2100 vma = mm->mmap;
2101 if (!vma) /* Can happen if dup_mmap() received an OOM */
2102 return;
2103
2105 lru_add_drain(); 2104 lru_add_drain();
2106 flush_cache_mm(mm); 2105 flush_cache_mm(mm);
2107 tlb = tlb_gather_mmu(mm, 1); 2106 tlb = tlb_gather_mmu(mm, 1);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index d0f6e7ce09f1..258197b76fb4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
151 /* 151 /*
152 * If we make a private mapping writable we increase our commit; 152 * If we make a private mapping writable we increase our commit;
153 * but (without finer accounting) cannot reduce our commit if we 153 * but (without finer accounting) cannot reduce our commit if we
154 * make it unwritable again. 154 * make it unwritable again. hugetlb mapping were accounted for
155 * even if read-only so there is no need to account for them here
155 */ 156 */
156 if (newflags & VM_WRITE) { 157 if (newflags & VM_WRITE) {
157 if (!(oldflags & (VM_ACCOUNT|VM_WRITE| 158 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
158 VM_SHARED|VM_NORESERVE))) { 159 VM_SHARED|VM_NORESERVE))) {
159 charged = nrpages; 160 charged = nrpages;
160 if (security_vm_enough_memory(charged)) 161 if (security_vm_enough_memory(charged))
@@ -217,8 +218,8 @@ fail:
217 return error; 218 return error;
218} 219}
219 220
220asmlinkage long 221SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
221sys_mprotect(unsigned long start, size_t len, unsigned long prot) 222 unsigned long, prot)
222{ 223{
223 unsigned long vm_flags, nstart, end, tmp, reqprot; 224 unsigned long vm_flags, nstart, end, tmp, reqprot;
224 struct vm_area_struct *vma, *prev; 225 struct vm_area_struct *vma, *prev;
diff --git a/mm/mremap.c b/mm/mremap.c
index 646de959aa58..a39b7b91be46 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -420,9 +420,9 @@ out_nc:
420 return ret; 420 return ret;
421} 421}
422 422
423asmlinkage unsigned long sys_mremap(unsigned long addr, 423SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
424 unsigned long old_len, unsigned long new_len, 424 unsigned long, new_len, unsigned long, flags,
425 unsigned long flags, unsigned long new_addr) 425 unsigned long, new_addr)
426{ 426{
427 unsigned long ret; 427 unsigned long ret;
428 428
diff --git a/mm/msync.c b/mm/msync.c
index 07dae08cf31c..4083209b7f02 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -28,7 +28,7 @@
28 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to 28 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
29 * applications. 29 * applications.
30 */ 30 */
31asmlinkage long sys_msync(unsigned long start, size_t len, int flags) 31SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
32{ 32{
33 unsigned long end; 33 unsigned long end;
34 struct mm_struct *mm = current->mm; 34 struct mm_struct *mm = current->mm;
diff --git a/mm/nommu.c b/mm/nommu.c
index 60ed8375c986..2fcf47d449b4 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,7 +10,7 @@
10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> 10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> 11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> 12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
13 * Copyright (c) 2007-2008 Paul Mundt <lethal@linux-sh.org> 13 * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org>
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/module.h>
@@ -394,6 +394,24 @@ void vunmap(const void *addr)
394} 394}
395EXPORT_SYMBOL(vunmap); 395EXPORT_SYMBOL(vunmap);
396 396
397void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
398{
399 BUG();
400 return NULL;
401}
402EXPORT_SYMBOL(vm_map_ram);
403
404void vm_unmap_ram(const void *mem, unsigned int count)
405{
406 BUG();
407}
408EXPORT_SYMBOL(vm_unmap_ram);
409
410void vm_unmap_aliases(void)
411{
412}
413EXPORT_SYMBOL_GPL(vm_unmap_aliases);
414
397/* 415/*
398 * Implement a stub for vmalloc_sync_all() if the architecture chose not to 416 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
399 * have one. 417 * have one.
@@ -416,7 +434,7 @@ EXPORT_SYMBOL(vm_insert_page);
416 * to a regular file. in this case, the unmapping will need 434 * to a regular file. in this case, the unmapping will need
417 * to invoke file system routines that need the global lock. 435 * to invoke file system routines that need the global lock.
418 */ 436 */
419asmlinkage unsigned long sys_brk(unsigned long brk) 437SYSCALL_DEFINE1(brk, unsigned long, brk)
420{ 438{
421 struct mm_struct *mm = current->mm; 439 struct mm_struct *mm = current->mm;
422 440
@@ -1143,8 +1161,8 @@ error_free:
1143 return ret; 1161 return ret;
1144 1162
1145enomem: 1163enomem:
1146 printk("Allocation of length %lu from process %d failed\n", 1164 printk("Allocation of length %lu from process %d (%s) failed\n",
1147 len, current->pid); 1165 len, current->pid, current->comm);
1148 show_free_areas(); 1166 show_free_areas();
1149 return -ENOMEM; 1167 return -ENOMEM;
1150} 1168}
@@ -1573,7 +1591,7 @@ erase_whole_vma:
1573} 1591}
1574EXPORT_SYMBOL(do_munmap); 1592EXPORT_SYMBOL(do_munmap);
1575 1593
1576asmlinkage long sys_munmap(unsigned long addr, size_t len) 1594SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
1577{ 1595{
1578 int ret; 1596 int ret;
1579 struct mm_struct *mm = current->mm; 1597 struct mm_struct *mm = current->mm;
@@ -1657,10 +1675,9 @@ unsigned long do_mremap(unsigned long addr,
1657} 1675}
1658EXPORT_SYMBOL(do_mremap); 1676EXPORT_SYMBOL(do_mremap);
1659 1677
1660asmlinkage 1678SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1661unsigned long sys_mremap(unsigned long addr, 1679 unsigned long, new_len, unsigned long, flags,
1662 unsigned long old_len, unsigned long new_len, 1680 unsigned long, new_addr)
1663 unsigned long flags, unsigned long new_addr)
1664{ 1681{
1665 unsigned long ret; 1682 unsigned long ret;
1666 1683
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b493db7841dc..74dc57c74349 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -209,7 +209,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
209 struct file *filp, void __user *buffer, size_t *lenp, 209 struct file *filp, void __user *buffer, size_t *lenp,
210 loff_t *ppos) 210 loff_t *ppos)
211{ 211{
212 int old_bytes = vm_dirty_bytes; 212 unsigned long old_bytes = vm_dirty_bytes;
213 int ret; 213 int ret;
214 214
215 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 215 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
@@ -240,7 +240,7 @@ void bdi_writeout_inc(struct backing_dev_info *bdi)
240} 240}
241EXPORT_SYMBOL_GPL(bdi_writeout_inc); 241EXPORT_SYMBOL_GPL(bdi_writeout_inc);
242 242
243static inline void task_dirty_inc(struct task_struct *tsk) 243void task_dirty_inc(struct task_struct *tsk)
244{ 244{
245 prop_inc_single(&vm_dirties, &tsk->dirties); 245 prop_inc_single(&vm_dirties, &tsk->dirties);
246} 246}
@@ -1051,13 +1051,25 @@ continue_unlock:
1051 } 1051 }
1052 } 1052 }
1053 1053
1054 if (wbc->sync_mode == WB_SYNC_NONE) { 1054 if (nr_to_write > 0) {
1055 wbc->nr_to_write--; 1055 nr_to_write--;
1056 if (wbc->nr_to_write <= 0) { 1056 if (nr_to_write == 0 &&
1057 wbc->sync_mode == WB_SYNC_NONE) {
1058 /*
1059 * We stop writing back only if we are
1060 * not doing integrity sync. In case of
1061 * integrity sync we have to keep going
1062 * because someone may be concurrently
1063 * dirtying pages, and we might have
1064 * synced a lot of newly appeared dirty
1065 * pages, but have not synced all of the
1066 * old dirty pages.
1067 */
1057 done = 1; 1068 done = 1;
1058 break; 1069 break;
1059 } 1070 }
1060 } 1071 }
1072
1061 if (wbc->nonblocking && bdi_write_congested(bdi)) { 1073 if (wbc->nonblocking && bdi_write_congested(bdi)) {
1062 wbc->encountered_congestion = 1; 1074 wbc->encountered_congestion = 1;
1063 done = 1; 1075 done = 1;
@@ -1067,7 +1079,7 @@ continue_unlock:
1067 pagevec_release(&pvec); 1079 pagevec_release(&pvec);
1068 cond_resched(); 1080 cond_resched();
1069 } 1081 }
1070 if (!cycled) { 1082 if (!cycled && !done) {
1071 /* 1083 /*
1072 * range_cyclic: 1084 * range_cyclic:
1073 * We hit the last page and there is more work to be done: wrap 1085 * We hit the last page and there is more work to be done: wrap
@@ -1218,6 +1230,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1218 __inc_zone_page_state(page, NR_FILE_DIRTY); 1230 __inc_zone_page_state(page, NR_FILE_DIRTY);
1219 __inc_bdi_stat(mapping->backing_dev_info, 1231 __inc_bdi_stat(mapping->backing_dev_info,
1220 BDI_RECLAIMABLE); 1232 BDI_RECLAIMABLE);
1233 task_dirty_inc(current);
1221 task_io_account_write(PAGE_CACHE_SIZE); 1234 task_io_account_write(PAGE_CACHE_SIZE);
1222 } 1235 }
1223 radix_tree_tag_set(&mapping->page_tree, 1236 radix_tree_tag_set(&mapping->page_tree,
@@ -1250,7 +1263,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage);
1250 * If the mapping doesn't provide a set_page_dirty a_op, then 1263 * If the mapping doesn't provide a set_page_dirty a_op, then
1251 * just fall through and assume that it wants buffer_heads. 1264 * just fall through and assume that it wants buffer_heads.
1252 */ 1265 */
1253static int __set_page_dirty(struct page *page) 1266int set_page_dirty(struct page *page)
1254{ 1267{
1255 struct address_space *mapping = page_mapping(page); 1268 struct address_space *mapping = page_mapping(page);
1256 1269
@@ -1268,14 +1281,6 @@ static int __set_page_dirty(struct page *page)
1268 } 1281 }
1269 return 0; 1282 return 0;
1270} 1283}
1271
1272int set_page_dirty(struct page *page)
1273{
1274 int ret = __set_page_dirty(page);
1275 if (ret)
1276 task_dirty_inc(current);
1277 return ret;
1278}
1279EXPORT_SYMBOL(set_page_dirty); 1284EXPORT_SYMBOL(set_page_dirty);
1280 1285
1281/* 1286/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5675b3073854..5c44ed49ca93 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2989,7 +2989,7 @@ static int __meminit next_active_region_index_in_nid(int index, int nid)
2989 * was used and there are no special requirements, this is a convenient 2989 * was used and there are no special requirements, this is a convenient
2990 * alternative 2990 * alternative
2991 */ 2991 */
2992int __meminit early_pfn_to_nid(unsigned long pfn) 2992int __meminit __early_pfn_to_nid(unsigned long pfn)
2993{ 2993{
2994 int i; 2994 int i;
2995 2995
@@ -3000,10 +3000,33 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
3000 if (start_pfn <= pfn && pfn < end_pfn) 3000 if (start_pfn <= pfn && pfn < end_pfn)
3001 return early_node_map[i].nid; 3001 return early_node_map[i].nid;
3002 } 3002 }
3003 /* This is a memory hole */
3004 return -1;
3005}
3006#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
3007
3008int __meminit early_pfn_to_nid(unsigned long pfn)
3009{
3010 int nid;
3003 3011
3012 nid = __early_pfn_to_nid(pfn);
3013 if (nid >= 0)
3014 return nid;
3015 /* just returns 0 */
3004 return 0; 3016 return 0;
3005} 3017}
3006#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 3018
3019#ifdef CONFIG_NODES_SPAN_OTHER_NODES
3020bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
3021{
3022 int nid;
3023
3024 nid = __early_pfn_to_nid(pfn);
3025 if (nid >= 0 && nid != node)
3026 return false;
3027 return true;
3028}
3029#endif
3007 3030
3008/* Basic iterator support to walk early_node_map[] */ 3031/* Basic iterator support to walk early_node_map[] */
3009#define for_each_active_range_index_in_nid(i, nid) \ 3032#define for_each_active_range_index_in_nid(i, nid) \
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 7006a11350c8..ceecfbb143fa 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -114,7 +114,8 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
114 nid = page_to_nid(pfn_to_page(pfn)); 114 nid = page_to_nid(pfn_to_page(pfn));
115 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 115 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
116 if (slab_is_available()) { 116 if (slab_is_available()) {
117 base = kmalloc_node(table_size, GFP_KERNEL, nid); 117 base = kmalloc_node(table_size,
118 GFP_KERNEL | __GFP_NOWARN, nid);
118 if (!base) 119 if (!base)
119 base = vmalloc_node(table_size, nid); 120 base = vmalloc_node(table_size, nid);
120 } else { 121 } else {
diff --git a/mm/page_io.c b/mm/page_io.c
index dc6ce0afbded..3023c475e041 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -111,7 +111,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
111 goto out; 111 goto out;
112 } 112 }
113 if (wbc->sync_mode == WB_SYNC_ALL) 113 if (wbc->sync_mode == WB_SYNC_ALL)
114 rw |= (1 << BIO_RW_SYNC); 114 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
115 count_vm_event(PSWPOUT); 115 count_vm_event(PSWPOUT);
116 set_page_writeback(page); 116 set_page_writeback(page);
117 unlock_page(page); 117 unlock_page(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index ac4af8cffbf9..16521664010d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1072,7 +1072,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1072 spin_lock(&mapping->i_mmap_lock); 1072 spin_lock(&mapping->i_mmap_lock);
1073 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1073 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1074 if (MLOCK_PAGES && unlikely(unlock)) { 1074 if (MLOCK_PAGES && unlikely(unlock)) {
1075 if (!(vma->vm_flags & VM_LOCKED)) 1075 if (!((vma->vm_flags & VM_LOCKED) &&
1076 page_mapped_in_vma(page, vma)))
1076 continue; /* must visit all vmas */ 1077 continue; /* must visit all vmas */
1077 ret = SWAP_MLOCK; 1078 ret = SWAP_MLOCK;
1078 } else { 1079 } else {
diff --git a/mm/shmem.c b/mm/shmem.c
index 5d0de96c9789..4103a239ce84 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -169,13 +169,13 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
169 */ 169 */
170static inline int shmem_acct_size(unsigned long flags, loff_t size) 170static inline int shmem_acct_size(unsigned long flags, loff_t size)
171{ 171{
172 return (flags & VM_ACCOUNT) ? 172 return (flags & VM_NORESERVE) ?
173 security_vm_enough_memory_kern(VM_ACCT(size)) : 0; 173 0 : security_vm_enough_memory_kern(VM_ACCT(size));
174} 174}
175 175
176static inline void shmem_unacct_size(unsigned long flags, loff_t size) 176static inline void shmem_unacct_size(unsigned long flags, loff_t size)
177{ 177{
178 if (flags & VM_ACCOUNT) 178 if (!(flags & VM_NORESERVE))
179 vm_unacct_memory(VM_ACCT(size)); 179 vm_unacct_memory(VM_ACCT(size));
180} 180}
181 181
@@ -187,13 +187,13 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
187 */ 187 */
188static inline int shmem_acct_block(unsigned long flags) 188static inline int shmem_acct_block(unsigned long flags)
189{ 189{
190 return (flags & VM_ACCOUNT) ? 190 return (flags & VM_NORESERVE) ?
191 0 : security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)); 191 security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0;
192} 192}
193 193
194static inline void shmem_unacct_blocks(unsigned long flags, long pages) 194static inline void shmem_unacct_blocks(unsigned long flags, long pages)
195{ 195{
196 if (!(flags & VM_ACCOUNT)) 196 if (flags & VM_NORESERVE)
197 vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE)); 197 vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
198} 198}
199 199
@@ -1515,8 +1515,8 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1515 return 0; 1515 return 0;
1516} 1516}
1517 1517
1518static struct inode * 1518static struct inode *shmem_get_inode(struct super_block *sb, int mode,
1519shmem_get_inode(struct super_block *sb, int mode, dev_t dev) 1519 dev_t dev, unsigned long flags)
1520{ 1520{
1521 struct inode *inode; 1521 struct inode *inode;
1522 struct shmem_inode_info *info; 1522 struct shmem_inode_info *info;
@@ -1537,6 +1537,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1537 info = SHMEM_I(inode); 1537 info = SHMEM_I(inode);
1538 memset(info, 0, (char *)inode - (char *)info); 1538 memset(info, 0, (char *)inode - (char *)info);
1539 spin_lock_init(&info->lock); 1539 spin_lock_init(&info->lock);
1540 info->flags = flags & VM_NORESERVE;
1540 INIT_LIST_HEAD(&info->swaplist); 1541 INIT_LIST_HEAD(&info->swaplist);
1541 1542
1542 switch (mode & S_IFMT) { 1543 switch (mode & S_IFMT) {
@@ -1779,9 +1780,10 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1779static int 1780static int
1780shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) 1781shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1781{ 1782{
1782 struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev); 1783 struct inode *inode;
1783 int error = -ENOSPC; 1784 int error = -ENOSPC;
1784 1785
1786 inode = shmem_get_inode(dir->i_sb, mode, dev, VM_NORESERVE);
1785 if (inode) { 1787 if (inode) {
1786 error = security_inode_init_security(inode, dir, NULL, NULL, 1788 error = security_inode_init_security(inode, dir, NULL, NULL,
1787 NULL); 1789 NULL);
@@ -1920,7 +1922,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1920 if (len > PAGE_CACHE_SIZE) 1922 if (len > PAGE_CACHE_SIZE)
1921 return -ENAMETOOLONG; 1923 return -ENAMETOOLONG;
1922 1924
1923 inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); 1925 inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
1924 if (!inode) 1926 if (!inode)
1925 return -ENOSPC; 1927 return -ENOSPC;
1926 1928
@@ -2332,7 +2334,7 @@ static int shmem_fill_super(struct super_block *sb,
2332 sb->s_flags |= MS_POSIXACL; 2334 sb->s_flags |= MS_POSIXACL;
2333#endif 2335#endif
2334 2336
2335 inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0); 2337 inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
2336 if (!inode) 2338 if (!inode)
2337 goto failed; 2339 goto failed;
2338 inode->i_uid = sbinfo->uid; 2340 inode->i_uid = sbinfo->uid;
@@ -2574,12 +2576,12 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
2574 return 0; 2576 return 0;
2575} 2577}
2576 2578
2577#define shmem_file_operations ramfs_file_operations 2579#define shmem_vm_ops generic_file_vm_ops
2578#define shmem_vm_ops generic_file_vm_ops 2580#define shmem_file_operations ramfs_file_operations
2579#define shmem_get_inode ramfs_get_inode 2581#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev)
2580#define shmem_acct_size(a, b) 0 2582#define shmem_acct_size(flags, size) 0
2581#define shmem_unacct_size(a, b) do {} while (0) 2583#define shmem_unacct_size(flags, size) do {} while (0)
2582#define SHMEM_MAX_BYTES LLONG_MAX 2584#define SHMEM_MAX_BYTES LLONG_MAX
2583 2585
2584#endif /* CONFIG_SHMEM */ 2586#endif /* CONFIG_SHMEM */
2585 2587
@@ -2589,7 +2591,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
2589 * shmem_file_setup - get an unlinked file living in tmpfs 2591 * shmem_file_setup - get an unlinked file living in tmpfs
2590 * @name: name for dentry (to be seen in /proc/<pid>/maps 2592 * @name: name for dentry (to be seen in /proc/<pid>/maps
2591 * @size: size to be set for the file 2593 * @size: size to be set for the file
2592 * @flags: vm_flags 2594 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
2593 */ 2595 */
2594struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) 2596struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2595{ 2597{
@@ -2623,13 +2625,10 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2623 goto put_dentry; 2625 goto put_dentry;
2624 2626
2625 error = -ENOSPC; 2627 error = -ENOSPC;
2626 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); 2628 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags);
2627 if (!inode) 2629 if (!inode)
2628 goto close_file; 2630 goto close_file;
2629 2631
2630#ifdef CONFIG_SHMEM
2631 SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
2632#endif
2633 d_instantiate(dentry, inode); 2632 d_instantiate(dentry, inode);
2634 inode->i_size = size; 2633 inode->i_size = size;
2635 inode->i_nlink = 0; /* It is unlinked */ 2634 inode->i_nlink = 0; /* It is unlinked */
diff --git a/mm/slab.c b/mm/slab.c
index ddc41f337d58..4d00855629c4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4457,3 +4457,4 @@ size_t ksize(const void *objp)
4457 4457
4458 return obj_size(virt_to_cache(objp)); 4458 return obj_size(virt_to_cache(objp));
4459} 4459}
4460EXPORT_SYMBOL(ksize);
diff --git a/mm/slob.c b/mm/slob.c
index bf7e8fc3aed8..0bfa680a8981 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -126,9 +126,9 @@ static LIST_HEAD(free_slob_medium);
126static LIST_HEAD(free_slob_large); 126static LIST_HEAD(free_slob_large);
127 127
128/* 128/*
129 * slob_page: True for all slob pages (false for bigblock pages) 129 * is_slob_page: True for all slob pages (false for bigblock pages)
130 */ 130 */
131static inline int slob_page(struct slob_page *sp) 131static inline int is_slob_page(struct slob_page *sp)
132{ 132{
133 return PageSlobPage((struct page *)sp); 133 return PageSlobPage((struct page *)sp);
134} 134}
@@ -143,6 +143,11 @@ static inline void clear_slob_page(struct slob_page *sp)
143 __ClearPageSlobPage((struct page *)sp); 143 __ClearPageSlobPage((struct page *)sp);
144} 144}
145 145
146static inline struct slob_page *slob_page(const void *addr)
147{
148 return (struct slob_page *)virt_to_page(addr);
149}
150
146/* 151/*
147 * slob_page_free: true for pages on free_slob_pages list. 152 * slob_page_free: true for pages on free_slob_pages list.
148 */ 153 */
@@ -230,7 +235,7 @@ static int slob_last(slob_t *s)
230 return !((unsigned long)slob_next(s) & ~PAGE_MASK); 235 return !((unsigned long)slob_next(s) & ~PAGE_MASK);
231} 236}
232 237
233static void *slob_new_page(gfp_t gfp, int order, int node) 238static void *slob_new_pages(gfp_t gfp, int order, int node)
234{ 239{
235 void *page; 240 void *page;
236 241
@@ -247,12 +252,17 @@ static void *slob_new_page(gfp_t gfp, int order, int node)
247 return page_address(page); 252 return page_address(page);
248} 253}
249 254
255static void slob_free_pages(void *b, int order)
256{
257 free_pages((unsigned long)b, order);
258}
259
250/* 260/*
251 * Allocate a slob block within a given slob_page sp. 261 * Allocate a slob block within a given slob_page sp.
252 */ 262 */
253static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) 263static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
254{ 264{
255 slob_t *prev, *cur, *aligned = 0; 265 slob_t *prev, *cur, *aligned = NULL;
256 int delta = 0, units = SLOB_UNITS(size); 266 int delta = 0, units = SLOB_UNITS(size);
257 267
258 for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { 268 for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) {
@@ -349,10 +359,10 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
349 359
350 /* Not enough space: must allocate a new page */ 360 /* Not enough space: must allocate a new page */
351 if (!b) { 361 if (!b) {
352 b = slob_new_page(gfp & ~__GFP_ZERO, 0, node); 362 b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
353 if (!b) 363 if (!b)
354 return 0; 364 return NULL;
355 sp = (struct slob_page *)virt_to_page(b); 365 sp = slob_page(b);
356 set_slob_page(sp); 366 set_slob_page(sp);
357 367
358 spin_lock_irqsave(&slob_lock, flags); 368 spin_lock_irqsave(&slob_lock, flags);
@@ -384,7 +394,7 @@ static void slob_free(void *block, int size)
384 return; 394 return;
385 BUG_ON(!size); 395 BUG_ON(!size);
386 396
387 sp = (struct slob_page *)virt_to_page(block); 397 sp = slob_page(block);
388 units = SLOB_UNITS(size); 398 units = SLOB_UNITS(size);
389 399
390 spin_lock_irqsave(&slob_lock, flags); 400 spin_lock_irqsave(&slob_lock, flags);
@@ -393,10 +403,11 @@ static void slob_free(void *block, int size)
393 /* Go directly to page allocator. Do not pass slob allocator */ 403 /* Go directly to page allocator. Do not pass slob allocator */
394 if (slob_page_free(sp)) 404 if (slob_page_free(sp))
395 clear_slob_page_free(sp); 405 clear_slob_page_free(sp);
406 spin_unlock_irqrestore(&slob_lock, flags);
396 clear_slob_page(sp); 407 clear_slob_page(sp);
397 free_slob_page(sp); 408 free_slob_page(sp);
398 free_page((unsigned long)b); 409 free_page((unsigned long)b);
399 goto out; 410 return;
400 } 411 }
401 412
402 if (!slob_page_free(sp)) { 413 if (!slob_page_free(sp)) {
@@ -476,7 +487,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
476 } else { 487 } else {
477 void *ret; 488 void *ret;
478 489
479 ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); 490 ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node);
480 if (ret) { 491 if (ret) {
481 struct page *page; 492 struct page *page;
482 page = virt_to_page(ret); 493 page = virt_to_page(ret);
@@ -494,8 +505,8 @@ void kfree(const void *block)
494 if (unlikely(ZERO_OR_NULL_PTR(block))) 505 if (unlikely(ZERO_OR_NULL_PTR(block)))
495 return; 506 return;
496 507
497 sp = (struct slob_page *)virt_to_page(block); 508 sp = slob_page(block);
498 if (slob_page(sp)) { 509 if (is_slob_page(sp)) {
499 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 510 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
500 unsigned int *m = (unsigned int *)(block - align); 511 unsigned int *m = (unsigned int *)(block - align);
501 slob_free(m, *m + align); 512 slob_free(m, *m + align);
@@ -513,14 +524,15 @@ size_t ksize(const void *block)
513 if (unlikely(block == ZERO_SIZE_PTR)) 524 if (unlikely(block == ZERO_SIZE_PTR))
514 return 0; 525 return 0;
515 526
516 sp = (struct slob_page *)virt_to_page(block); 527 sp = slob_page(block);
517 if (slob_page(sp)) { 528 if (is_slob_page(sp)) {
518 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 529 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
519 unsigned int *m = (unsigned int *)(block - align); 530 unsigned int *m = (unsigned int *)(block - align);
520 return SLOB_UNITS(*m) * SLOB_UNIT; 531 return SLOB_UNITS(*m) * SLOB_UNIT;
521 } else 532 } else
522 return sp->page.private; 533 return sp->page.private;
523} 534}
535EXPORT_SYMBOL(ksize);
524 536
525struct kmem_cache { 537struct kmem_cache {
526 unsigned int size, align; 538 unsigned int size, align;
@@ -572,7 +584,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
572 if (c->size < PAGE_SIZE) 584 if (c->size < PAGE_SIZE)
573 b = slob_alloc(c->size, flags, c->align, node); 585 b = slob_alloc(c->size, flags, c->align, node);
574 else 586 else
575 b = slob_new_page(flags, get_order(c->size), node); 587 b = slob_new_pages(flags, get_order(c->size), node);
576 588
577 if (c->ctor) 589 if (c->ctor)
578 c->ctor(b); 590 c->ctor(b);
@@ -586,7 +598,7 @@ static void __kmem_cache_free(void *b, int size)
586 if (size < PAGE_SIZE) 598 if (size < PAGE_SIZE)
587 slob_free(b, size); 599 slob_free(b, size);
588 else 600 else
589 free_pages((unsigned long)b, get_order(size)); 601 slob_free_pages(b, get_order(size));
590} 602}
591 603
592static void kmem_rcu_free(struct rcu_head *head) 604static void kmem_rcu_free(struct rcu_head *head)
diff --git a/mm/slub.c b/mm/slub.c
index e150b5c0424f..c65a4edafc33 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1329,7 +1329,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1329 n = get_node(s, zone_to_nid(zone)); 1329 n = get_node(s, zone_to_nid(zone));
1330 1330
1331 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1331 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1332 n->nr_partial > n->min_partial) { 1332 n->nr_partial > s->min_partial) {
1333 page = get_partial_node(n); 1333 page = get_partial_node(n);
1334 if (page) 1334 if (page)
1335 return page; 1335 return page;
@@ -1381,7 +1381,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1381 slab_unlock(page); 1381 slab_unlock(page);
1382 } else { 1382 } else {
1383 stat(c, DEACTIVATE_EMPTY); 1383 stat(c, DEACTIVATE_EMPTY);
1384 if (n->nr_partial < n->min_partial) { 1384 if (n->nr_partial < s->min_partial) {
1385 /* 1385 /*
1386 * Adding an empty slab to the partial slabs in order 1386 * Adding an empty slab to the partial slabs in order
1387 * to avoid page allocator overhead. This slab needs 1387 * to avoid page allocator overhead. This slab needs
@@ -1838,6 +1838,7 @@ static inline int calculate_order(int size)
1838 int order; 1838 int order;
1839 int min_objects; 1839 int min_objects;
1840 int fraction; 1840 int fraction;
1841 int max_objects;
1841 1842
1842 /* 1843 /*
1843 * Attempt to find best configuration for a slab. This 1844 * Attempt to find best configuration for a slab. This
@@ -1850,6 +1851,9 @@ static inline int calculate_order(int size)
1850 min_objects = slub_min_objects; 1851 min_objects = slub_min_objects;
1851 if (!min_objects) 1852 if (!min_objects)
1852 min_objects = 4 * (fls(nr_cpu_ids) + 1); 1853 min_objects = 4 * (fls(nr_cpu_ids) + 1);
1854 max_objects = (PAGE_SIZE << slub_max_order)/size;
1855 min_objects = min(min_objects, max_objects);
1856
1853 while (min_objects > 1) { 1857 while (min_objects > 1) {
1854 fraction = 16; 1858 fraction = 16;
1855 while (fraction >= 4) { 1859 while (fraction >= 4) {
@@ -1859,7 +1863,7 @@ static inline int calculate_order(int size)
1859 return order; 1863 return order;
1860 fraction /= 2; 1864 fraction /= 2;
1861 } 1865 }
1862 min_objects /= 2; 1866 min_objects --;
1863 } 1867 }
1864 1868
1865 /* 1869 /*
@@ -1922,17 +1926,6 @@ static void
1922init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) 1926init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
1923{ 1927{
1924 n->nr_partial = 0; 1928 n->nr_partial = 0;
1925
1926 /*
1927 * The larger the object size is, the more pages we want on the partial
1928 * list to avoid pounding the page allocator excessively.
1929 */
1930 n->min_partial = ilog2(s->size);
1931 if (n->min_partial < MIN_PARTIAL)
1932 n->min_partial = MIN_PARTIAL;
1933 else if (n->min_partial > MAX_PARTIAL)
1934 n->min_partial = MAX_PARTIAL;
1935
1936 spin_lock_init(&n->list_lock); 1929 spin_lock_init(&n->list_lock);
1937 INIT_LIST_HEAD(&n->partial); 1930 INIT_LIST_HEAD(&n->partial);
1938#ifdef CONFIG_SLUB_DEBUG 1931#ifdef CONFIG_SLUB_DEBUG
@@ -1990,7 +1983,7 @@ static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1990static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) 1983static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
1991{ 1984{
1992 if (c < per_cpu(kmem_cache_cpu, cpu) || 1985 if (c < per_cpu(kmem_cache_cpu, cpu) ||
1993 c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { 1986 c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
1994 kfree(c); 1987 kfree(c);
1995 return; 1988 return;
1996 } 1989 }
@@ -2175,6 +2168,15 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2175} 2168}
2176#endif 2169#endif
2177 2170
2171static void set_min_partial(struct kmem_cache *s, unsigned long min)
2172{
2173 if (min < MIN_PARTIAL)
2174 min = MIN_PARTIAL;
2175 else if (min > MAX_PARTIAL)
2176 min = MAX_PARTIAL;
2177 s->min_partial = min;
2178}
2179
2178/* 2180/*
2179 * calculate_sizes() determines the order and the distribution of data within 2181 * calculate_sizes() determines the order and the distribution of data within
2180 * a slab object. 2182 * a slab object.
@@ -2313,6 +2315,11 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2313 if (!calculate_sizes(s, -1)) 2315 if (!calculate_sizes(s, -1))
2314 goto error; 2316 goto error;
2315 2317
2318 /*
2319 * The larger the object size is, the more pages we want on the partial
2320 * list to avoid pounding the page allocator excessively.
2321 */
2322 set_min_partial(s, ilog2(s->size));
2316 s->refcount = 1; 2323 s->refcount = 1;
2317#ifdef CONFIG_NUMA 2324#ifdef CONFIG_NUMA
2318 s->remote_node_defrag_ratio = 1000; 2325 s->remote_node_defrag_ratio = 1000;
@@ -2469,7 +2476,7 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2469 * Kmalloc subsystem 2476 * Kmalloc subsystem
2470 *******************************************************************/ 2477 *******************************************************************/
2471 2478
2472struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; 2479struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned;
2473EXPORT_SYMBOL(kmalloc_caches); 2480EXPORT_SYMBOL(kmalloc_caches);
2474 2481
2475static int __init setup_slub_min_order(char *str) 2482static int __init setup_slub_min_order(char *str)
@@ -2531,7 +2538,7 @@ panic:
2531} 2538}
2532 2539
2533#ifdef CONFIG_ZONE_DMA 2540#ifdef CONFIG_ZONE_DMA
2534static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; 2541static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
2535 2542
2536static void sysfs_add_func(struct work_struct *w) 2543static void sysfs_add_func(struct work_struct *w)
2537{ 2544{
@@ -2652,7 +2659,7 @@ void *__kmalloc(size_t size, gfp_t flags)
2652{ 2659{
2653 struct kmem_cache *s; 2660 struct kmem_cache *s;
2654 2661
2655 if (unlikely(size > PAGE_SIZE)) 2662 if (unlikely(size > SLUB_MAX_SIZE))
2656 return kmalloc_large(size, flags); 2663 return kmalloc_large(size, flags);
2657 2664
2658 s = get_slab(size, flags); 2665 s = get_slab(size, flags);
@@ -2680,7 +2687,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
2680{ 2687{
2681 struct kmem_cache *s; 2688 struct kmem_cache *s;
2682 2689
2683 if (unlikely(size > PAGE_SIZE)) 2690 if (unlikely(size > SLUB_MAX_SIZE))
2684 return kmalloc_large_node(size, flags, node); 2691 return kmalloc_large_node(size, flags, node);
2685 2692
2686 s = get_slab(size, flags); 2693 s = get_slab(size, flags);
@@ -2730,6 +2737,7 @@ size_t ksize(const void *object)
2730 */ 2737 */
2731 return s->size; 2738 return s->size;
2732} 2739}
2740EXPORT_SYMBOL(ksize);
2733 2741
2734void kfree(const void *x) 2742void kfree(const void *x)
2735{ 2743{
@@ -2979,7 +2987,7 @@ void __init kmem_cache_init(void)
2979 caches++; 2987 caches++;
2980 } 2988 }
2981 2989
2982 for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { 2990 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
2983 create_kmalloc_cache(&kmalloc_caches[i], 2991 create_kmalloc_cache(&kmalloc_caches[i],
2984 "kmalloc", 1 << i, GFP_KERNEL); 2992 "kmalloc", 1 << i, GFP_KERNEL);
2985 caches++; 2993 caches++;
@@ -3016,7 +3024,7 @@ void __init kmem_cache_init(void)
3016 slab_state = UP; 3024 slab_state = UP;
3017 3025
3018 /* Provide the correct kmalloc names now that the caches are up */ 3026 /* Provide the correct kmalloc names now that the caches are up */
3019 for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) 3027 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
3020 kmalloc_caches[i]. name = 3028 kmalloc_caches[i]. name =
3021 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 3029 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
3022 3030
@@ -3216,7 +3224,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3216{ 3224{
3217 struct kmem_cache *s; 3225 struct kmem_cache *s;
3218 3226
3219 if (unlikely(size > PAGE_SIZE)) 3227 if (unlikely(size > SLUB_MAX_SIZE))
3220 return kmalloc_large(size, gfpflags); 3228 return kmalloc_large(size, gfpflags);
3221 3229
3222 s = get_slab(size, gfpflags); 3230 s = get_slab(size, gfpflags);
@@ -3232,7 +3240,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3232{ 3240{
3233 struct kmem_cache *s; 3241 struct kmem_cache *s;
3234 3242
3235 if (unlikely(size > PAGE_SIZE)) 3243 if (unlikely(size > SLUB_MAX_SIZE))
3236 return kmalloc_large_node(size, gfpflags, node); 3244 return kmalloc_large_node(size, gfpflags, node);
3237 3245
3238 s = get_slab(size, gfpflags); 3246 s = get_slab(size, gfpflags);
@@ -3829,6 +3837,26 @@ static ssize_t order_show(struct kmem_cache *s, char *buf)
3829} 3837}
3830SLAB_ATTR(order); 3838SLAB_ATTR(order);
3831 3839
3840static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
3841{
3842 return sprintf(buf, "%lu\n", s->min_partial);
3843}
3844
3845static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
3846 size_t length)
3847{
3848 unsigned long min;
3849 int err;
3850
3851 err = strict_strtoul(buf, 10, &min);
3852 if (err)
3853 return err;
3854
3855 set_min_partial(s, min);
3856 return length;
3857}
3858SLAB_ATTR(min_partial);
3859
3832static ssize_t ctor_show(struct kmem_cache *s, char *buf) 3860static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3833{ 3861{
3834 if (s->ctor) { 3862 if (s->ctor) {
@@ -4144,6 +4172,7 @@ static struct attribute *slab_attrs[] = {
4144 &object_size_attr.attr, 4172 &object_size_attr.attr,
4145 &objs_per_slab_attr.attr, 4173 &objs_per_slab_attr.attr,
4146 &order_attr.attr, 4174 &order_attr.attr,
4175 &min_partial_attr.attr,
4147 &objects_attr.attr, 4176 &objects_attr.attr,
4148 &objects_partial_attr.attr, 4177 &objects_partial_attr.attr,
4149 &total_objects_attr.attr, 4178 &total_objects_attr.attr,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index da422c47e2ee..312fafe0ab6e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -635,7 +635,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
635 635
636 if (!bdev) { 636 if (!bdev) {
637 if (bdev_p) 637 if (bdev_p)
638 *bdev_p = sis->bdev; 638 *bdev_p = bdget(sis->bdev->bd_dev);
639 639
640 spin_unlock(&swap_lock); 640 spin_unlock(&swap_lock);
641 return i; 641 return i;
@@ -647,7 +647,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
647 struct swap_extent, list); 647 struct swap_extent, list);
648 if (se->start_block == offset) { 648 if (se->start_block == offset) {
649 if (bdev_p) 649 if (bdev_p)
650 *bdev_p = sis->bdev; 650 *bdev_p = bdget(sis->bdev->bd_dev);
651 651
652 spin_unlock(&swap_lock); 652 spin_unlock(&swap_lock);
653 bdput(bdev); 653 bdput(bdev);
@@ -698,8 +698,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
698 pte_t *pte; 698 pte_t *pte;
699 int ret = 1; 699 int ret = 1;
700 700
701 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) 701 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
702 ret = -ENOMEM; 702 ret = -ENOMEM;
703 goto out_nolock;
704 }
703 705
704 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 706 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
705 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 707 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
@@ -723,6 +725,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
723 activate_page(page); 725 activate_page(page);
724out: 726out:
725 pte_unmap_unlock(pte, ptl); 727 pte_unmap_unlock(pte, ptl);
728out_nolock:
726 return ret; 729 return ret;
727} 730}
728 731
@@ -1377,7 +1380,7 @@ out:
1377 return ret; 1380 return ret;
1378} 1381}
1379 1382
1380asmlinkage long sys_swapoff(const char __user * specialfile) 1383SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1381{ 1384{
1382 struct swap_info_struct * p = NULL; 1385 struct swap_info_struct * p = NULL;
1383 unsigned short *swap_map; 1386 unsigned short *swap_map;
@@ -1633,7 +1636,7 @@ late_initcall(max_swapfiles_check);
1633 * 1636 *
1634 * The swapon system call 1637 * The swapon system call
1635 */ 1638 */
1636asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) 1639SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1637{ 1640{
1638 struct swap_info_struct * p; 1641 struct swap_info_struct * p;
1639 char *name = NULL; 1642 char *name = NULL;
diff --git a/mm/util.c b/mm/util.c
index cb00b748ce47..37eaccdf3054 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -129,6 +129,26 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
129} 129}
130EXPORT_SYMBOL(krealloc); 130EXPORT_SYMBOL(krealloc);
131 131
132/**
133 * kzfree - like kfree but zero memory
134 * @p: object to free memory of
135 *
136 * The memory of the object @p points to is zeroed before freed.
137 * If @p is %NULL, kzfree() does nothing.
138 */
139void kzfree(const void *p)
140{
141 size_t ks;
142 void *mem = (void *)p;
143
144 if (unlikely(ZERO_OR_NULL_PTR(mem)))
145 return;
146 ks = ksize(mem);
147 memset(mem, 0, ks);
148 kfree(mem);
149}
150EXPORT_SYMBOL(kzfree);
151
132/* 152/*
133 * strndup_user - duplicate an existing string from user space 153 * strndup_user - duplicate an existing string from user space
134 * @s: The string to duplicate 154 * @s: The string to duplicate
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c5db9a7264d9..520a75980269 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -14,7 +14,6 @@
14#include <linux/highmem.h> 14#include <linux/highmem.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/spinlock.h> 16#include <linux/spinlock.h>
17#include <linux/mutex.h>
18#include <linux/interrupt.h> 17#include <linux/interrupt.h>
19#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
20#include <linux/seq_file.h> 19#include <linux/seq_file.h>
@@ -24,6 +23,7 @@
24#include <linux/rbtree.h> 23#include <linux/rbtree.h>
25#include <linux/radix-tree.h> 24#include <linux/radix-tree.h>
26#include <linux/rcupdate.h> 25#include <linux/rcupdate.h>
26#include <linux/bootmem.h>
27 27
28#include <asm/atomic.h> 28#include <asm/atomic.h>
29#include <asm/uaccess.h> 29#include <asm/uaccess.h>
@@ -323,6 +323,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
323 unsigned long addr; 323 unsigned long addr;
324 int purged = 0; 324 int purged = 0;
325 325
326 BUG_ON(!size);
326 BUG_ON(size & ~PAGE_MASK); 327 BUG_ON(size & ~PAGE_MASK);
327 328
328 va = kmalloc_node(sizeof(struct vmap_area), 329 va = kmalloc_node(sizeof(struct vmap_area),
@@ -334,6 +335,9 @@ retry:
334 addr = ALIGN(vstart, align); 335 addr = ALIGN(vstart, align);
335 336
336 spin_lock(&vmap_area_lock); 337 spin_lock(&vmap_area_lock);
338 if (addr + size - 1 < addr)
339 goto overflow;
340
337 /* XXX: could have a last_hole cache */ 341 /* XXX: could have a last_hole cache */
338 n = vmap_area_root.rb_node; 342 n = vmap_area_root.rb_node;
339 if (n) { 343 if (n) {
@@ -365,6 +369,8 @@ retry:
365 369
366 while (addr + size > first->va_start && addr + size <= vend) { 370 while (addr + size > first->va_start && addr + size <= vend) {
367 addr = ALIGN(first->va_end + PAGE_SIZE, align); 371 addr = ALIGN(first->va_end + PAGE_SIZE, align);
372 if (addr + size - 1 < addr)
373 goto overflow;
368 374
369 n = rb_next(&first->rb_node); 375 n = rb_next(&first->rb_node);
370 if (n) 376 if (n)
@@ -375,6 +381,7 @@ retry:
375 } 381 }
376found: 382found:
377 if (addr + size > vend) { 383 if (addr + size > vend) {
384overflow:
378 spin_unlock(&vmap_area_lock); 385 spin_unlock(&vmap_area_lock);
379 if (!purged) { 386 if (!purged) {
380 purge_vmap_area_lazy(); 387 purge_vmap_area_lazy();
@@ -495,9 +502,10 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
495static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, 502static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
496 int sync, int force_flush) 503 int sync, int force_flush)
497{ 504{
498 static DEFINE_MUTEX(purge_lock); 505 static DEFINE_SPINLOCK(purge_lock);
499 LIST_HEAD(valist); 506 LIST_HEAD(valist);
500 struct vmap_area *va; 507 struct vmap_area *va;
508 struct vmap_area *n_va;
501 int nr = 0; 509 int nr = 0;
502 510
503 /* 511 /*
@@ -506,10 +514,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
506 * the case that isn't actually used at the moment anyway. 514 * the case that isn't actually used at the moment anyway.
507 */ 515 */
508 if (!sync && !force_flush) { 516 if (!sync && !force_flush) {
509 if (!mutex_trylock(&purge_lock)) 517 if (!spin_trylock(&purge_lock))
510 return; 518 return;
511 } else 519 } else
512 mutex_lock(&purge_lock); 520 spin_lock(&purge_lock);
513 521
514 rcu_read_lock(); 522 rcu_read_lock();
515 list_for_each_entry_rcu(va, &vmap_area_list, list) { 523 list_for_each_entry_rcu(va, &vmap_area_list, list) {
@@ -537,11 +545,11 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
537 545
538 if (nr) { 546 if (nr) {
539 spin_lock(&vmap_area_lock); 547 spin_lock(&vmap_area_lock);
540 list_for_each_entry(va, &valist, purge_list) 548 list_for_each_entry_safe(va, n_va, &valist, purge_list)
541 __free_vmap_area(va); 549 __free_vmap_area(va);
542 spin_unlock(&vmap_area_lock); 550 spin_unlock(&vmap_area_lock);
543 } 551 }
544 mutex_unlock(&purge_lock); 552 spin_unlock(&purge_lock);
545} 553}
546 554
547/* 555/*
@@ -984,6 +992,8 @@ EXPORT_SYMBOL(vm_map_ram);
984 992
985void __init vmalloc_init(void) 993void __init vmalloc_init(void)
986{ 994{
995 struct vmap_area *va;
996 struct vm_struct *tmp;
987 int i; 997 int i;
988 998
989 for_each_possible_cpu(i) { 999 for_each_possible_cpu(i) {
@@ -996,12 +1006,22 @@ void __init vmalloc_init(void)
996 vbq->nr_dirty = 0; 1006 vbq->nr_dirty = 0;
997 } 1007 }
998 1008
1009 /* Import existing vmlist entries. */
1010 for (tmp = vmlist; tmp; tmp = tmp->next) {
1011 va = alloc_bootmem(sizeof(struct vmap_area));
1012 va->flags = tmp->flags | VM_VM_AREA;
1013 va->va_start = (unsigned long)tmp->addr;
1014 va->va_end = va->va_start + tmp->size;
1015 __insert_vmap_area(va);
1016 }
999 vmap_initialized = true; 1017 vmap_initialized = true;
1000} 1018}
1001 1019
1002void unmap_kernel_range(unsigned long addr, unsigned long size) 1020void unmap_kernel_range(unsigned long addr, unsigned long size)
1003{ 1021{
1004 unsigned long end = addr + size; 1022 unsigned long end = addr + size;
1023
1024 flush_cache_vunmap(addr, end);
1005 vunmap_page_range(addr, end); 1025 vunmap_page_range(addr, end);
1006 flush_tlb_kernel_range(addr, end); 1026 flush_tlb_kernel_range(addr, end);
1007} 1027}
@@ -1096,6 +1116,14 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1096} 1116}
1097EXPORT_SYMBOL_GPL(__get_vm_area); 1117EXPORT_SYMBOL_GPL(__get_vm_area);
1098 1118
1119struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1120 unsigned long start, unsigned long end,
1121 void *caller)
1122{
1123 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
1124 caller);
1125}
1126
1099/** 1127/**
1100 * get_vm_area - reserve a contiguous kernel virtual area 1128 * get_vm_area - reserve a contiguous kernel virtual area
1101 * @size: size of the area 1129 * @size: size of the area
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9a27c44aa327..56ddf41149eb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1262,7 +1262,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1262 * Move the pages to the [file or anon] inactive list. 1262 * Move the pages to the [file or anon] inactive list.
1263 */ 1263 */
1264 pagevec_init(&pvec, 1); 1264 pagevec_init(&pvec, 1);
1265 pgmoved = 0;
1266 lru = LRU_BASE + file * LRU_FILE; 1265 lru = LRU_BASE + file * LRU_FILE;
1267 1266
1268 spin_lock_irq(&zone->lru_lock); 1267 spin_lock_irq(&zone->lru_lock);
@@ -1274,6 +1273,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1274 */ 1273 */
1275 reclaim_stat->recent_rotated[!!file] += pgmoved; 1274 reclaim_stat->recent_rotated[!!file] += pgmoved;
1276 1275
1276 pgmoved = 0;
1277 while (!list_empty(&l_inactive)) { 1277 while (!list_empty(&l_inactive)) {
1278 page = lru_to_page(&l_inactive); 1278 page = lru_to_page(&l_inactive);
1279 prefetchw_prev_lru_page(page, &l_inactive, flags); 1279 prefetchw_prev_lru_page(page, &l_inactive, flags);
@@ -1469,7 +1469,7 @@ static void shrink_zone(int priority, struct zone *zone,
1469 int file = is_file_lru(l); 1469 int file = is_file_lru(l);
1470 int scan; 1470 int scan;
1471 1471
1472 scan = zone_page_state(zone, NR_LRU_BASE + l); 1472 scan = zone_nr_pages(zone, sc, l);
1473 if (priority) { 1473 if (priority) {
1474 scan >>= priority; 1474 scan >>= priority;
1475 scan = (scan * percent[file]) / 100; 1475 scan = (scan * percent[file]) / 100;
@@ -2057,31 +2057,31 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
2057 int pass, struct scan_control *sc) 2057 int pass, struct scan_control *sc)
2058{ 2058{
2059 struct zone *zone; 2059 struct zone *zone;
2060 unsigned long nr_to_scan, ret = 0; 2060 unsigned long ret = 0;
2061 enum lru_list l;
2062 2061
2063 for_each_zone(zone) { 2062 for_each_zone(zone) {
2063 enum lru_list l;
2064 2064
2065 if (!populated_zone(zone)) 2065 if (!populated_zone(zone))
2066 continue; 2066 continue;
2067
2068 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) 2067 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
2069 continue; 2068 continue;
2070 2069
2071 for_each_evictable_lru(l) { 2070 for_each_evictable_lru(l) {
2071 enum zone_stat_item ls = NR_LRU_BASE + l;
2072 unsigned long lru_pages = zone_page_state(zone, ls);
2073
2072 /* For pass = 0, we don't shrink the active list */ 2074 /* For pass = 0, we don't shrink the active list */
2073 if (pass == 0 && 2075 if (pass == 0 && (l == LRU_ACTIVE_ANON ||
2074 (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE)) 2076 l == LRU_ACTIVE_FILE))
2075 continue; 2077 continue;
2076 2078
2077 zone->lru[l].nr_scan += 2079 zone->lru[l].nr_scan += (lru_pages >> prio) + 1;
2078 (zone_page_state(zone, NR_LRU_BASE + l)
2079 >> prio) + 1;
2080 if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { 2080 if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
2081 unsigned long nr_to_scan;
2082
2081 zone->lru[l].nr_scan = 0; 2083 zone->lru[l].nr_scan = 0;
2082 nr_to_scan = min(nr_pages, 2084 nr_to_scan = min(nr_pages, lru_pages);
2083 zone_page_state(zone,
2084 NR_LRU_BASE + l));
2085 ret += shrink_list(l, nr_to_scan, zone, 2085 ret += shrink_list(l, nr_to_scan, zone,
2086 sc, prio); 2086 sc, prio);
2087 if (ret >= nr_pages) 2087 if (ret >= nr_pages)
@@ -2089,7 +2089,6 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
2089 } 2089 }
2090 } 2090 }
2091 } 2091 }
2092
2093 return ret; 2092 return ret;
2094} 2093}
2095 2094
@@ -2112,7 +2111,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2112 .may_swap = 0, 2111 .may_swap = 0,
2113 .swap_cluster_max = nr_pages, 2112 .swap_cluster_max = nr_pages,
2114 .may_writepage = 1, 2113 .may_writepage = 1,
2115 .swappiness = vm_swappiness,
2116 .isolate_pages = isolate_pages_global, 2114 .isolate_pages = isolate_pages_global,
2117 }; 2115 };
2118 2116
@@ -2146,10 +2144,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2146 int prio; 2144 int prio;
2147 2145
2148 /* Force reclaiming mapped pages in the passes #3 and #4 */ 2146 /* Force reclaiming mapped pages in the passes #3 and #4 */
2149 if (pass > 2) { 2147 if (pass > 2)
2150 sc.may_swap = 1; 2148 sc.may_swap = 1;
2151 sc.swappiness = 100;
2152 }
2153 2149
2154 for (prio = DEF_PRIORITY; prio >= 0; prio--) { 2150 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
2155 unsigned long nr_to_scan = nr_pages - ret; 2151 unsigned long nr_to_scan = nr_pages - ret;