From 16b56cf4b8a0fa9acc21bd2ad19839b917999b96 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Tue, 26 Oct 2010 14:22:04 -0700
Subject: mm: fix sparse warnings on GFP_ZONE_TABLE/BAD

Introduce ___GFP_* masks in order for gfp_t to not be mixed with plain
integers which causes a lot of warnings like the following:

 warning: restricted gfp_t degrades to integer

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 105 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 63 insertions(+), 42 deletions(-)

(limited to 'include/linux/gfp.h')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 975609cb8548..e8713d55360a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -9,6 +9,32 @@
 
 struct vm_area_struct;
 
+/* Plain integer GFP bitmasks. Do not use this directly. */
+#define ___GFP_DMA		0x01u
+#define ___GFP_HIGHMEM		0x02u
+#define ___GFP_DMA32		0x04u
+#define ___GFP_MOVABLE		0x08u
+#define ___GFP_WAIT		0x10u
+#define ___GFP_HIGH		0x20u
+#define ___GFP_IO		0x40u
+#define ___GFP_FS		0x80u
+#define ___GFP_COLD		0x100u
+#define ___GFP_NOWARN		0x200u
+#define ___GFP_REPEAT		0x400u
+#define ___GFP_NOFAIL		0x800u
+#define ___GFP_NORETRY		0x1000u
+#define ___GFP_COMP		0x4000u
+#define ___GFP_ZERO		0x8000u
+#define ___GFP_NOMEMALLOC	0x10000u
+#define ___GFP_HARDWALL		0x20000u
+#define ___GFP_THISNODE		0x40000u
+#define ___GFP_RECLAIMABLE	0x80000u
+#ifdef CONFIG_KMEMCHECK
+#define ___GFP_NOTRACK		0x200000u
+#else
+#define ___GFP_NOTRACK		0
+#endif
+
 /*
  * GFP bitmasks..
  *
@@ -18,10 +44,10 @@ struct vm_area_struct;
  * without the underscores and use them consistently. The definitions here may
  * be used in bit comparisons.
  */
-#define __GFP_DMA	((__force gfp_t)0x01u)
-#define __GFP_HIGHMEM	((__force gfp_t)0x02u)
-#define __GFP_DMA32	((__force gfp_t)0x04u)
-#define __GFP_MOVABLE	((__force gfp_t)0x08u)  /* Page is movable */
+#define __GFP_DMA	((__force gfp_t)___GFP_DMA)
+#define __GFP_HIGHMEM	((__force gfp_t)___GFP_HIGHMEM)
+#define __GFP_DMA32	((__force gfp_t)___GFP_DMA32)
+#define __GFP_MOVABLE	((__force gfp_t)___GFP_MOVABLE)  /* Page is movable */
 #define GFP_ZONEMASK	(__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
 /*
  * Action modifiers - doesn't change the zoning
@@ -38,27 +64,22 @@ struct vm_area_struct;
  * __GFP_MOVABLE: Flag that this page will be movable by the page migration
  * mechanism or reclaimed
  */
-#define __GFP_WAIT	((__force gfp_t)0x10u)	/* Can wait and reschedule? */
-#define __GFP_HIGH	((__force gfp_t)0x20u)	/* Should access emergency pools? */
-#define __GFP_IO	((__force gfp_t)0x40u)	/* Can start physical IO? */
-#define __GFP_FS	((__force gfp_t)0x80u)	/* Can call down to low-level FS? */
-#define __GFP_COLD	((__force gfp_t)0x100u)	/* Cache-cold page required */
-#define __GFP_NOWARN	((__force gfp_t)0x200u)	/* Suppress page allocation failure warning */
-#define __GFP_REPEAT	((__force gfp_t)0x400u)	/* See above */
-#define __GFP_NOFAIL	((__force gfp_t)0x800u)	/* See above */
-#define __GFP_NORETRY	((__force gfp_t)0x1000u)/* See above */
-#define __GFP_COMP	((__force gfp_t)0x4000u)/* Add compound page metadata */
-#define __GFP_ZERO	((__force gfp_t)0x8000u)/* Return zeroed page on success */
-#define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
-#define __GFP_HARDWALL   ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
-#define __GFP_THISNODE	((__force gfp_t)0x40000u)/* No fallback, no policies */
-#define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
-
-#ifdef CONFIG_KMEMCHECK
-#define __GFP_NOTRACK	((__force gfp_t)0x200000u)  /* Don't track with kmemcheck */
-#else
-#define __GFP_NOTRACK	((__force gfp_t)0)
-#endif
+#define __GFP_WAIT	((__force gfp_t)___GFP_WAIT)	/* Can wait and reschedule? */
+#define __GFP_HIGH	((__force gfp_t)___GFP_HIGH)	/* Should access emergency pools? */
+#define __GFP_IO	((__force gfp_t)___GFP_IO)	/* Can start physical IO? */
+#define __GFP_FS	((__force gfp_t)___GFP_FS)	/* Can call down to low-level FS? */
+#define __GFP_COLD	((__force gfp_t)___GFP_COLD)	/* Cache-cold page required */
+#define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)	/* Suppress page allocation failure warning */
+#define __GFP_REPEAT	((__force gfp_t)___GFP_REPEAT)	/* See above */
+#define __GFP_NOFAIL	((__force gfp_t)___GFP_NOFAIL)	/* See above */
+#define __GFP_NORETRY	((__force gfp_t)___GFP_NORETRY) /* See above */
+#define __GFP_COMP	((__force gfp_t)___GFP_COMP)	/* Add compound page metadata */
+#define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)	/* Return zeroed page on success */
+#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves */
+#define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */
+#define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */
+#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
+#define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */
 
 /*
  * This may seem redundant, but it's a way of annotating false positives vs.
@@ -186,14 +207,14 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags)
 #endif
 
 #define GFP_ZONE_TABLE ( \
-	(ZONE_NORMAL << 0 * ZONES_SHIFT)				\
-	| (OPT_ZONE_DMA << __GFP_DMA * ZONES_SHIFT)			\
-	| (OPT_ZONE_HIGHMEM << __GFP_HIGHMEM * ZONES_SHIFT)		\
-	| (OPT_ZONE_DMA32 << __GFP_DMA32 * ZONES_SHIFT)			\
-	| (ZONE_NORMAL << __GFP_MOVABLE * ZONES_SHIFT)			\
-	| (OPT_ZONE_DMA << (__GFP_MOVABLE | __GFP_DMA) * ZONES_SHIFT)	\
-	| (ZONE_MOVABLE << (__GFP_MOVABLE | __GFP_HIGHMEM) * ZONES_SHIFT)\
-	| (OPT_ZONE_DMA32 << (__GFP_MOVABLE | __GFP_DMA32) * ZONES_SHIFT)\
+	(ZONE_NORMAL << 0 * ZONES_SHIFT)				      \
+	| (OPT_ZONE_DMA << ___GFP_DMA * ZONES_SHIFT)			      \
+	| (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * ZONES_SHIFT)		      \
+	| (OPT_ZONE_DMA32 << ___GFP_DMA32 * ZONES_SHIFT)		      \
+	| (ZONE_NORMAL << ___GFP_MOVABLE * ZONES_SHIFT)			      \
+	| (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * ZONES_SHIFT)	      \
+	| (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * ZONES_SHIFT)   \
+	| (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * ZONES_SHIFT)   \
 )
 
 /*
@@ -203,20 +224,20 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags)
  * allowed.
  */
 #define GFP_ZONE_BAD ( \
-	1 << (__GFP_DMA | __GFP_HIGHMEM)				\
-	| 1 << (__GFP_DMA | __GFP_DMA32)				\
-	| 1 << (__GFP_DMA32 | __GFP_HIGHMEM)				\
-	| 1 << (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM)		\
-	| 1 << (__GFP_MOVABLE | __GFP_HIGHMEM | __GFP_DMA)		\
-	| 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA)		\
-	| 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_HIGHMEM)		\
-	| 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA | __GFP_HIGHMEM)\
+	1 << (___GFP_DMA | ___GFP_HIGHMEM)				      \
+	| 1 << (___GFP_DMA | ___GFP_DMA32)				      \
+	| 1 << (___GFP_DMA32 | ___GFP_HIGHMEM)				      \
+	| 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM)		      \
+	| 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA)		      \
+	| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA)		      \
+	| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM)		      \
+	| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM)  \
 )
 
 static inline enum zone_type gfp_zone(gfp_t flags)
 {
 	enum zone_type z;
-	int bit = flags & GFP_ZONEMASK;
+	int bit = (__force int) (flags & GFP_ZONEMASK);
 
 	z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) &
 					 ((1 << ZONES_SHIFT) - 1);
-- 
cgit v1.2.2


From c9e664f1fdf34aa8cede047b206deaa8f1945af0 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 3 Dec 2010 22:57:45 +0100
Subject: PM / Hibernate: Fix memory corruption related to swap

There is a problem that swap pages allocated before the creation of
a hibernation image can be released and used for storing the contents
of different memory pages while the image is being saved.  Since the
kernel stored in the image doesn't know of that, it causes memory
corruption to occur after resume from hibernation, especially on
systems with relatively small RAM that need to swap often.

This issue can be addressed by keeping the GFP_IOFS bits clear
in gfp_allowed_mask during the entire hibernation, including the
saving of the image, until the system is finally turned off or
the hibernation is aborted.  Unfortunately, for this purpose
it's necessary to rework the way in which the hibernate and
suspend code manipulates gfp_allowed_mask.

This change is based on an earlier patch from Hugh Dickins.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Reported-by: Ondrej Zary <linux@rainbow-software.org>
Acked-by: Hugh Dickins <hughd@google.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: stable@kernel.org
---
 include/linux/gfp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux/gfp.h')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index e8713d55360a..f54adfcbec9c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -360,7 +360,7 @@ void drain_local_pages(void *dummy);
 
 extern gfp_t gfp_allowed_mask;
 
-extern void set_gfp_allowed_mask(gfp_t mask);
-extern gfp_t clear_gfp_allowed_mask(gfp_t mask);
+extern void pm_restrict_gfp_mask(void);
+extern void pm_restore_gfp_mask(void);
 
 #endif /* __LINUX_GFP_H */
-- 
cgit v1.2.2


From 32dba98e085f8b2b4345887df9abf5e0e93bfc12 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:46:49 -0800
Subject: thp: _GFP_NO_KSWAPD

Transparent hugepage allocations must be allowed not to invoke kswapd or
any other kind of indirect reclaim (especially when the defrag sysfs is
control disabled).  It's unacceptable to swap out anonymous pages
(potentially anonymous transparent hugepages) in order to create new
transparent hugepages.  This is true for the MADV_HUGEPAGE areas too
(swapping out a kvm virtual machine and so having it suffer an unbearable
slowdown, so another one with guest physical memory marked MADV_HUGEPAGE
can run 30% faster if it is running memory intensive workloads, makes no
sense).  If a transparent hugepage allocation fails the slowdown is minor
and there is total fallback, so kswapd should never be asked to swapout
memory to allow the high order allocation to succeed.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux/gfp.h')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f54adfcbec9c..49d2356bb82d 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -34,6 +34,7 @@ struct vm_area_struct;
 #else
 #define ___GFP_NOTRACK		0
 #endif
+#define ___GFP_NO_KSWAPD	0x400000u
 
 /*
  * GFP bitmasks..
@@ -81,13 +82,15 @@ struct vm_area_struct;
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
 #define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */
 
+#define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
+
 /*
  * This may seem redundant, but it's a way of annotating false positives vs.
  * allocations that simply cannot be supported (e.g. page tables).
  */
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
 
-#define __GFP_BITS_SHIFT 22	/* Room for 22 __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 23	/* Room for 23 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /* This equals 0, but use constants in case they ever change */
-- 
cgit v1.2.2


From 71e3aac0724ffe8918992d76acfe3aad7d8724a5 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:46:52 -0800
Subject: thp: transparent hugepage core

Lately I've been working to make KVM use hugepages transparently without
the usual restrictions of hugetlbfs.  Some of the restrictions I'd like to
see removed:

1) hugepages have to be swappable or the guest physical memory remains
   locked in RAM and can't be paged out to swap

2) if a hugepage allocation fails, regular pages should be allocated
   instead and mixed in the same vma without any failure and without
   userland noticing

3) if some task quits and more hugepages become available in the
   buddy, guest physical memory backed by regular pages should be
   relocated on hugepages automatically in regions under
   madvise(MADV_HUGEPAGE) (ideally event driven by waking up the
   kernel deamon if the order=HPAGE_PMD_SHIFT-PAGE_SHIFT list becomes
   not null)

4) avoidance of reservation and maximization of use of hugepages whenever
   possible. Reservation (needed to avoid runtime fatal faliures) may be ok for
   1 machine with 1 database with 1 database cache with 1 database cache size
   known at boot time. It's definitely not feasible with a virtualization
   hypervisor usage like RHEV-H that runs an unknown number of virtual machines
   with an unknown size of each virtual machine with an unknown amount of
   pagecache that could be potentially useful in the host for guest not using
   O_DIRECT (aka cache=off).

hugepages in the virtualization hypervisor (and also in the guest!) are
much more important than in a regular host not using virtualization,
becasue with NPT/EPT they decrease the tlb-miss cacheline accesses from 24
to 19 in case only the hypervisor uses transparent hugepages, and they
decrease the tlb-miss cacheline accesses from 19 to 15 in case both the
linux hypervisor and the linux guest both uses this patch (though the
guest will limit the addition speedup to anonymous regions only for
now...).  Even more important is that the tlb miss handler is much slower
on a NPT/EPT guest than for a regular shadow paging or no-virtualization
scenario.  So maximizing the amount of virtual memory cached by the TLB
pays off significantly more with NPT/EPT than without (even if there would
be no significant speedup in the tlb-miss runtime).

The first (and more tedious) part of this work requires allowing the VM to
handle anonymous hugepages mixed with regular pages transparently on
regular anonymous vmas.  This is what this patch tries to achieve in the
least intrusive possible way.  We want hugepages and hugetlb to be used in
a way so that all applications can benefit without changes (as usual we
leverage the KVM virtualization design: by improving the Linux VM at
large, KVM gets the performance boost too).

The most important design choice is: always fallback to 4k allocation if
the hugepage allocation fails!  This is the _very_ opposite of some large
pagecache patches that failed with -EIO back then if a 64k (or similar)
allocation failed...

Second important decision (to reduce the impact of the feature on the
existing pagetable handling code) is that at any time we can split an
hugepage into 512 regular pages and it has to be done with an operation
that can't fail.  This way the reliability of the swapping isn't decreased
(no need to allocate memory when we are short on memory to swap) and it's
trivial to plug a split_huge_page* one-liner where needed without
polluting the VM.  Over time we can teach mprotect, mremap and friends to
handle pmd_trans_huge natively without calling split_huge_page*.  The fact
it can't fail isn't just for swap: if split_huge_page would return -ENOMEM
(instead of the current void) we'd need to rollback the mprotect from the
middle of it (ideally including undoing the split_vma) which would be a
big change and in the very wrong direction (it'd likely be simpler not to
call split_huge_page at all and to teach mprotect and friends to handle
hugepages instead of rolling them back from the middle).  In short the
very value of split_huge_page is that it can't fail.

The collapsing and madvise(MADV_HUGEPAGE) part will remain separated and
incremental and it'll just be an "harmless" addition later if this initial
part is agreed upon.  It also should be noted that locking-wise replacing
regular pages with hugepages is going to be very easy if compared to what
I'm doing below in split_huge_page, as it will only happen when
page_count(page) matches page_mapcount(page) if we can take the PG_lock
and mmap_sem in write mode.  collapse_huge_page will be a "best effort"
that (unlike split_huge_page) can fail at the minimal sign of trouble and
we can try again later.  collapse_huge_page will be similar to how KSM
works and the madvise(MADV_HUGEPAGE) will work similar to
madvise(MADV_MERGEABLE).

The default I like is that transparent hugepages are used at page fault
time.  This can be changed with
/sys/kernel/mm/transparent_hugepage/enabled.  The control knob can be set
to three values "always", "madvise", "never" which mean respectively that
hugepages are always used, or only inside madvise(MADV_HUGEPAGE) regions,
or never used.  /sys/kernel/mm/transparent_hugepage/defrag instead
controls if the hugepage allocation should defrag memory aggressively
"always", only inside "madvise" regions, or "never".

The pmd_trans_splitting/pmd_trans_huge locking is very solid.  The
put_page (from get_user_page users that can't use mmu notifier like
O_DIRECT) that runs against a __split_huge_page_refcount instead was a
pain to serialize in a way that would result always in a coherent page
count for both tail and head.  I think my locking solution with a
compound_lock taken only after the page_first is valid and is still a
PageHead should be safe but it surely needs review from SMP race point of
view.  In short there is no current existing way to serialize the O_DIRECT
final put_page against split_huge_page_refcount so I had to invent a new
one (O_DIRECT loses knowledge on the mapping status by the time gup_fast
returns so...).  And I didn't want to impact all gup/gup_fast users for
now, maybe if we change the gup interface substantially we can avoid this
locking, I admit I didn't think too much about it because changing the gup
unpinning interface would be invasive.

If we ignored O_DIRECT we could stick to the existing compound refcounting
code, by simply adding a get_user_pages_fast_flags(foll_flags) where KVM
(and any other mmu notifier user) would call it without FOLL_GET (and if
FOLL_GET isn't set we'd just BUG_ON if nobody registered itself in the
current task mmu notifier list yet).  But O_DIRECT is fundamental for
decent performance of virtualized I/O on fast storage so we can't avoid it
to solve the race of put_page against split_huge_page_refcount to achieve
a complete hugepage feature for KVM.

Swap and oom works fine (well just like with regular pages ;).  MMU
notifier is handled transparently too, with the exception of the young bit
on the pmd, that didn't have a range check but I think KVM will be fine
because the whole point of hugepages is that EPT/NPT will also use a huge
pmd when they notice gup returns pages with PageCompound set, so they
won't care of a range and there's just the pmd young bit to check in that
case.

NOTE: in some cases if the L2 cache is small, this may slowdown and waste
memory during COWs because 4M of memory are accessed in a single fault
instead of 8k (the payoff is that after COW the program can run faster).
So we might want to switch the copy_huge_page (and clear_huge_page too) to
not temporal stores.  I also extensively researched ways to avoid this
cache trashing with a full prefault logic that would cow in 8k/16k/32k/64k
up to 1M (I can send those patches that fully implemented prefault) but I
concluded they're not worth it and they add an huge additional complexity
and they remove all tlb benefits until the full hugepage has been faulted
in, to save a little bit of memory and some cache during app startup, but
they still don't improve substantially the cache-trashing during startup
if the prefault happens in >4k chunks.  One reason is that those 4k pte
entries copied are still mapped on a perfectly cache-colored hugepage, so
the trashing is the worst one can generate in those copies (cow of 4k page
copies aren't so well colored so they trashes less, but again this results
in software running faster after the page fault).  Those prefault patches
allowed things like a pte where post-cow pages were local 4k regular anon
pages and the not-yet-cowed pte entries were pointing in the middle of
some hugepage mapped read-only.  If it doesn't payoff substantially with
todays hardware it will payoff even less in the future with larger l2
caches, and the prefault logic would blot the VM a lot.  If one is
emebdded transparent_hugepage can be disabled during boot with sysfs or
with the boot commandline parameter transparent_hugepage=0 (or
transparent_hugepage=2 to restrict hugepages inside madvise regions) that
will ensure not a single hugepage is allocated at boot time.  It is simple
enough to just disable transparent hugepage globally and let transparent
hugepages be allocated selectively by applications in the MADV_HUGEPAGE
region (both at page fault time, and if enabled with the
collapse_huge_page too through the kernel daemon).

This patch supports only hugepages mapped in the pmd, archs that have
smaller hugepages will not fit in this patch alone.  Also some archs like
power have certain tlb limits that prevents mixing different page size in
the same regions so they will not fit in this framework that requires
"graceful fallback" to basic PAGE_SIZE in case of physical memory
fragmentation.  hugetlbfs remains a perfect fit for those because its
software limits happen to match the hardware limits.  hugetlbfs also
remains a perfect fit for hugepage sizes like 1GByte that cannot be hoped
to be found not fragmented after a certain system uptime and that would be
very expensive to defragment with relocation, so requiring reservation.
hugetlbfs is the "reservation way", the point of transparent hugepages is
not to have any reservation at all and maximizing the use of cache and
hugepages at all times automatically.

Some performance result:

vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largep
ages3
memset page fault 1566023
memset tlb miss 453854
memset second tlb miss 453321
random access tlb miss 41635
random access second tlb miss 41658
vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largepages3
memset page fault 1566471
memset tlb miss 453375
memset second tlb miss 453320
random access tlb miss 41636
random access second tlb miss 41637
vmx andrea # ./largepages3
memset page fault 1566642
memset tlb miss 453417
memset second tlb miss 453313
random access tlb miss 41630
random access second tlb miss 41647
vmx andrea # ./largepages3
memset page fault 1566872
memset tlb miss 453418
memset second tlb miss 453315
random access tlb miss 41618
random access second tlb miss 41659
vmx andrea # echo 0 > /proc/sys/vm/transparent_hugepage
vmx andrea # ./largepages3
memset page fault 2182476
memset tlb miss 460305
memset second tlb miss 460179
random access tlb miss 44483
random access second tlb miss 44186
vmx andrea # ./largepages3
memset page fault 2182791
memset tlb miss 460742
memset second tlb miss 459962
random access tlb miss 43981
random access second tlb miss 43988

============
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#define SIZE (3UL*1024*1024*1024)

int main()
{
	char *p = malloc(SIZE), *p2;
	struct timeval before, after;

	gettimeofday(&before, NULL);
	memset(p, 0, SIZE);
	gettimeofday(&after, NULL);
	printf("memset page fault %Lu\n",
	       (after.tv_sec-before.tv_sec)*1000000UL +
	       after.tv_usec-before.tv_usec);

	gettimeofday(&before, NULL);
	memset(p, 0, SIZE);
	gettimeofday(&after, NULL);
	printf("memset tlb miss %Lu\n",
	       (after.tv_sec-before.tv_sec)*1000000UL +
	       after.tv_usec-before.tv_usec);

	gettimeofday(&before, NULL);
	memset(p, 0, SIZE);
	gettimeofday(&after, NULL);
	printf("memset second tlb miss %Lu\n",
	       (after.tv_sec-before.tv_sec)*1000000UL +
	       after.tv_usec-before.tv_usec);

	gettimeofday(&before, NULL);
	for (p2 = p; p2 < p+SIZE; p2 += 4096)
		*p2 = 0;
	gettimeofday(&after, NULL);
	printf("random access tlb miss %Lu\n",
	       (after.tv_sec-before.tv_sec)*1000000UL +
	       after.tv_usec-before.tv_usec);

	gettimeofday(&before, NULL);
	for (p2 = p; p2 < p+SIZE; p2 += 4096)
		*p2 = 0;
	gettimeofday(&after, NULL);
	printf("random access second tlb miss %Lu\n",
	       (after.tv_sec-before.tv_sec)*1000000UL +
	       after.tv_usec-before.tv_usec);

	return 0;
}
============

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux/gfp.h')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 49d2356bb82d..d95082cc6f4a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -109,6 +109,9 @@ struct vm_area_struct;
 				 __GFP_HARDWALL | __GFP_HIGHMEM | \
 				 __GFP_MOVABLE)
 #define GFP_IOFS	(__GFP_IO | __GFP_FS)
+#define GFP_TRANSHUGE	(GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
+			 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
+			 __GFP_NO_KSWAPD)
 
 #ifdef CONFIG_NUMA
 #define GFP_THISNODE	(__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
-- 
cgit v1.2.2


From 0bbbc0b33d141f78a0d9218a54a47f50621220d3 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:47:05 -0800
Subject: thp: add numa awareness to hugepage allocations

It's mostly a matter of replacing alloc_pages with alloc_pages_vma after
introducing alloc_pages_vma.  khugepaged needs special handling as the
allocation has to happen inside collapse_huge_page where the vma is known
and an error has to be returned to the outer loop to sleep
alloc_sleep_millisecs in case of failure.  But it retains the more
efficient logic of handling allocation failures in khugepaged in case of
CONFIG_NUMA=n.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux/gfp.h')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index d95082cc6f4a..a3b148a91874 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -331,14 +331,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
 	return alloc_pages_current(gfp_mask, order);
 }
-extern struct page *alloc_page_vma(gfp_t gfp_mask,
+extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 			struct vm_area_struct *vma, unsigned long addr);
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0)
+#define alloc_pages_vma(gfp_mask, order, vma, addr)	\
+	alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
+#define alloc_page_vma(gfp_mask, vma, addr)	\
+	alloc_pages_vma(gfp_mask, 0, vma, addr)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
-- 
cgit v1.2.2


From 1765e3a4933ea0870fabd755feffc5473c4363ce Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 24 Jan 2011 14:45:10 -0600
Subject: Remove MAYBE_BUILD_BUG_ON

Now BUILD_BUG_ON() can handle optimizable constants, we don't need
MAYBE_BUILD_BUG_ON any more.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/gfp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/gfp.h')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index a3b148a91874..0b84c61607e8 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -249,7 +249,7 @@ static inline enum zone_type gfp_zone(gfp_t flags)
 					 ((1 << ZONES_SHIFT) - 1);
 
 	if (__builtin_constant_p(bit))
-		MAYBE_BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
+		BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
 	else {
 #ifdef CONFIG_DEBUG_VM
 		BUG_ON((GFP_ZONE_BAD >> bit) & 1);
-- 
cgit v1.2.2


From 2f5f9486f8c12e3aa40fe3775a18cb14efc5cea2 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 4 Mar 2011 17:36:29 -0800
Subject: mm: change alloc_pages_vma to pass down the policy node for local
 policy

Currently alloc_pages_vma() always uses the local node as policy node for
the LOCAL policy.  Pass this node down as an argument instead.

No behaviour change from this patch, but will be needed for followons.

Acked-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux/gfp.h')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0b84c61607e8..37b8af5db091 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -332,16 +332,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 	return alloc_pages_current(gfp_mask, order);
 }
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
-			struct vm_area_struct *vma, unsigned long addr);
+			struct vm_area_struct *vma, unsigned long addr,
+			int node);
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_pages_vma(gfp_mask, order, vma, addr)	\
+#define alloc_pages_vma(gfp_mask, order, vma, addr, node)	\
 	alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
-#define alloc_page_vma(gfp_mask, vma, addr)	\
-	alloc_pages_vma(gfp_mask, 0, vma, addr)
+#define alloc_page_vma(gfp_mask, vma, addr)			\
+	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
-- 
cgit v1.2.2


From 236344d6b417d05a3080477639234fd9ca97568d Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 4 Mar 2011 17:36:30 -0800
Subject: mm: add alloc_page_vma_node()

Add a alloc_page_vma_node that allows passing the "local" node in.  Used
in a followon patch.

Acked-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/gfp.h')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 37b8af5db091..dca31761b311 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -343,6 +343,8 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 #define alloc_page_vma(gfp_mask, vma, addr)			\
 	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
+#define alloc_page_vma_node(gfp_mask, vma, addr, node)		\
+	alloc_pages_vma(gfp_mask, 0, vma, addr, node)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
-- 
cgit v1.2.2


From 78afd5612deb8268bafc8b6507d72341d5ed9aac Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 22 Mar 2011 16:33:12 -0700
Subject: mm: add __GFP_OTHER_NODE flag

Add a new __GFP_OTHER_NODE flag to tell the low level numa statistics in
zone_statistics() that an allocation is on behalf of another thread.  This
way the local and remote counters can be still correct, even when
background daemons like khugepaged are changing memory mappings.

This only affects the accounting, but I think it's worth doing that right
to avoid confusing users.

I first tried to just pass down the right node, but this required a lot of
changes to pass down this parameter and at least one addition of a 10th
argument to a 9 argument function.  Using the flag is a lot less
intrusive.

Open: should be also used for migration?

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/gfp.h')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index dca31761b311..bfb8f934521e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -35,6 +35,7 @@ struct vm_area_struct;
 #define ___GFP_NOTRACK		0
 #endif
 #define ___GFP_NO_KSWAPD	0x400000u
+#define ___GFP_OTHER_NODE	0x800000u
 
 /*
  * GFP bitmasks..
@@ -83,6 +84,7 @@ struct vm_area_struct;
 #define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */
 
 #define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
+#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
 
 /*
  * This may seem redundant, but it's a way of annotating false positives vs.
-- 
cgit v1.2.2


From ee85c2e1454603ebb9f8d87223ac79dcdc87fa32 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 11 May 2011 15:13:34 -0700
Subject: mm: add alloc_pages_exact_nid()

Add a alloc_pages_exact_nid() that allocates on a specific node.

The naming is quite broken, but fixing that would need a larger renaming
action.

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: tweak comment]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/gfp.h')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index bfb8f934521e..56d8fc87fbbc 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -353,6 +353,8 @@ extern unsigned long get_zeroed_page(gfp_t gfp_mask);
 
 void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
 void free_pages_exact(void *virt, size_t size);
+/* This is different from alloc_pages_exact_node !!! */
+void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
 
 #define __get_free_page(gfp_mask) \
 		__get_free_pages((gfp_mask), 0)
-- 
cgit v1.2.2


From 15fa8f425557a0d698f933627771f520ef4ae34b Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Tue, 24 May 2011 17:11:41 -0700
Subject: include/linux/gfp.h: work around apparent sparse confusion

Running sparse on page_alloc.c today, it errors out:
        include/linux/gfp.h:254:17: error: bad constant expression
        include/linux/gfp.h:254:17: error: cannot size expression

which is a line in gfp_zone():

        BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1);

That's really unfortunate, because it ends up hiding all of the other
legitimate sparse messages like this:
        mm/page_alloc.c:5315:59: warning: incorrect type in argument 1 (different base types)
        mm/page_alloc.c:5315:59:    expected unsigned long [unsigned] [usertype] size
        mm/page_alloc.c:5315:59:    got restricted gfp_t [usertype] <noident>
...

Having sparse be able to catch these very oopsable bugs is a lot more
important than keeping a BUILD_BUG_ON().  Kill the BUILD_BUG_ON().

Compiles on x86_64 with and without CONFIG_DEBUG_VM=y.  defconfig boots
fine for me.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux/gfp.h')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 56d8fc87fbbc..7e8f5d863c76 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -249,14 +249,9 @@ static inline enum zone_type gfp_zone(gfp_t flags)
 
 	z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) &
 					 ((1 << ZONES_SHIFT) - 1);
-
-	if (__builtin_constant_p(bit))
-		BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
-	else {
 #ifdef CONFIG_DEBUG_VM
-		BUG_ON((GFP_ZONE_BAD >> bit) & 1);
+	BUG_ON((GFP_ZONE_BAD >> bit) & 1);
 #endif
-	}
 	return z;
 }
 
-- 
cgit v1.2.2


From 82d4b5779a75887750748609f3415f01c1bb9f81 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Tue, 24 May 2011 17:11:42 -0700
Subject: include/linux/gfp.h: convert BUG_ON() into VM_BUG_ON()

VM_BUG_ON() if effectively a BUG_ON() undef #ifdef CONFIG_DEBUG_VM.  That
is exactly what we have here now, and two different folks have suggested
doing it this way.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux/gfp.h')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 7e8f5d863c76..cb4089254f01 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -249,9 +249,7 @@ static inline enum zone_type gfp_zone(gfp_t flags)
 
 	z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) &
 					 ((1 << ZONES_SHIFT) - 1);
-#ifdef CONFIG_DEBUG_VM
-	BUG_ON((GFP_ZONE_BAD >> bit) & 1);
-#endif
+	VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
 	return z;
 }
 
-- 
cgit v1.2.2