author     Balbir Singh <bsingharora@gmail.com>        2016-09-06 02:27:31 -0400
committer  Michael Ellerman <mpe@ellerman.id.au>       2016-09-29 01:14:44 -0400
commit     2e5bbb5461f138cac631fe21b4ad956feabfba22
tree       eb89de095b80a8f419022bb05ec40cf16a6cf3a7 /arch
parent     360aebd85a4c946764f6301d68de2a817fad5159
KVM: PPC: Book3S HV: Migrate pinned pages out of CMA
When PCI device pass-through is enabled via VFIO, KVM-PPC will pin pages
using get_user_pages_fast(). One of the downsides of the pinning is that
the page could be in a CMA region. The CMA region is used for other
allocations like the hash page table. Ideally we want the pinned pages to
come from a non-CMA region.

This patch (currently only for KVM PPC with VFIO) forcefully migrates the
pages out (huge pages are omitted for the moment). There are more
efficient ways of doing this, but that might be elaborate and might impact
a larger audience beyond just the kvm ppc implementation.

The magic is in new_iommu_non_cma_page(), which allocates the new page
from a non-CMA region.

I've tested the patches lightly at my end. The full solution requires
migration of THP pages in the CMA region. That work will be done
incrementally on top of this.

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
Acked-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[mpe: Merged via powerpc tree as that's where the changes are]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
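For orientation before the diff, the approach condenses to the sketch below. This is a minimal sketch rather than the patch itself: the names sketch_new_non_cma_page() and sketch_pin_page_outside_cma() are hypothetical, error handling is elided, and it assumes the mm helpers the patch relies on (get_pageblock_migratetype(), lru_add_drain(), isolate_lru_page(), migrate_pages(), putback_movable_pages()) with their signatures from this kernel generation. Note that isolate_lru_page() is normally private to mm/, which is why the patch adds an extern declaration for it in mmu_context.h. The real implementation is in the mmu_context_iommu.c hunks below.

#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/swap.h>

extern int isolate_lru_page(struct page *page);  /* normally in mm/internal.h */

/*
 * Hypothetical sketch: allocate the replacement page outside CMA.
 * GFP_USER without __GFP_MOVABLE keeps the allocation out of CMA areas.
 */
static struct page *sketch_new_non_cma_page(struct page *page,
                                            unsigned long private,
                                            int **resultp)
{
        return alloc_page(GFP_USER | __GFP_NORETRY | __GFP_NOWARN);
}

/*
 * Hypothetical sketch: a page just pinned with get_user_pages_fast() that
 * happens to live in a CMA pageblock is isolated, migrated to a non-CMA
 * page, and then re-pinned so the long-term pin no longer blocks CMA.
 */
static void sketch_pin_page_outside_cma(unsigned long ua, struct page **page)
{
        LIST_HEAD(pages);

        if (get_pageblock_migratetype(*page) != MIGRATE_CMA)
                return;                 /* already outside CMA */

        lru_add_drain();
        if (isolate_lru_page(*page))
                return;                 /* keep the CMA page if isolation fails */

        list_add(&(*page)->lru, &pages);
        put_page(*page);                /* drop the gup reference before migrating */

        if (migrate_pages(&pages, sketch_new_non_cma_page, NULL, 0,
                          MIGRATE_SYNC, MR_CMA))
                putback_movable_pages(&pages);

        /* re-fault and re-pin; the faulted-in page now comes from outside CMA.
         * Error handling is elided in this sketch. */
        get_user_pages_fast(ua, 1 /* nr_pages */, 1 /* write */, page);
}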
Diffstat (limited to 'arch')
-rw-r--r--   arch/powerpc/include/asm/mmu_context.h |  1
-rw-r--r--   arch/powerpc/mm/mmu_context_iommu.c    | 81
2 files changed, 78 insertions(+), 4 deletions(-)
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 9d2cd0c36ec2..475d1be39191 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -18,6 +18,7 @@ extern void destroy_context(struct mm_struct *mm);
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 struct mm_iommu_table_group_mem_t;
 
+extern int isolate_lru_page(struct page *page); /* from internal.h */
 extern bool mm_iommu_preregistered(void);
 extern long mm_iommu_get(unsigned long ua, unsigned long entries,
                 struct mm_iommu_table_group_mem_t **pmem);
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index da6a2168ae9e..e0f1c33601dd 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -15,6 +15,9 @@
 #include <linux/rculist.h>
 #include <linux/vmalloc.h>
 #include <linux/mutex.h>
+#include <linux/migrate.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
 #include <asm/mmu_context.h>
 
 static DEFINE_MUTEX(mem_list_mutex);
@@ -72,6 +75,55 @@ bool mm_iommu_preregistered(void)
 }
 EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
 
+/*
+ * Taken from alloc_migrate_target with changes to remove CMA allocations
+ */
+struct page *new_iommu_non_cma_page(struct page *page, unsigned long private,
+                                        int **resultp)
+{
+        gfp_t gfp_mask = GFP_USER;
+        struct page *new_page;
+
+        if (PageHuge(page) || PageTransHuge(page) || PageCompound(page))
+                return NULL;
+
+        if (PageHighMem(page))
+                gfp_mask |= __GFP_HIGHMEM;
+
+        /*
+         * We don't want the allocation to force an OOM if possible
+         */
+        new_page = alloc_page(gfp_mask | __GFP_NORETRY | __GFP_NOWARN);
+        return new_page;
+}
+
+static int mm_iommu_move_page_from_cma(struct page *page)
+{
+        int ret = 0;
+        LIST_HEAD(cma_migrate_pages);
+
+        /* Ignore huge pages for now */
+        if (PageHuge(page) || PageTransHuge(page) || PageCompound(page))
+                return -EBUSY;
+
+        lru_add_drain();
+        ret = isolate_lru_page(page);
+        if (ret)
+                return ret;
+
+        list_add(&page->lru, &cma_migrate_pages);
+        put_page(page); /* Drop the gup reference */
+
+        ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page,
+                                NULL, 0, MIGRATE_SYNC, MR_CMA);
+        if (ret) {
+                if (!list_empty(&cma_migrate_pages))
+                        putback_movable_pages(&cma_migrate_pages);
+        }
+
+        return 0;
+}
+
 long mm_iommu_get(unsigned long ua, unsigned long entries,
                 struct mm_iommu_table_group_mem_t **pmem)
 {
@@ -124,15 +176,36 @@ long mm_iommu_get(unsigned long ua, unsigned long entries,
         for (i = 0; i < entries; ++i) {
                 if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
                                         1/* pages */, 1/* iswrite */, &page)) {
+                        ret = -EFAULT;
                         for (j = 0; j < i; ++j)
-                                put_page(pfn_to_page(
-                                                mem->hpas[j] >> PAGE_SHIFT));
+                                put_page(pfn_to_page(mem->hpas[j] >>
+                                                PAGE_SHIFT));
                         vfree(mem->hpas);
                         kfree(mem);
-                        ret = -EFAULT;
                         goto unlock_exit;
                 }
-
+                /*
+                 * If we get a page from the CMA zone, since we are going to
+                 * be pinning these entries, we might as well move them out
+                 * of the CMA zone if possible. NOTE: faulting in + migration
+                 * can be expensive. Batching can be considered later
+                 */
+                if (get_pageblock_migratetype(page) == MIGRATE_CMA) {
+                        if (mm_iommu_move_page_from_cma(page))
+                                goto populate;
+                        if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
+                                                1/* pages */, 1/* iswrite */,
+                                                &page)) {
+                                ret = -EFAULT;
+                                for (j = 0; j < i; ++j)
+                                        put_page(pfn_to_page(mem->hpas[j] >>
+                                                                PAGE_SHIFT));
+                                vfree(mem->hpas);
+                                kfree(mem);
+                                goto unlock_exit;
+                        }
+                }
+populate:
                 mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
         }
 