diff options
author | Dan Magenheimer <dan.magenheimer@oracle.com> | 2011-05-26 12:02:21 -0400 |
---|---|---|
committer | Dan Magenheimer <dan.magenheimer@oracle.com> | 2011-05-26 12:02:21 -0400 |
commit | 5bc20fc59706214d9591c11e1938a629d3538c12 (patch) | |
tree | ebfb66428ce888560dd982d3ca313f039a53ae40 | |
parent | 1cfd8bd0f97ae3ad314151cd0fd70454d7b39699 (diff) |
xen: cleancache shim to Xen Transcendent Memory
This patch provides a shim between the kernel-internal cleancache
API (see Documentation/mm/cleancache.txt) and the Xen Transcendent
Memory ABI (see http://oss.oracle.com/projects/tmem).
Xen tmem provides "hypervisor RAM" as an ephemeral page-oriented
pseudo-RAM store for cleancache pages, shared cleancache pages,
and frontswap pages. Tmem provides enterprise-quality concurrency,
full save/restore and live migration support, compression
and deduplication.
A presentation showing up to 8% faster performance and up to 52%
reduction in sectors read on a kernel compile workload, despite
aggressive in-kernel page reclamation ("self-ballooning"), can be
found at:
http://oss.oracle.com/projects/tmem/dist/documentation/presentations/TranscendentMemoryXenSummit2010.pdf
Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Reviewed-by: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik Van Riel <riel@redhat.com>
Cc: Jan Beulich <JBeulich@novell.com>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Andreas Dilger <adilger@sun.com>
Cc: Ted Ts'o <tytso@mit.edu>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <joel.becker@oracle.com>
Cc: Nitin Gupta <ngupta@vflare.org>
-rw-r--r-- | arch/x86/include/asm/xen/hypercall.h | 7 | ||||
-rw-r--r-- | drivers/xen/Makefile | 1 | ||||
-rw-r--r-- | drivers/xen/tmem.c | 264 | ||||
-rw-r--r-- | include/xen/interface/xen.h | 22 |
4 files changed, 294 insertions, 0 deletions
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 8508bfe52296..d240ea950519 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h | |||
@@ -447,6 +447,13 @@ HYPERVISOR_hvm_op(int op, void *arg) | |||
447 | return _hypercall2(unsigned long, hvm_op, op, arg); | 447 | return _hypercall2(unsigned long, hvm_op, op, arg); |
448 | } | 448 | } |
449 | 449 | ||
450 | static inline int | ||
451 | HYPERVISOR_tmem_op( | ||
452 | struct tmem_op *op) | ||
453 | { | ||
454 | return _hypercall1(int, tmem_op, op); | ||
455 | } | ||
456 | |||
450 | static inline void | 457 | static inline void |
451 | MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) | 458 | MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) |
452 | { | 459 | { |
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index f420f1ff7f13..7aa6804173ab 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile | |||
@@ -1,5 +1,6 @@ | |||
1 | obj-y += grant-table.o features.o events.o manage.o balloon.o | 1 | obj-y += grant-table.o features.o events.o manage.o balloon.o |
2 | obj-y += xenbus/ | 2 | obj-y += xenbus/ |
3 | obj-y += tmem.o | ||
3 | 4 | ||
4 | nostackp := $(call cc-option, -fno-stack-protector) | 5 | nostackp := $(call cc-option, -fno-stack-protector) |
5 | CFLAGS_features.o := $(nostackp) | 6 | CFLAGS_features.o := $(nostackp) |
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c new file mode 100644 index 000000000000..816a44959ef0 --- /dev/null +++ b/drivers/xen/tmem.c | |||
@@ -0,0 +1,264 @@ | |||
1 | /* | ||
2 | * Xen implementation for transcendent memory (tmem) | ||
3 | * | ||
4 | * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. | ||
5 | * Author: Dan Magenheimer | ||
6 | */ | ||
7 | |||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/types.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/pagemap.h> | ||
12 | #include <linux/cleancache.h> | ||
13 | |||
14 | #include <xen/xen.h> | ||
15 | #include <xen/interface/xen.h> | ||
16 | #include <asm/xen/hypercall.h> | ||
17 | #include <asm/xen/page.h> | ||
18 | #include <asm/xen/hypervisor.h> | ||
19 | |||
/* Command codes placed in tmem_op.cmd */
#define TMEM_CONTROL			0
#define TMEM_NEW_POOL			1
#define TMEM_DESTROY_POOL		2
#define TMEM_NEW_PAGE			3
#define TMEM_PUT_PAGE			4
#define TMEM_GET_PAGE			5
#define TMEM_FLUSH_PAGE			6
#define TMEM_FLUSH_OBJECT		7
#define TMEM_READ			8
#define TMEM_WRITE			9
#define TMEM_XCHG			10

/*
 * Flag bits and field positions for HYPERVISOR_tmem_op(TMEM_NEW_POOL):
 * two single-bit flags, then the bit offsets of the page-size and
 * spec-version fields inside the same flags word.
 */
#define TMEM_POOL_PERSIST		1
#define TMEM_POOL_SHARED		2
#define TMEM_POOL_PAGESIZE_SHIFT	4
#define TMEM_VERSION_SHIFT		24

38 | |||
39 | struct tmem_pool_uuid { | ||
40 | u64 uuid_lo; | ||
41 | u64 uuid_hi; | ||
42 | }; | ||
43 | |||
44 | struct tmem_oid { | ||
45 | u64 oid[3]; | ||
46 | }; | ||
47 | |||
/* Initialiser for a struct tmem_pool_uuid naming a private (all-zero) pool. */
#define TMEM_POOL_PRIVATE_UUID	{ 0, 0 }

/*
 * The TMEM_POOL_PERSIST / TMEM_POOL_SHARED flag bits are defined once,
 * next to the other TMEM_NEW_POOL bits earlier in this file; they are
 * intentionally not repeated here.
 */
53 | |||
54 | /* xen tmem foundation ops/hypercalls */ | ||
55 | |||
56 | static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid, | ||
57 | u32 index, unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len) | ||
58 | { | ||
59 | struct tmem_op op; | ||
60 | int rc = 0; | ||
61 | |||
62 | op.cmd = tmem_cmd; | ||
63 | op.pool_id = tmem_pool; | ||
64 | op.u.gen.oid[0] = oid.oid[0]; | ||
65 | op.u.gen.oid[1] = oid.oid[1]; | ||
66 | op.u.gen.oid[2] = oid.oid[2]; | ||
67 | op.u.gen.index = index; | ||
68 | op.u.gen.tmem_offset = tmem_offset; | ||
69 | op.u.gen.pfn_offset = pfn_offset; | ||
70 | op.u.gen.len = len; | ||
71 | set_xen_guest_handle(op.u.gen.gmfn, (void *)gmfn); | ||
72 | rc = HYPERVISOR_tmem_op(&op); | ||
73 | return rc; | ||
74 | } | ||
75 | |||
76 | static int xen_tmem_new_pool(struct tmem_pool_uuid uuid, | ||
77 | u32 flags, unsigned long pagesize) | ||
78 | { | ||
79 | struct tmem_op op; | ||
80 | int rc = 0, pageshift; | ||
81 | |||
82 | for (pageshift = 0; pagesize != 1; pageshift++) | ||
83 | pagesize >>= 1; | ||
84 | flags |= (pageshift - 12) << TMEM_POOL_PAGESIZE_SHIFT; | ||
85 | flags |= TMEM_SPEC_VERSION << TMEM_VERSION_SHIFT; | ||
86 | op.cmd = TMEM_NEW_POOL; | ||
87 | op.u.new.uuid[0] = uuid.uuid_lo; | ||
88 | op.u.new.uuid[1] = uuid.uuid_hi; | ||
89 | op.u.new.flags = flags; | ||
90 | rc = HYPERVISOR_tmem_op(&op); | ||
91 | return rc; | ||
92 | } | ||
93 | |||
94 | /* xen generic tmem ops */ | ||
95 | |||
96 | static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid, | ||
97 | u32 index, unsigned long pfn) | ||
98 | { | ||
99 | unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn; | ||
100 | |||
101 | return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index, | ||
102 | gmfn, 0, 0, 0); | ||
103 | } | ||
104 | |||
105 | static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid, | ||
106 | u32 index, unsigned long pfn) | ||
107 | { | ||
108 | unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn; | ||
109 | |||
110 | return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index, | ||
111 | gmfn, 0, 0, 0); | ||
112 | } | ||
113 | |||
114 | static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index) | ||
115 | { | ||
116 | return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, oid, index, | ||
117 | 0, 0, 0, 0); | ||
118 | } | ||
119 | |||
120 | static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid) | ||
121 | { | ||
122 | return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0); | ||
123 | } | ||
124 | |||
125 | static int xen_tmem_destroy_pool(u32 pool_id) | ||
126 | { | ||
127 | struct tmem_oid oid = { { 0 } }; | ||
128 | |||
129 | return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0); | ||
130 | } | ||
131 | |||
132 | int tmem_enabled; | ||
133 | |||
134 | static int __init enable_tmem(char *s) | ||
135 | { | ||
136 | tmem_enabled = 1; | ||
137 | return 1; | ||
138 | } | ||
139 | |||
140 | __setup("tmem", enable_tmem); | ||
141 | |||
142 | /* cleancache ops */ | ||
143 | |||
144 | static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key, | ||
145 | pgoff_t index, struct page *page) | ||
146 | { | ||
147 | u32 ind = (u32) index; | ||
148 | struct tmem_oid oid = *(struct tmem_oid *)&key; | ||
149 | unsigned long pfn = page_to_pfn(page); | ||
150 | |||
151 | if (pool < 0) | ||
152 | return; | ||
153 | if (ind != index) | ||
154 | return; | ||
155 | mb(); /* ensure page is quiescent; tmem may address it with an alias */ | ||
156 | (void)xen_tmem_put_page((u32)pool, oid, ind, pfn); | ||
157 | } | ||
158 | |||
159 | static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key, | ||
160 | pgoff_t index, struct page *page) | ||
161 | { | ||
162 | u32 ind = (u32) index; | ||
163 | struct tmem_oid oid = *(struct tmem_oid *)&key; | ||
164 | unsigned long pfn = page_to_pfn(page); | ||
165 | int ret; | ||
166 | |||
167 | /* translate return values to linux semantics */ | ||
168 | if (pool < 0) | ||
169 | return -1; | ||
170 | if (ind != index) | ||
171 | return -1; | ||
172 | ret = xen_tmem_get_page((u32)pool, oid, ind, pfn); | ||
173 | if (ret == 1) | ||
174 | return 0; | ||
175 | else | ||
176 | return -1; | ||
177 | } | ||
178 | |||
179 | static void tmem_cleancache_flush_page(int pool, struct cleancache_filekey key, | ||
180 | pgoff_t index) | ||
181 | { | ||
182 | u32 ind = (u32) index; | ||
183 | struct tmem_oid oid = *(struct tmem_oid *)&key; | ||
184 | |||
185 | if (pool < 0) | ||
186 | return; | ||
187 | if (ind != index) | ||
188 | return; | ||
189 | (void)xen_tmem_flush_page((u32)pool, oid, ind); | ||
190 | } | ||
191 | |||
192 | static void tmem_cleancache_flush_inode(int pool, struct cleancache_filekey key) | ||
193 | { | ||
194 | struct tmem_oid oid = *(struct tmem_oid *)&key; | ||
195 | |||
196 | if (pool < 0) | ||
197 | return; | ||
198 | (void)xen_tmem_flush_object((u32)pool, oid); | ||
199 | } | ||
200 | |||
201 | static void tmem_cleancache_flush_fs(int pool) | ||
202 | { | ||
203 | if (pool < 0) | ||
204 | return; | ||
205 | (void)xen_tmem_destroy_pool((u32)pool); | ||
206 | } | ||
207 | |||
208 | static int tmem_cleancache_init_fs(size_t pagesize) | ||
209 | { | ||
210 | struct tmem_pool_uuid uuid_private = TMEM_POOL_PRIVATE_UUID; | ||
211 | |||
212 | return xen_tmem_new_pool(uuid_private, 0, pagesize); | ||
213 | } | ||
214 | |||
215 | static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize) | ||
216 | { | ||
217 | struct tmem_pool_uuid shared_uuid; | ||
218 | |||
219 | shared_uuid.uuid_lo = *(u64 *)uuid; | ||
220 | shared_uuid.uuid_hi = *(u64 *)(&uuid[8]); | ||
221 | return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize); | ||
222 | } | ||
223 | |||
224 | static int use_cleancache = 1; | ||
225 | |||
226 | static int __init no_cleancache(char *s) | ||
227 | { | ||
228 | use_cleancache = 0; | ||
229 | return 1; | ||
230 | } | ||
231 | |||
232 | __setup("nocleancache", no_cleancache); | ||
233 | |||
234 | static struct cleancache_ops tmem_cleancache_ops = { | ||
235 | .put_page = tmem_cleancache_put_page, | ||
236 | .get_page = tmem_cleancache_get_page, | ||
237 | .flush_page = tmem_cleancache_flush_page, | ||
238 | .flush_inode = tmem_cleancache_flush_inode, | ||
239 | .flush_fs = tmem_cleancache_flush_fs, | ||
240 | .init_shared_fs = tmem_cleancache_init_shared_fs, | ||
241 | .init_fs = tmem_cleancache_init_fs | ||
242 | }; | ||
243 | |||
244 | static int __init xen_tmem_init(void) | ||
245 | { | ||
246 | struct cleancache_ops old_ops; | ||
247 | |||
248 | if (!xen_domain()) | ||
249 | return 0; | ||
250 | #ifdef CONFIG_CLEANCACHE | ||
251 | BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); | ||
252 | if (tmem_enabled && use_cleancache) { | ||
253 | char *s = ""; | ||
254 | old_ops = cleancache_register_ops(&tmem_cleancache_ops); | ||
255 | if (old_ops.init_fs != NULL) | ||
256 | s = " (WARNING: cleancache_ops overridden)"; | ||
257 | printk(KERN_INFO "cleancache enabled, RAM provided by " | ||
258 | "Xen Transcendent Memory%s\n", s); | ||
259 | } | ||
260 | #endif | ||
261 | return 0; | ||
262 | } | ||
263 | |||
264 | module_init(xen_tmem_init) | ||
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index b33257bc7e83..70213b4515eb 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h | |||
@@ -58,6 +58,7 @@ | |||
58 | #define __HYPERVISOR_event_channel_op 32 | 58 | #define __HYPERVISOR_event_channel_op 32 |
59 | #define __HYPERVISOR_physdev_op 33 | 59 | #define __HYPERVISOR_physdev_op 33 |
60 | #define __HYPERVISOR_hvm_op 34 | 60 | #define __HYPERVISOR_hvm_op 34 |
61 | #define __HYPERVISOR_tmem_op 38 | ||
61 | 62 | ||
62 | /* Architecture-specific hypercall definitions. */ | 63 | /* Architecture-specific hypercall definitions. */ |
63 | #define __HYPERVISOR_arch_0 48 | 64 | #define __HYPERVISOR_arch_0 48 |
@@ -461,6 +462,27 @@ typedef uint8_t xen_domain_handle_t[16]; | |||
461 | #define __mk_unsigned_long(x) x ## UL | 462 | #define __mk_unsigned_long(x) x ## UL |
462 | #define mk_unsigned_long(x) __mk_unsigned_long(x) | 463 | #define mk_unsigned_long(x) __mk_unsigned_long(x) |
463 | 464 | ||
465 | #define TMEM_SPEC_VERSION 1 | ||
466 | |||
467 | struct tmem_op { | ||
468 | uint32_t cmd; | ||
469 | int32_t pool_id; | ||
470 | union { | ||
471 | struct { /* for cmd == TMEM_NEW_POOL */ | ||
472 | uint64_t uuid[2]; | ||
473 | uint32_t flags; | ||
474 | } new; | ||
475 | struct { | ||
476 | uint64_t oid[3]; | ||
477 | uint32_t index; | ||
478 | uint32_t tmem_offset; | ||
479 | uint32_t pfn_offset; | ||
480 | uint32_t len; | ||
481 | GUEST_HANDLE(void) gmfn; /* guest machine page frame */ | ||
482 | } gen; | ||
483 | } u; | ||
484 | }; | ||
485 | |||
464 | #else /* __ASSEMBLY__ */ | 486 | #else /* __ASSEMBLY__ */ |
465 | 487 | ||
466 | /* In assembly code we cannot use C numeric constant suffixes. */ | 488 | /* In assembly code we cannot use C numeric constant suffixes. */ |