diff options
author | Carsten Otte <cotte@de.ibm.com> | 2008-03-25 13:47:10 -0400 |
---|---|---|
committer | Avi Kivity <avi@qumranet.com> | 2008-04-27 05:00:40 -0400 |
commit | 402b08622d9ac6e32e25289573272e0f21bb58a7 (patch) | |
tree | 40d7386154cef85c9bfd2bd862db025933820776 | |
parent | 37817f2982d0f559f90cecc66e150dd9d2c2df05 (diff) |
s390: KVM preparation: provide hook to enable pgstes in user pagetable
The SIE instruction on s390 uses the 2nd half of the page table page to
virtualize the storage keys of a guest. This patch offers the s390_enable_sie
function, which reorganizes the page tables of a single-threaded process to
reserve space in the page table:
s390_enable_sie makes sure that the process is single-threaded and then uses
dup_mm to create a new mm with reorganized page tables. The old mm is freed
and the process now has a page status extended field (pgste) after every page table.
Code that wants to exploit pgstes should SELECT CONFIG_PGSTE.
This patch has a small common code hit, namely making dup_mm non-static.
Edit (Carsten): I've modified Martin's patch, following Jeremy Fitzhardinge's
review feedback. The prototype for dup_mm is now in
include/linux/sched.h. Following Martin's suggestion, s390_enable_sie() now
calls task_lock() to prevent a race against ptrace modifying mm_users.
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Avi Kivity <avi@qumranet.com>
-rw-r--r-- | arch/s390/Kconfig | 4 | ||||
-rw-r--r-- | arch/s390/kernel/setup.c | 4 | ||||
-rw-r--r-- | arch/s390/mm/pgtable.c | 65 | ||||
-rw-r--r-- | include/asm-s390/mmu.h | 1 | ||||
-rw-r--r-- | include/asm-s390/mmu_context.h | 8 | ||||
-rw-r--r-- | include/asm-s390/pgtable.h | 1 | ||||
-rw-r--r-- | include/linux/sched.h | 2 | ||||
-rw-r--r-- | kernel/fork.c | 2 |
8 files changed, 82 insertions, 5 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f6a68e178fc5..513a0589e81d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig | |||
@@ -62,6 +62,10 @@ config GENERIC_LOCKBREAK | |||
62 | default y | 62 | default y |
63 | depends on SMP && PREEMPT | 63 | depends on SMP && PREEMPT |
64 | 64 | ||
65 | config PGSTE | ||
66 | bool | ||
67 | default y if KVM | ||
68 | |||
65 | mainmenu "Linux Kernel Configuration" | 69 | mainmenu "Linux Kernel Configuration" |
66 | 70 | ||
67 | config S390 | 71 | config S390 |
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 7141147e6b63..2f35133ebc18 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c | |||
@@ -316,7 +316,11 @@ static int __init early_parse_ipldelay(char *p) | |||
316 | early_param("ipldelay", early_parse_ipldelay); | 316 | early_param("ipldelay", early_parse_ipldelay); |
317 | 317 | ||
318 | #ifdef CONFIG_S390_SWITCH_AMODE | 318 | #ifdef CONFIG_S390_SWITCH_AMODE |
319 | #ifdef CONFIG_PGSTE | ||
320 | unsigned int switch_amode = 1; | ||
321 | #else | ||
319 | unsigned int switch_amode = 0; | 322 | unsigned int switch_amode = 0; |
323 | #endif | ||
320 | EXPORT_SYMBOL_GPL(switch_amode); | 324 | EXPORT_SYMBOL_GPL(switch_amode); |
321 | 325 | ||
322 | static void set_amode_and_uaccess(unsigned long user_amode, | 326 | static void set_amode_and_uaccess(unsigned long user_amode, |
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index fd072013f88c..5c1aea97cd12 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c | |||
@@ -30,11 +30,27 @@ | |||
30 | #define TABLES_PER_PAGE 4 | 30 | #define TABLES_PER_PAGE 4 |
31 | #define FRAG_MASK 15UL | 31 | #define FRAG_MASK 15UL |
32 | #define SECOND_HALVES 10UL | 32 | #define SECOND_HALVES 10UL |
33 | |||
34 | void clear_table_pgstes(unsigned long *table) | ||
35 | { | ||
36 | clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4); | ||
37 | memset(table + 256, 0, PAGE_SIZE/4); | ||
38 | clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4); | ||
39 | memset(table + 768, 0, PAGE_SIZE/4); | ||
40 | } | ||
41 | |||
33 | #else | 42 | #else |
34 | #define ALLOC_ORDER 2 | 43 | #define ALLOC_ORDER 2 |
35 | #define TABLES_PER_PAGE 2 | 44 | #define TABLES_PER_PAGE 2 |
36 | #define FRAG_MASK 3UL | 45 | #define FRAG_MASK 3UL |
37 | #define SECOND_HALVES 2UL | 46 | #define SECOND_HALVES 2UL |
47 | |||
48 | void clear_table_pgstes(unsigned long *table) | ||
49 | { | ||
50 | clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2); | ||
51 | memset(table + 256, 0, PAGE_SIZE/2); | ||
52 | } | ||
53 | |||
38 | #endif | 54 | #endif |
39 | 55 | ||
40 | unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec) | 56 | unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec) |
@@ -153,7 +169,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) | |||
153 | unsigned long *table; | 169 | unsigned long *table; |
154 | unsigned long bits; | 170 | unsigned long bits; |
155 | 171 | ||
156 | bits = mm->context.noexec ? 3UL : 1UL; | 172 | bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL; |
157 | spin_lock(&mm->page_table_lock); | 173 | spin_lock(&mm->page_table_lock); |
158 | page = NULL; | 174 | page = NULL; |
159 | if (!list_empty(&mm->context.pgtable_list)) { | 175 | if (!list_empty(&mm->context.pgtable_list)) { |
@@ -170,7 +186,10 @@ unsigned long *page_table_alloc(struct mm_struct *mm) | |||
170 | pgtable_page_ctor(page); | 186 | pgtable_page_ctor(page); |
171 | page->flags &= ~FRAG_MASK; | 187 | page->flags &= ~FRAG_MASK; |
172 | table = (unsigned long *) page_to_phys(page); | 188 | table = (unsigned long *) page_to_phys(page); |
173 | clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); | 189 | if (mm->context.pgstes) |
190 | clear_table_pgstes(table); | ||
191 | else | ||
192 | clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); | ||
174 | spin_lock(&mm->page_table_lock); | 193 | spin_lock(&mm->page_table_lock); |
175 | list_add(&page->lru, &mm->context.pgtable_list); | 194 | list_add(&page->lru, &mm->context.pgtable_list); |
176 | } | 195 | } |
@@ -191,7 +210,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) | |||
191 | struct page *page; | 210 | struct page *page; |
192 | unsigned long bits; | 211 | unsigned long bits; |
193 | 212 | ||
194 | bits = mm->context.noexec ? 3UL : 1UL; | 213 | bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL; |
195 | bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long); | 214 | bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long); |
196 | page = pfn_to_page(__pa(table) >> PAGE_SHIFT); | 215 | page = pfn_to_page(__pa(table) >> PAGE_SHIFT); |
197 | spin_lock(&mm->page_table_lock); | 216 | spin_lock(&mm->page_table_lock); |
@@ -228,3 +247,43 @@ void disable_noexec(struct mm_struct *mm, struct task_struct *tsk) | |||
228 | mm->context.noexec = 0; | 247 | mm->context.noexec = 0; |
229 | update_mm(mm, tsk); | 248 | update_mm(mm, tsk); |
230 | } | 249 | } |
250 | |||
251 | /* | ||
252 | * switch on pgstes for its userspace process (for kvm) | ||
253 | */ | ||
254 | int s390_enable_sie(void) | ||
255 | { | ||
256 | struct task_struct *tsk = current; | ||
257 | struct mm_struct *mm; | ||
258 | int rc; | ||
259 | |||
260 | task_lock(tsk); | ||
261 | |||
262 | rc = 0; | ||
263 | if (tsk->mm->context.pgstes) | ||
264 | goto unlock; | ||
265 | |||
266 | rc = -EINVAL; | ||
267 | if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || | ||
268 | tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) | ||
269 | goto unlock; | ||
270 | |||
271 | tsk->mm->context.pgstes = 1; /* dirty little tricks .. */ | ||
272 | mm = dup_mm(tsk); | ||
273 | tsk->mm->context.pgstes = 0; | ||
274 | |||
275 | rc = -ENOMEM; | ||
276 | if (!mm) | ||
277 | goto unlock; | ||
278 | mmput(tsk->mm); | ||
279 | tsk->mm = tsk->active_mm = mm; | ||
280 | preempt_disable(); | ||
281 | update_mm(mm, tsk); | ||
282 | cpu_set(smp_processor_id(), mm->cpu_vm_mask); | ||
283 | preempt_enable(); | ||
284 | rc = 0; | ||
285 | unlock: | ||
286 | task_unlock(tsk); | ||
287 | return rc; | ||
288 | } | ||
289 | EXPORT_SYMBOL_GPL(s390_enable_sie); | ||
diff --git a/include/asm-s390/mmu.h b/include/asm-s390/mmu.h index 1698e29c5b20..5dd5e7b3476f 100644 --- a/include/asm-s390/mmu.h +++ b/include/asm-s390/mmu.h | |||
@@ -7,6 +7,7 @@ typedef struct { | |||
7 | unsigned long asce_bits; | 7 | unsigned long asce_bits; |
8 | unsigned long asce_limit; | 8 | unsigned long asce_limit; |
9 | int noexec; | 9 | int noexec; |
10 | int pgstes; | ||
10 | } mm_context_t; | 11 | } mm_context_t; |
11 | 12 | ||
12 | #endif | 13 | #endif |
diff --git a/include/asm-s390/mmu_context.h b/include/asm-s390/mmu_context.h index b5a34c6f91a9..4c2fbf48c9c4 100644 --- a/include/asm-s390/mmu_context.h +++ b/include/asm-s390/mmu_context.h | |||
@@ -20,7 +20,13 @@ static inline int init_new_context(struct task_struct *tsk, | |||
20 | #ifdef CONFIG_64BIT | 20 | #ifdef CONFIG_64BIT |
21 | mm->context.asce_bits |= _ASCE_TYPE_REGION3; | 21 | mm->context.asce_bits |= _ASCE_TYPE_REGION3; |
22 | #endif | 22 | #endif |
23 | mm->context.noexec = s390_noexec; | 23 | if (current->mm->context.pgstes) { |
24 | mm->context.noexec = 0; | ||
25 | mm->context.pgstes = 1; | ||
26 | } else { | ||
27 | mm->context.noexec = s390_noexec; | ||
28 | mm->context.pgstes = 0; | ||
29 | } | ||
24 | mm->context.asce_limit = STACK_TOP_MAX; | 30 | mm->context.asce_limit = STACK_TOP_MAX; |
25 | crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); | 31 | crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); |
26 | return 0; | 32 | return 0; |
diff --git a/include/asm-s390/pgtable.h b/include/asm-s390/pgtable.h index 65154dc9a9e5..8e9a629dc199 100644 --- a/include/asm-s390/pgtable.h +++ b/include/asm-s390/pgtable.h | |||
@@ -966,6 +966,7 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) | |||
966 | 966 | ||
967 | extern int add_shared_memory(unsigned long start, unsigned long size); | 967 | extern int add_shared_memory(unsigned long start, unsigned long size); |
968 | extern int remove_shared_memory(unsigned long start, unsigned long size); | 968 | extern int remove_shared_memory(unsigned long start, unsigned long size); |
969 | extern int s390_enable_sie(void); | ||
969 | 970 | ||
970 | /* | 971 | /* |
971 | * No page table caches to initialise | 972 | * No page table caches to initialise |
diff --git a/include/linux/sched.h b/include/linux/sched.h index d0bd97044abd..9a4f3e63e3bf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1798,6 +1798,8 @@ extern void mmput(struct mm_struct *); | |||
1798 | extern struct mm_struct *get_task_mm(struct task_struct *task); | 1798 | extern struct mm_struct *get_task_mm(struct task_struct *task); |
1799 | /* Remove the current tasks stale references to the old mm_struct */ | 1799 | /* Remove the current tasks stale references to the old mm_struct */ |
1800 | extern void mm_release(struct task_struct *, struct mm_struct *); | 1800 | extern void mm_release(struct task_struct *, struct mm_struct *); |
1801 | /* Allocate a new mm structure and copy contents from tsk->mm */ | ||
1802 | extern struct mm_struct *dup_mm(struct task_struct *tsk); | ||
1801 | 1803 | ||
1802 | extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); | 1804 | extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); |
1803 | extern void flush_thread(void); | 1805 | extern void flush_thread(void); |
diff --git a/kernel/fork.c b/kernel/fork.c index cb46befdd3a0..c674aa8d3c31 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -521,7 +521,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
521 | * Allocate a new mm structure and copy contents from the | 521 | * Allocate a new mm structure and copy contents from the |
522 | * mm structure of the passed in task structure. | 522 | * mm structure of the passed in task structure. |
523 | */ | 523 | */ |
524 | static struct mm_struct *dup_mm(struct task_struct *tsk) | 524 | struct mm_struct *dup_mm(struct task_struct *tsk) |
525 | { | 525 | { |
526 | struct mm_struct *mm, *oldmm = current->mm; | 526 | struct mm_struct *mm, *oldmm = current->mm; |
527 | int err; | 527 | int err; |