author     Rik van Riel <riel@surriel.com>        2018-07-16 15:03:31 -0400
committer  Ingo Molnar <mingo@kernel.org>         2018-07-17 03:35:30 -0400
commit     c1a2f7f0c06454387c2cd7b93ff1491c715a8c69
tree       f3b3cce7e45b2bab54681b23a3947a445ae38a37
parent     37c45b2354cb2270f246679bedd8bf798cca351c
mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on nr_cpu_ids
The mm_struct always contains a cpumask bitmap, regardless of
CONFIG_CPUMASK_OFFSTACK. That means the first step can be to
simplify things, and always have one bitmask at the end of the
mm_struct for the mm_cpumask.
This does necessitate moving everything else in mm_struct into
an anonymous sub-structure, which can be randomized when struct
randomization is enabled.
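For illustration, the resulting layout can be sketched like this (condensed;
everything except cpu_bitmap[] stands in for the existing members):

    struct mm_struct {
    	struct {
    		/* ... all existing members, now subject to randomization ... */
    		struct vm_area_struct *mmap;	/* list of VMAs */
    		struct rb_root mm_rb;
    		/* ... */
    	} __randomize_layout;

    	/*
    	 * Must stay last: the slab object is sized at boot so that this
    	 * flexible array can hold nr_cpu_ids bits.
    	 */
    	unsigned long cpu_bitmap[];
    };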
The second step is to determine the correct size for the
mm_struct slab object from the size of the mm_struct
(excluding the CPU bitmap) and the size of the cpumask.
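In code, that size computation is a one-liner (this is what the kernel/fork.c
hunk below does; sizeof() does not count the flexible array member):

    mm_size = sizeof(struct mm_struct) + cpumask_size();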
For init_mm we can simply allocate the maximum size this
kernel is compiled for, since we only have one init_mm
in the system, anyway.
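The static initializer used for that (see the mm/init-mm.c hunk below) relies
on a designated initializer: naming index BITS_TO_LONGS(NR_CPUS) makes the
compiler size the otherwise unsized trailing array to BITS_TO_LONGS(NR_CPUS) + 1
longs, enough for NR_CPUS bits; GCC allows such initializers for objects with
static storage:

    .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0 },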
Pointer magic by Mike Galbraith, to keep -Wstringop-overflow from
getting confused by the dynamically sized array.
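The resulting helper goes through a plain unsigned long plus offsetof() rather
than taking the array's address directly (this is the code added in the
include/linux/mm_types.h hunk below):

    static inline void mm_init_cpumask(struct mm_struct *mm)
    {
    	unsigned long cpu_bitmap = (unsigned long)mm;

    	cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
    	cpumask_clear((struct cpumask *)cpu_bitmap);
    }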
Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Rik van Riel <riel@surriel.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team@fb.com
Cc: luto@kernel.org
Link: http://lkml.kernel.org/r/20180716190337.26133-2-riel@surriel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--  drivers/firmware/efi/efi.c |   1
-rw-r--r--  include/linux/mm_types.h   | 241
-rw-r--r--  kernel/fork.c              |  15
-rw-r--r--  mm/init-mm.c               |  11
4 files changed, 145 insertions(+), 123 deletions(-)
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 232f4915223b..7f0b19410a95 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -82,6 +82,7 @@ struct mm_struct efi_mm = {
 	.mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem),
 	.page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
 	.mmlist = LIST_HEAD_INIT(efi_mm.mmlist),
+	.cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0},
 };
 
 static bool disable_runtime;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 99ce070e7dcb..efdc24dd9e97 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -335,176 +335,183 @@ struct core_state {
335 | 335 | ||
336 | struct kioctx_table; | 336 | struct kioctx_table; |
337 | struct mm_struct { | 337 | struct mm_struct { |
338 | struct vm_area_struct *mmap; /* list of VMAs */ | 338 | struct { |
339 | struct rb_root mm_rb; | 339 | struct vm_area_struct *mmap; /* list of VMAs */ |
340 | u32 vmacache_seqnum; /* per-thread vmacache */ | 340 | struct rb_root mm_rb; |
341 | u32 vmacache_seqnum; /* per-thread vmacache */ | ||
341 | #ifdef CONFIG_MMU | 342 | #ifdef CONFIG_MMU |
342 | unsigned long (*get_unmapped_area) (struct file *filp, | 343 | unsigned long (*get_unmapped_area) (struct file *filp, |
343 | unsigned long addr, unsigned long len, | 344 | unsigned long addr, unsigned long len, |
344 | unsigned long pgoff, unsigned long flags); | 345 | unsigned long pgoff, unsigned long flags); |
345 | #endif | 346 | #endif |
346 | unsigned long mmap_base; /* base of mmap area */ | 347 | unsigned long mmap_base; /* base of mmap area */ |
347 | unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ | 348 | unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ |
348 | #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES | 349 | #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES |
349 | /* Base adresses for compatible mmap() */ | 350 | /* Base adresses for compatible mmap() */ |
350 | unsigned long mmap_compat_base; | 351 | unsigned long mmap_compat_base; |
351 | unsigned long mmap_compat_legacy_base; | 352 | unsigned long mmap_compat_legacy_base; |
352 | #endif | 353 | #endif |
353 | unsigned long task_size; /* size of task vm space */ | 354 | unsigned long task_size; /* size of task vm space */ |
354 | unsigned long highest_vm_end; /* highest vma end address */ | 355 | unsigned long highest_vm_end; /* highest vma end address */ |
355 | pgd_t * pgd; | 356 | pgd_t * pgd; |
356 | 357 | ||
357 | /** | 358 | /** |
358 | * @mm_users: The number of users including userspace. | 359 | * @mm_users: The number of users including userspace. |
359 | * | 360 | * |
360 | * Use mmget()/mmget_not_zero()/mmput() to modify. When this drops | 361 | * Use mmget()/mmget_not_zero()/mmput() to modify. When this |
361 | * to 0 (i.e. when the task exits and there are no other temporary | 362 | * drops to 0 (i.e. when the task exits and there are no other |
362 | * reference holders), we also release a reference on @mm_count | 363 | * temporary reference holders), we also release a reference on |
363 | * (which may then free the &struct mm_struct if @mm_count also | 364 | * @mm_count (which may then free the &struct mm_struct if |
364 | * drops to 0). | 365 | * @mm_count also drops to 0). |
365 | */ | 366 | */ |
366 | atomic_t mm_users; | 367 | atomic_t mm_users; |
367 | 368 | ||
368 | /** | 369 | /** |
369 | * @mm_count: The number of references to &struct mm_struct | 370 | * @mm_count: The number of references to &struct mm_struct |
370 | * (@mm_users count as 1). | 371 | * (@mm_users count as 1). |
371 | * | 372 | * |
372 | * Use mmgrab()/mmdrop() to modify. When this drops to 0, the | 373 | * Use mmgrab()/mmdrop() to modify. When this drops to 0, the |
373 | * &struct mm_struct is freed. | 374 | * &struct mm_struct is freed. |
374 | */ | 375 | */ |
375 | atomic_t mm_count; | 376 | atomic_t mm_count; |
376 | 377 | ||
377 | #ifdef CONFIG_MMU | 378 | #ifdef CONFIG_MMU |
378 | atomic_long_t pgtables_bytes; /* PTE page table pages */ | 379 | atomic_long_t pgtables_bytes; /* PTE page table pages */ |
379 | #endif | 380 | #endif |
380 | int map_count; /* number of VMAs */ | 381 | int map_count; /* number of VMAs */ |
381 | 382 | ||
382 | spinlock_t page_table_lock; /* Protects page tables and some counters */ | 383 | spinlock_t page_table_lock; /* Protects page tables and some |
383 | struct rw_semaphore mmap_sem; | 384 | * counters |
385 | */ | ||
386 | struct rw_semaphore mmap_sem; | ||
384 | 387 | ||
385 | struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung | 388 | struct list_head mmlist; /* List of maybe swapped mm's. These |
386 | * together off init_mm.mmlist, and are protected | 389 | * are globally strung together off |
387 | * by mmlist_lock | 390 | * init_mm.mmlist, and are protected |
388 | */ | 391 | * by mmlist_lock |
392 | */ | ||
389 | 393 | ||
390 | 394 | ||
391 | unsigned long hiwater_rss; /* High-watermark of RSS usage */ | 395 | unsigned long hiwater_rss; /* High-watermark of RSS usage */ |
392 | unsigned long hiwater_vm; /* High-water virtual memory usage */ | 396 | unsigned long hiwater_vm; /* High-water virtual memory usage */ |
393 | 397 | ||
394 | unsigned long total_vm; /* Total pages mapped */ | 398 | unsigned long total_vm; /* Total pages mapped */ |
395 | unsigned long locked_vm; /* Pages that have PG_mlocked set */ | 399 | unsigned long locked_vm; /* Pages that have PG_mlocked set */ |
396 | unsigned long pinned_vm; /* Refcount permanently increased */ | 400 | unsigned long pinned_vm; /* Refcount permanently increased */ |
397 | unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ | 401 | unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ |
398 | unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ | 402 | unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ |
399 | unsigned long stack_vm; /* VM_STACK */ | 403 | unsigned long stack_vm; /* VM_STACK */ |
400 | unsigned long def_flags; | 404 | unsigned long def_flags; |
401 | 405 | ||
402 | spinlock_t arg_lock; /* protect the below fields */ | 406 | spinlock_t arg_lock; /* protect the below fields */ |
403 | unsigned long start_code, end_code, start_data, end_data; | 407 | unsigned long start_code, end_code, start_data, end_data; |
404 | unsigned long start_brk, brk, start_stack; | 408 | unsigned long start_brk, brk, start_stack; |
405 | unsigned long arg_start, arg_end, env_start, env_end; | 409 | unsigned long arg_start, arg_end, env_start, env_end; |
406 | 410 | ||
407 | unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ | 411 | unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ |
408 | 412 | ||
409 | /* | 413 | /* |
410 | * Special counters, in some configurations protected by the | 414 | * Special counters, in some configurations protected by the |
411 | * page_table_lock, in other configurations by being atomic. | 415 | * page_table_lock, in other configurations by being atomic. |
412 | */ | 416 | */ |
413 | struct mm_rss_stat rss_stat; | 417 | struct mm_rss_stat rss_stat; |
414 | |||
415 | struct linux_binfmt *binfmt; | ||
416 | 418 | ||
417 | cpumask_var_t cpu_vm_mask_var; | 419 | struct linux_binfmt *binfmt; |
418 | 420 | ||
419 | /* Architecture-specific MM context */ | 421 | /* Architecture-specific MM context */ |
420 | mm_context_t context; | 422 | mm_context_t context; |
421 | 423 | ||
422 | unsigned long flags; /* Must use atomic bitops to access the bits */ | 424 | unsigned long flags; /* Must use atomic bitops to access */ |
423 | 425 | ||
424 | struct core_state *core_state; /* coredumping support */ | 426 | struct core_state *core_state; /* coredumping support */ |
425 | #ifdef CONFIG_MEMBARRIER | 427 | #ifdef CONFIG_MEMBARRIER |
426 | atomic_t membarrier_state; | 428 | atomic_t membarrier_state; |
427 | #endif | 429 | #endif |
428 | #ifdef CONFIG_AIO | 430 | #ifdef CONFIG_AIO |
429 | spinlock_t ioctx_lock; | 431 | spinlock_t ioctx_lock; |
430 | struct kioctx_table __rcu *ioctx_table; | 432 | struct kioctx_table __rcu *ioctx_table; |
431 | #endif | 433 | #endif |
432 | #ifdef CONFIG_MEMCG | 434 | #ifdef CONFIG_MEMCG |
433 | /* | 435 | /* |
434 | * "owner" points to a task that is regarded as the canonical | 436 | * "owner" points to a task that is regarded as the canonical |
435 | * user/owner of this mm. All of the following must be true in | 437 | * user/owner of this mm. All of the following must be true in |
436 | * order for it to be changed: | 438 | * order for it to be changed: |
437 | * | 439 | * |
438 | * current == mm->owner | 440 | * current == mm->owner |
439 | * current->mm != mm | 441 | * current->mm != mm |
440 | * new_owner->mm == mm | 442 | * new_owner->mm == mm |
441 | * new_owner->alloc_lock is held | 443 | * new_owner->alloc_lock is held |
442 | */ | 444 | */ |
443 | struct task_struct __rcu *owner; | 445 | struct task_struct __rcu *owner; |
444 | #endif | 446 | #endif |
445 | struct user_namespace *user_ns; | 447 | struct user_namespace *user_ns; |
446 | 448 | ||
447 | /* store ref to file /proc/<pid>/exe symlink points to */ | 449 | /* store ref to file /proc/<pid>/exe symlink points to */ |
448 | struct file __rcu *exe_file; | 450 | struct file __rcu *exe_file; |
449 | #ifdef CONFIG_MMU_NOTIFIER | 451 | #ifdef CONFIG_MMU_NOTIFIER |
450 | struct mmu_notifier_mm *mmu_notifier_mm; | 452 | struct mmu_notifier_mm *mmu_notifier_mm; |
451 | #endif | 453 | #endif |
452 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS | 454 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
453 | pgtable_t pmd_huge_pte; /* protected by page_table_lock */ | 455 | pgtable_t pmd_huge_pte; /* protected by page_table_lock */ |
454 | #endif | ||
455 | #ifdef CONFIG_CPUMASK_OFFSTACK | ||
456 | struct cpumask cpumask_allocation; | ||
457 | #endif | 456 | #endif |
458 | #ifdef CONFIG_NUMA_BALANCING | 457 | #ifdef CONFIG_NUMA_BALANCING |
459 | /* | 458 | /* |
460 | * numa_next_scan is the next time that the PTEs will be marked | 459 | * numa_next_scan is the next time that the PTEs will be marked |
461 | * pte_numa. NUMA hinting faults will gather statistics and migrate | 460 | * pte_numa. NUMA hinting faults will gather statistics and |
462 | * pages to new nodes if necessary. | 461 | * migrate pages to new nodes if necessary. |
463 | */ | 462 | */ |
464 | unsigned long numa_next_scan; | 463 | unsigned long numa_next_scan; |
465 | 464 | ||
466 | /* Restart point for scanning and setting pte_numa */ | 465 | /* Restart point for scanning and setting pte_numa */ |
467 | unsigned long numa_scan_offset; | 466 | unsigned long numa_scan_offset; |
468 | 467 | ||
469 | /* numa_scan_seq prevents two threads setting pte_numa */ | 468 | /* numa_scan_seq prevents two threads setting pte_numa */ |
470 | int numa_scan_seq; | 469 | int numa_scan_seq; |
471 | #endif | 470 | #endif |
472 | /* | 471 | /* |
473 | * An operation with batched TLB flushing is going on. Anything that | 472 | * An operation with batched TLB flushing is going on. Anything |
474 | * can move process memory needs to flush the TLB when moving a | 473 | * that can move process memory needs to flush the TLB when |
475 | * PROT_NONE or PROT_NUMA mapped page. | 474 | * moving a PROT_NONE or PROT_NUMA mapped page. |
476 | */ | 475 | */ |
477 | atomic_t tlb_flush_pending; | 476 | atomic_t tlb_flush_pending; |
478 | #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH | 477 | #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH |
479 | /* See flush_tlb_batched_pending() */ | 478 | /* See flush_tlb_batched_pending() */ |
480 | bool tlb_flush_batched; | 479 | bool tlb_flush_batched; |
481 | #endif | 480 | #endif |
482 | struct uprobes_state uprobes_state; | 481 | struct uprobes_state uprobes_state; |
483 | #ifdef CONFIG_HUGETLB_PAGE | 482 | #ifdef CONFIG_HUGETLB_PAGE |
484 | atomic_long_t hugetlb_usage; | 483 | atomic_long_t hugetlb_usage; |
485 | #endif | 484 | #endif |
486 | struct work_struct async_put_work; | 485 | struct work_struct async_put_work; |
487 | 486 | ||
488 | #if IS_ENABLED(CONFIG_HMM) | 487 | #if IS_ENABLED(CONFIG_HMM) |
489 | /* HMM needs to track a few things per mm */ | 488 | /* HMM needs to track a few things per mm */ |
490 | struct hmm *hmm; | 489 | struct hmm *hmm; |
491 | #endif | 490 | #endif |
492 | } __randomize_layout; | 491 | } __randomize_layout; |
492 | |||
493 | /* | ||
494 | * The mm_cpumask needs to be at the end of mm_struct, because it | ||
495 | * is dynamically sized based on nr_cpu_ids. | ||
496 | */ | ||
497 | unsigned long cpu_bitmap[]; | ||
498 | }; | ||
493 | 499 | ||
494 | extern struct mm_struct init_mm; | 500 | extern struct mm_struct init_mm; |
495 | 501 | ||
502 | /* Pointer magic because the dynamic array size confuses some compilers. */ | ||
496 | static inline void mm_init_cpumask(struct mm_struct *mm) | 503 | static inline void mm_init_cpumask(struct mm_struct *mm) |
497 | { | 504 | { |
498 | #ifdef CONFIG_CPUMASK_OFFSTACK | 505 | unsigned long cpu_bitmap = (unsigned long)mm; |
499 | mm->cpu_vm_mask_var = &mm->cpumask_allocation; | 506 | |
500 | #endif | 507 | cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap); |
501 | cpumask_clear(mm->cpu_vm_mask_var); | 508 | cpumask_clear((struct cpumask *)cpu_bitmap); |
502 | } | 509 | } |
503 | 510 | ||
504 | /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ | 511 | /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ |
505 | static inline cpumask_t *mm_cpumask(struct mm_struct *mm) | 512 | static inline cpumask_t *mm_cpumask(struct mm_struct *mm) |
506 | { | 513 | { |
507 | return mm->cpu_vm_mask_var; | 514 | return (struct cpumask *)&mm->cpu_bitmap; |
508 | } | 515 | } |
509 | 516 | ||
510 | struct mmu_gather; | 517 | struct mmu_gather; |
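For reference, callers of the accessor are unaffected by this change; a typical
existing usage pattern (not part of this patch, and next_mm is only a
placeholder name) looks like:

    /* e.g. at context-switch time, mark this CPU as actively using the mm */
    cpumask_set_cpu(smp_processor_id(), mm_cpumask(next_mm));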
diff --git a/kernel/fork.c b/kernel/fork.c
index 9440d61b925c..5b64c1b8461e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2253,6 +2253,8 @@ static void sighand_ctor(void *data)
 
 void __init proc_caches_init(void)
 {
+	unsigned int mm_size;
+
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -2269,15 +2271,16 @@ void __init proc_caches_init(void)
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
+
 	/*
-	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
-	 * whole struct cpumask for the OFFSTACK case. We could change
-	 * this to *only* allocate as much of it as required by the
-	 * maximum number of CPU's we can ever have. The cpumask_allocation
-	 * is at the end of the structure, exactly for that reason.
+	 * The mm_cpumask is located at the end of mm_struct, and is
+	 * dynamically sized based on the maximum CPU number this system
+	 * can have, taking hotplug into account (nr_cpu_ids).
 	 */
+	mm_size = sizeof(struct mm_struct) + cpumask_size();
+
 	mm_cachep = kmem_cache_create_usercopy("mm_struct",
-			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
+			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			offsetof(struct mm_struct, saved_auxv),
 			sizeof_field(struct mm_struct, saved_auxv),
diff --git a/mm/init-mm.c b/mm/init-mm.c
index f0179c9c04c2..a787a319211e 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -15,6 +15,16 @@
 #define INIT_MM_CONTEXT(name)
 #endif
 
+/*
+ * For dynamically allocated mm_structs, there is a dynamically sized cpumask
+ * at the end of the structure, the size of which depends on the maximum CPU
+ * number the system can see. That way we allocate only as much memory for
+ * mm_cpumask() as needed for the hundreds, or thousands of processes that
+ * a system typically runs.
+ *
+ * Since there is only one init_mm in the entire system, keep it simple
+ * and size this cpu_bitmask to NR_CPUS.
+ */
 struct mm_struct init_mm = {
 	.mm_rb = RB_ROOT,
 	.pgd = swapper_pg_dir,
@@ -25,5 +35,6 @@ struct mm_struct init_mm = {
 	.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
 	.user_ns = &init_user_ns,
+	.cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0},
 	INIT_MM_CONTEXT(init_mm)
 };