author    Rik van Riel <riel@surriel.com>    2018-07-16 15:03:31 -0400
committer Ingo Molnar <mingo@kernel.org>     2018-07-17 03:35:30 -0400
commit    c1a2f7f0c06454387c2cd7b93ff1491c715a8c69 (patch)
tree      f3b3cce7e45b2bab54681b23a3947a445ae38a37
parent    37c45b2354cb2270f246679bedd8bf798cca351c (diff)
mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on nr_cpu_ids
The mm_struct always contains a cpumask bitmap, regardless of
CONFIG_CPUMASK_OFFSTACK. That means the first step can be to simplify
things, and simply have one bitmask at the end of the mm_struct for the
mm_cpumask.

This does necessitate moving everything else in mm_struct into an
anonymous sub-structure, which can be randomized when struct
randomization is enabled.

The second step is to determine the correct size for the mm_struct slab
object from the size of the mm_struct (excluding the CPU bitmap) and the
size of the cpumask.

For init_mm we can simply allocate the maximum size this kernel is
compiled for, since we only have one init_mm in the system, anyway.

Pointer magic by Mike Galbraith, to evade -Wstringop-overflow getting
confused by the dynamically sized array.

Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Rik van Riel <riel@surriel.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team@fb.com
Cc: luto@kernel.org
Link: http://lkml.kernel.org/r/20180716190337.26133-2-riel@surriel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--  drivers/firmware/efi/efi.c  |   1
-rw-r--r--  include/linux/mm_types.h    | 241
-rw-r--r--  kernel/fork.c               |  15
-rw-r--r--  mm/init-mm.c                |  11
4 files changed, 145 insertions(+), 123 deletions(-)
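The change is easiest to see outside the kernel. Below is a minimal,
illustrative userspace sketch of the same pattern: a flexible array member
at the end of the structure, with the allocation sized from the runtime CPU
count rather than the compile-time maximum. All names in it (struct mm_like,
alloc_mm_like, nr_possible_cpus) are stand-ins, not kernel API:

  /* Userspace sketch of the sizing pattern described above. */
  #include <stdlib.h>
  #include <string.h>

  #define BITS_PER_LONG     (8 * sizeof(unsigned long))
  #define BITS_TO_LONGS(n)  (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

  struct mm_like {
      long other_fields[32];          /* stand-in for the real members   */
      unsigned long cpu_bitmap[];     /* one bit per CPU, sized at alloc */
  };

  static struct mm_like *alloc_mm_like(unsigned int nr_possible_cpus)
  {
      size_t bitmap_bytes = BITS_TO_LONGS(nr_possible_cpus) * sizeof(unsigned long);
      struct mm_like *mm = malloc(sizeof(*mm) + bitmap_bytes);

      if (mm)
          memset(mm->cpu_bitmap, 0, bitmap_bytes);   /* start with an empty mask */
      return mm;
  }

  int main(void)
  {
      struct mm_like *mm = alloc_mm_like(8);   /* a machine that booted with 8 CPUs */

      free(mm);
      return 0;
  }

In the kernel the sizing happens once, when the mm_struct slab cache is
created (see the kernel/fork.c hunk below), so every mm_struct allocated
afterwards automatically carries a bitmap large enough for this boot's
nr_cpu_ids.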
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 232f4915223b..7f0b19410a95 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -82,6 +82,7 @@ struct mm_struct efi_mm = {
 	.mmap_sem		= __RWSEM_INITIALIZER(efi_mm.mmap_sem),
 	.page_table_lock	= __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
 	.mmlist			= LIST_HEAD_INIT(efi_mm.mmlist),
+	.cpu_bitmap		= { [BITS_TO_LONGS(NR_CPUS)] = 0},
 };
 
 static bool disable_runtime;
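efi_mm is never allocated from the mm_struct slab cache, so it cannot rely
on the runtime sizing; the designated initializer added above makes the
compiler reserve storage for the compile-time maximum (NR_CPUS) instead.
Statically initializing a flexible array member is a GNU C extension the
kernel build relies on. A small, self-contained sketch of the trick, with
made-up names (statically_sized, NR_CPUS_DEMO):

  /* Sketch of the static-initialization trick used for efi_mm (and for
   * init_mm later in this patch). The [BITS_TO_LONGS(N)] = 0 designator
   * makes the compiler reserve storage for this one object even though
   * the member is declared as a flexible array. */
  #define BITS_PER_LONG     (8 * sizeof(unsigned long))
  #define BITS_TO_LONGS(n)  (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
  #define NR_CPUS_DEMO      64     /* stand-in for the kernel's NR_CPUS */

  struct statically_sized {
      int dummy;
      unsigned long bitmap[];
  };

  static struct statically_sized the_one = {
      .dummy  = 0,
      .bitmap = { [BITS_TO_LONGS(NR_CPUS_DEMO)] = 0 },
  };

  int main(void)
  {
      /* The storage really is there: the last reserved word is writable. */
      the_one.bitmap[BITS_TO_LONGS(NR_CPUS_DEMO)] = ~0UL;
      return 0;
  }

Note that the [BITS_TO_LONGS(N)] = 0 designator reserves
BITS_TO_LONGS(N) + 1 longs, one word more than N bits strictly need, which
is harmless for a single static object.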
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 99ce070e7dcb..efdc24dd9e97 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -335,176 +335,183 @@ struct core_state {
335 335
336struct kioctx_table; 336struct kioctx_table;
337struct mm_struct { 337struct mm_struct {
338 struct vm_area_struct *mmap; /* list of VMAs */ 338 struct {
339 struct rb_root mm_rb; 339 struct vm_area_struct *mmap; /* list of VMAs */
340 u32 vmacache_seqnum; /* per-thread vmacache */ 340 struct rb_root mm_rb;
341 u32 vmacache_seqnum; /* per-thread vmacache */
341#ifdef CONFIG_MMU 342#ifdef CONFIG_MMU
342 unsigned long (*get_unmapped_area) (struct file *filp, 343 unsigned long (*get_unmapped_area) (struct file *filp,
343 unsigned long addr, unsigned long len, 344 unsigned long addr, unsigned long len,
344 unsigned long pgoff, unsigned long flags); 345 unsigned long pgoff, unsigned long flags);
345#endif 346#endif
346 unsigned long mmap_base; /* base of mmap area */ 347 unsigned long mmap_base; /* base of mmap area */
347 unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ 348 unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
348#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES 349#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
349 /* Base adresses for compatible mmap() */ 350 /* Base adresses for compatible mmap() */
350 unsigned long mmap_compat_base; 351 unsigned long mmap_compat_base;
351 unsigned long mmap_compat_legacy_base; 352 unsigned long mmap_compat_legacy_base;
352#endif 353#endif
353 unsigned long task_size; /* size of task vm space */ 354 unsigned long task_size; /* size of task vm space */
354 unsigned long highest_vm_end; /* highest vma end address */ 355 unsigned long highest_vm_end; /* highest vma end address */
355 pgd_t * pgd; 356 pgd_t * pgd;
356 357
357 /** 358 /**
358 * @mm_users: The number of users including userspace. 359 * @mm_users: The number of users including userspace.
359 * 360 *
360 * Use mmget()/mmget_not_zero()/mmput() to modify. When this drops 361 * Use mmget()/mmget_not_zero()/mmput() to modify. When this
361 * to 0 (i.e. when the task exits and there are no other temporary 362 * drops to 0 (i.e. when the task exits and there are no other
362 * reference holders), we also release a reference on @mm_count 363 * temporary reference holders), we also release a reference on
363 * (which may then free the &struct mm_struct if @mm_count also 364 * @mm_count (which may then free the &struct mm_struct if
364 * drops to 0). 365 * @mm_count also drops to 0).
365 */ 366 */
366 atomic_t mm_users; 367 atomic_t mm_users;
367 368
368 /** 369 /**
369 * @mm_count: The number of references to &struct mm_struct 370 * @mm_count: The number of references to &struct mm_struct
370 * (@mm_users count as 1). 371 * (@mm_users count as 1).
371 * 372 *
372 * Use mmgrab()/mmdrop() to modify. When this drops to 0, the 373 * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
373 * &struct mm_struct is freed. 374 * &struct mm_struct is freed.
374 */ 375 */
375 atomic_t mm_count; 376 atomic_t mm_count;
376 377
377#ifdef CONFIG_MMU 378#ifdef CONFIG_MMU
378 atomic_long_t pgtables_bytes; /* PTE page table pages */ 379 atomic_long_t pgtables_bytes; /* PTE page table pages */
379#endif 380#endif
380 int map_count; /* number of VMAs */ 381 int map_count; /* number of VMAs */
381 382
382 spinlock_t page_table_lock; /* Protects page tables and some counters */ 383 spinlock_t page_table_lock; /* Protects page tables and some
383 struct rw_semaphore mmap_sem; 384 * counters
385 */
386 struct rw_semaphore mmap_sem;
384 387
385 struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung 388 struct list_head mmlist; /* List of maybe swapped mm's. These
386 * together off init_mm.mmlist, and are protected 389 * are globally strung together off
387 * by mmlist_lock 390 * init_mm.mmlist, and are protected
388 */ 391 * by mmlist_lock
392 */
389 393
390 394
391 unsigned long hiwater_rss; /* High-watermark of RSS usage */ 395 unsigned long hiwater_rss; /* High-watermark of RSS usage */
392 unsigned long hiwater_vm; /* High-water virtual memory usage */ 396 unsigned long hiwater_vm; /* High-water virtual memory usage */
393 397
394 unsigned long total_vm; /* Total pages mapped */ 398 unsigned long total_vm; /* Total pages mapped */
395 unsigned long locked_vm; /* Pages that have PG_mlocked set */ 399 unsigned long locked_vm; /* Pages that have PG_mlocked set */
396 unsigned long pinned_vm; /* Refcount permanently increased */ 400 unsigned long pinned_vm; /* Refcount permanently increased */
397 unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ 401 unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
398 unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ 402 unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
399 unsigned long stack_vm; /* VM_STACK */ 403 unsigned long stack_vm; /* VM_STACK */
400 unsigned long def_flags; 404 unsigned long def_flags;
401 405
402 spinlock_t arg_lock; /* protect the below fields */ 406 spinlock_t arg_lock; /* protect the below fields */
403 unsigned long start_code, end_code, start_data, end_data; 407 unsigned long start_code, end_code, start_data, end_data;
404 unsigned long start_brk, brk, start_stack; 408 unsigned long start_brk, brk, start_stack;
405 unsigned long arg_start, arg_end, env_start, env_end; 409 unsigned long arg_start, arg_end, env_start, env_end;
406 410
407 unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ 411 unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
408 412
409 /* 413 /*
410 * Special counters, in some configurations protected by the 414 * Special counters, in some configurations protected by the
411 * page_table_lock, in other configurations by being atomic. 415 * page_table_lock, in other configurations by being atomic.
412 */ 416 */
413 struct mm_rss_stat rss_stat; 417 struct mm_rss_stat rss_stat;
414
415 struct linux_binfmt *binfmt;
416 418
417 cpumask_var_t cpu_vm_mask_var; 419 struct linux_binfmt *binfmt;
418 420
419 /* Architecture-specific MM context */ 421 /* Architecture-specific MM context */
420 mm_context_t context; 422 mm_context_t context;
421 423
422 unsigned long flags; /* Must use atomic bitops to access the bits */ 424 unsigned long flags; /* Must use atomic bitops to access */
423 425
424 struct core_state *core_state; /* coredumping support */ 426 struct core_state *core_state; /* coredumping support */
425#ifdef CONFIG_MEMBARRIER 427#ifdef CONFIG_MEMBARRIER
426 atomic_t membarrier_state; 428 atomic_t membarrier_state;
427#endif 429#endif
428#ifdef CONFIG_AIO 430#ifdef CONFIG_AIO
429 spinlock_t ioctx_lock; 431 spinlock_t ioctx_lock;
430 struct kioctx_table __rcu *ioctx_table; 432 struct kioctx_table __rcu *ioctx_table;
431#endif 433#endif
432#ifdef CONFIG_MEMCG 434#ifdef CONFIG_MEMCG
433 /* 435 /*
434 * "owner" points to a task that is regarded as the canonical 436 * "owner" points to a task that is regarded as the canonical
435 * user/owner of this mm. All of the following must be true in 437 * user/owner of this mm. All of the following must be true in
436 * order for it to be changed: 438 * order for it to be changed:
437 * 439 *
438 * current == mm->owner 440 * current == mm->owner
439 * current->mm != mm 441 * current->mm != mm
440 * new_owner->mm == mm 442 * new_owner->mm == mm
441 * new_owner->alloc_lock is held 443 * new_owner->alloc_lock is held
442 */ 444 */
443 struct task_struct __rcu *owner; 445 struct task_struct __rcu *owner;
444#endif 446#endif
445 struct user_namespace *user_ns; 447 struct user_namespace *user_ns;
446 448
447 /* store ref to file /proc/<pid>/exe symlink points to */ 449 /* store ref to file /proc/<pid>/exe symlink points to */
448 struct file __rcu *exe_file; 450 struct file __rcu *exe_file;
449#ifdef CONFIG_MMU_NOTIFIER 451#ifdef CONFIG_MMU_NOTIFIER
450 struct mmu_notifier_mm *mmu_notifier_mm; 452 struct mmu_notifier_mm *mmu_notifier_mm;
451#endif 453#endif
452#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS 454#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
453 pgtable_t pmd_huge_pte; /* protected by page_table_lock */ 455 pgtable_t pmd_huge_pte; /* protected by page_table_lock */
454#endif
455#ifdef CONFIG_CPUMASK_OFFSTACK
456 struct cpumask cpumask_allocation;
457#endif 456#endif
458#ifdef CONFIG_NUMA_BALANCING 457#ifdef CONFIG_NUMA_BALANCING
459 /* 458 /*
460 * numa_next_scan is the next time that the PTEs will be marked 459 * numa_next_scan is the next time that the PTEs will be marked
461 * pte_numa. NUMA hinting faults will gather statistics and migrate 460 * pte_numa. NUMA hinting faults will gather statistics and
462 * pages to new nodes if necessary. 461 * migrate pages to new nodes if necessary.
463 */ 462 */
464 unsigned long numa_next_scan; 463 unsigned long numa_next_scan;
465 464
466 /* Restart point for scanning and setting pte_numa */ 465 /* Restart point for scanning and setting pte_numa */
467 unsigned long numa_scan_offset; 466 unsigned long numa_scan_offset;
468 467
469 /* numa_scan_seq prevents two threads setting pte_numa */ 468 /* numa_scan_seq prevents two threads setting pte_numa */
470 int numa_scan_seq; 469 int numa_scan_seq;
471#endif 470#endif
472 /* 471 /*
473 * An operation with batched TLB flushing is going on. Anything that 472 * An operation with batched TLB flushing is going on. Anything
474 * can move process memory needs to flush the TLB when moving a 473 * that can move process memory needs to flush the TLB when
475 * PROT_NONE or PROT_NUMA mapped page. 474 * moving a PROT_NONE or PROT_NUMA mapped page.
476 */ 475 */
477 atomic_t tlb_flush_pending; 476 atomic_t tlb_flush_pending;
478#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH 477#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
479 /* See flush_tlb_batched_pending() */ 478 /* See flush_tlb_batched_pending() */
480 bool tlb_flush_batched; 479 bool tlb_flush_batched;
481#endif 480#endif
482 struct uprobes_state uprobes_state; 481 struct uprobes_state uprobes_state;
483#ifdef CONFIG_HUGETLB_PAGE 482#ifdef CONFIG_HUGETLB_PAGE
484 atomic_long_t hugetlb_usage; 483 atomic_long_t hugetlb_usage;
485#endif 484#endif
486 struct work_struct async_put_work; 485 struct work_struct async_put_work;
487 486
488#if IS_ENABLED(CONFIG_HMM) 487#if IS_ENABLED(CONFIG_HMM)
489 /* HMM needs to track a few things per mm */ 488 /* HMM needs to track a few things per mm */
490 struct hmm *hmm; 489 struct hmm *hmm;
491#endif 490#endif
492} __randomize_layout; 491 } __randomize_layout;
492
493 /*
494 * The mm_cpumask needs to be at the end of mm_struct, because it
495 * is dynamically sized based on nr_cpu_ids.
496 */
497 unsigned long cpu_bitmap[];
498};
493 499
494extern struct mm_struct init_mm; 500extern struct mm_struct init_mm;
495 501
502/* Pointer magic because the dynamic array size confuses some compilers. */
496static inline void mm_init_cpumask(struct mm_struct *mm) 503static inline void mm_init_cpumask(struct mm_struct *mm)
497{ 504{
498#ifdef CONFIG_CPUMASK_OFFSTACK 505 unsigned long cpu_bitmap = (unsigned long)mm;
499 mm->cpu_vm_mask_var = &mm->cpumask_allocation; 506
500#endif 507 cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
501 cpumask_clear(mm->cpu_vm_mask_var); 508 cpumask_clear((struct cpumask *)cpu_bitmap);
502} 509}
503 510
504/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ 511/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
505static inline cpumask_t *mm_cpumask(struct mm_struct *mm) 512static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
506{ 513{
507 return mm->cpu_vm_mask_var; 514 return (struct cpumask *)&mm->cpu_bitmap;
508} 515}
509 516
510struct mmu_gather; 517struct mmu_gather;
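Two details of the mm_types.h hunk above are worth spelling out. First, the
existing fields move into an anonymous structure that carries
__randomize_layout, so structure randomization can still shuffle them while
the trailing cpu_bitmap[] stays at the very end, where the dynamic sizing
needs it. Second, mm_init_cpumask() now reaches the bitmap through
offsetof() arithmetic rather than through the member itself; that is the
"pointer magic" from the commit message, used because -Wstringop-overflow
otherwise gets confused by a write through a flexible array member. A
hedged userspace sketch of that equivalence (struct obj and the helper
names are illustrative only):

  /* Both helpers clear the same bytes; the second computes the address
   * via offsetof() so the compiler's object-size tracking never sees a
   * write through the flexible array member. */
  #include <stddef.h>
  #include <stdlib.h>
  #include <string.h>

  struct obj {
      int refcount;
      unsigned long bitmap[];              /* trailing, dynamically sized */
  };

  static void clear_direct(struct obj *o, size_t bytes)
  {
      memset(o->bitmap, 0, bytes);         /* the form that confused the warning */
  }

  static void clear_via_offsetof(struct obj *o, size_t bytes)
  {
      unsigned long addr = (unsigned long)o;   /* kernel-style; uintptr_t in portable code */

      addr += offsetof(struct obj, bitmap);
      memset((void *)addr, 0, bytes);
  }

  int main(void)
  {
      size_t bytes = 4 * sizeof(unsigned long);
      struct obj *o = malloc(sizeof(*o) + bytes);

      if (o) {
          clear_direct(o, bytes);
          clear_via_offsetof(o, bytes);
          free(o);
      }
      return 0;
  }

Callers of mm_cpumask() are unaffected: the accessor still returns a
struct cpumask *, it simply points into cpu_bitmap[] now instead of at the
old cpu_vm_mask_var.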
diff --git a/kernel/fork.c b/kernel/fork.c
index 9440d61b925c..5b64c1b8461e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2253,6 +2253,8 @@ static void sighand_ctor(void *data)
 
 void __init proc_caches_init(void)
 {
+	unsigned int mm_size;
+
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -2269,15 +2271,16 @@ void __init proc_caches_init(void)
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
+
 	/*
-	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
-	 * whole struct cpumask for the OFFSTACK case. We could change
-	 * this to *only* allocate as much of it as required by the
-	 * maximum number of CPU's we can ever have. The cpumask_allocation
-	 * is at the end of the structure, exactly for that reason.
+	 * The mm_cpumask is located at the end of mm_struct, and is
+	 * dynamically sized based on the maximum CPU number this system
+	 * can have, taking hotplug into account (nr_cpu_ids).
 	 */
+	mm_size = sizeof(struct mm_struct) + cpumask_size();
+
 	mm_cachep = kmem_cache_create_usercopy("mm_struct",
-			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
+			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			offsetof(struct mm_struct, saved_auxv),
 			sizeof_field(struct mm_struct, saved_auxv),
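One more point about the kernel/fork.c hunk above: the usercopy whitelist
passed to kmem_cache_create_usercopy() is expressed as the offset and size
of saved_auxv inside mm_struct, so growing the slab object at its end
leaves the whitelisted window untouched. A small illustrative check of that
reasoning, with stand-in names and sizes rather than the kernel's:

  #include <stdio.h>
  #include <stddef.h>

  #define BITS_PER_LONG     (8 * sizeof(unsigned long))
  #define BITS_TO_LONGS(n)  (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

  struct mm_like {
      unsigned long start_code;
      unsigned long saved_auxv[38];   /* stand-in for the whitelisted region */
      unsigned long cpu_bitmap[];     /* grows the object, added at the end  */
  };

  int main(void)
  {
      unsigned int nr_cpu_ids = 8;    /* a runtime value in the kernel */
      size_t object_size = sizeof(struct mm_like) +
                           BITS_TO_LONGS(nr_cpu_ids) * sizeof(unsigned long);

      /* The whitelisted window is untouched by the trailing bitmap. */
      printf("usercopy region: offset %zu, size %zu; object size %zu\n",
             offsetof(struct mm_like, saved_auxv),
             sizeof(((struct mm_like *)0)->saved_auxv),
             object_size);
      return 0;
  }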
diff --git a/mm/init-mm.c b/mm/init-mm.c
index f0179c9c04c2..a787a319211e 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -15,6 +15,16 @@
 #define INIT_MM_CONTEXT(name)
 #endif
 
+/*
+ * For dynamically allocated mm_structs, there is a dynamically sized cpumask
+ * at the end of the structure, the size of which depends on the maximum CPU
+ * number the system can see. That way we allocate only as much memory for
+ * mm_cpumask() as needed for the hundreds, or thousands of processes that
+ * a system typically runs.
+ *
+ * Since there is only one init_mm in the entire system, keep it simple
+ * and size this cpu_bitmask to NR_CPUS.
+ */
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
 	.pgd		= swapper_pg_dir,
@@ -25,5 +35,6 @@ struct mm_struct init_mm = {
 	.arg_lock	= __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.user_ns	= &init_user_ns,
+	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS)] = 0},
 	INIT_MM_CONTEXT(init_mm)
 };