| author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-24 20:00:35 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-24 20:00:35 -0400 |
| commit | a153e67bda3639a46edac6205610ae63c0fdea4c (patch) | |
| tree | 4169223a18acde55f624666ad3e836d957fc60d1 | |
| parent | e288e931c12e77982aa582e2380cd072becfe488 (diff) | |
| parent | 497a045d13dcd7a00f5535ded1ebb49313d4a211 (diff) | |
Merge branch 'akpm' (patches from Andrew Morton)
Merge fixes from Andrew Morton:
"Bunch of fixes.
And a reversion of mhocko's "Soft limit rework" patch series. This is
actually your fault for opening the merge window when I was off racing ;)
I didn't read the email thread before sending everything off.
Johannes Weiner raised significant issues:
http://www.spinics.net/lists/cgroups/msg08813.html
and we agreed to back it all out"
I clearly need to be more aware of Andrew's racing schedule.
* akpm:
MAINTAINERS: update mach-bcm related email address
checkpatch: make extern in .h prototypes quieter
cciss: fix info leak in cciss_ioctl32_passthru()
cpqarray: fix info leak in ida_locked_ioctl()
kernel/reboot.c: re-enable the function of variable reboot_default
audit: fix endless wait in audit_log_start()
revert "memcg, vmscan: integrate soft reclaim tighter with zone shrinking code"
revert "memcg: get rid of soft-limit tree infrastructure"
revert "vmscan, memcg: do softlimit reclaim also for targeted reclaim"
revert "memcg: enhance memcg iterator to support predicates"
revert "memcg: track children in soft limit excess to improve soft limit"
revert "memcg, vmscan: do not attempt soft limit reclaim if it would not scan anything"
revert "memcg: track all children over limit in the root"
revert "memcg, vmscan: do not fall into reclaim-all pass too quickly"
fs/ocfs2/super.c: use a bigger nodestr in ocfs2_dismount_volume
watchdog: update watchdog_thresh properly
watchdog: update watchdog attributes atomically
| -rw-r--r-- | MAINTAINERS | 3 |
| -rw-r--r-- | drivers/block/cciss.c | 1 |
| -rw-r--r-- | drivers/block/cpqarray.c | 1 |
| -rw-r--r-- | fs/ocfs2/super.c | 2 |
| -rw-r--r-- | include/linux/memcontrol.h | 55 |
| -rw-r--r-- | include/linux/smp.h | 6 |
| -rw-r--r-- | kernel/audit.c | 5 |
| -rw-r--r-- | kernel/reboot.c | 9 |
| -rw-r--r-- | kernel/watchdog.c | 60 |
| -rw-r--r-- | mm/memcontrol.c | 560 |
| -rw-r--r-- | mm/vmscan.c | 83 |
| -rwxr-xr-x | scripts/checkpatch.pl | 4 |
12 files changed, 527 insertions, 262 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index e61c2e83fc2b..b2f857c7ecf6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
| @@ -1812,7 +1812,8 @@ S: Supported | |||
| 1812 | F: drivers/net/ethernet/broadcom/bnx2x/ | 1812 | F: drivers/net/ethernet/broadcom/bnx2x/ |
| 1813 | 1813 | ||
| 1814 | BROADCOM BCM281XX/BCM11XXX ARM ARCHITECTURE | 1814 | BROADCOM BCM281XX/BCM11XXX ARM ARCHITECTURE |
| 1815 | M: Christian Daudt <csd@broadcom.com> | 1815 | M: Christian Daudt <bcm@fixthebug.org> |
| 1816 | L: bcm-kernel-feedback-list@broadcom.com | ||
| 1816 | T: git git://git.github.com/broadcom/bcm11351 | 1817 | T: git git://git.github.com/broadcom/bcm11351 |
| 1817 | S: Maintained | 1818 | S: Maintained |
| 1818 | F: arch/arm/mach-bcm/ | 1819 | F: arch/arm/mach-bcm/ |
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index d2d95ff5353b..edfa2515bc86 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
| @@ -1189,6 +1189,7 @@ static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode, | |||
| 1189 | int err; | 1189 | int err; |
| 1190 | u32 cp; | 1190 | u32 cp; |
| 1191 | 1191 | ||
| 1192 | memset(&arg64, 0, sizeof(arg64)); | ||
| 1192 | err = 0; | 1193 | err = 0; |
| 1193 | err |= | 1194 | err |= |
| 1194 | copy_from_user(&arg64.LUN_info, &arg32->LUN_info, | 1195 | copy_from_user(&arg64.LUN_info, &arg32->LUN_info, |
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 639d26b90b91..2b9440384536 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
| @@ -1193,6 +1193,7 @@ out_passthru: | |||
| 1193 | ida_pci_info_struct pciinfo; | 1193 | ida_pci_info_struct pciinfo; |
| 1194 | 1194 | ||
| 1195 | if (!arg) return -EINVAL; | 1195 | if (!arg) return -EINVAL; |
| 1196 | memset(&pciinfo, 0, sizeof(pciinfo)); | ||
| 1196 | pciinfo.bus = host->pci_dev->bus->number; | 1197 | pciinfo.bus = host->pci_dev->bus->number; |
| 1197 | pciinfo.dev_fn = host->pci_dev->devfn; | 1198 | pciinfo.dev_fn = host->pci_dev->devfn; |
| 1198 | pciinfo.board_id = host->board_id; | 1199 | pciinfo.board_id = host->board_id; |
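Both block-driver fixes above close the same kind of information leak: the ioctl handler fills in only some fields of a struct that lives on the kernel stack and then copies the whole struct back to user space, so unwritten fields and padding bytes carry stale kernel stack contents out to the caller. Zeroing the struct before the partial initialization is the remedy both hunks apply. The sketch below is a user-space illustration of that pattern only; the struct layout, field names, and values are invented and are not the real cciss or cpqarray definitions.

```c
/*
 * Illustrative only: why a partially-initialized stack struct leaks stale
 * data when copied out wholesale.  The struct and field names are made up
 * and are not the real cciss/cpqarray ioctl structures.
 */
#include <stdio.h>
#include <string.h>

struct pci_info_example {
	unsigned char bus;       /* written by the handler                */
	/* compiler padding typically lands here                          */
	unsigned int dev_fn;     /* written by the handler                */
	unsigned int board_id;   /* written by the handler                */
	char reserved[16];       /* never written, so it stays stale      */
};

static void fill_reply(struct pci_info_example *out, int zero_first)
{
	if (zero_first)
		memset(out, 0, sizeof(*out)); /* the fix: wipe everything first */
	out->bus = 1;
	out->dev_fn = 0x20;
	out->board_id = 0x40800e11;
}

int main(void)
{
	struct pci_info_example reply;
	const unsigned char *p = (const unsigned char *)&reply;

	memset(&reply, 0xaa, sizeof(reply)); /* stand-in for old stack garbage */
	fill_reply(&reply, 0);
	printf("last byte without pre-zeroing: 0x%02x\n", p[sizeof(reply) - 1]);

	fill_reply(&reply, 1);
	printf("last byte with pre-zeroing:    0x%02x\n", p[sizeof(reply) - 1]);
	return 0;
}
```

Without the pre-zeroing the reserved bytes (and padding) keep the simulated 0xaa garbage, which in the kernel case would be copied out to user space; with it they are always zero.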
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 121da2dc3be8..d4e81e4a9b04 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
| @@ -1924,7 +1924,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | |||
| 1924 | { | 1924 | { |
| 1925 | int tmp, hangup_needed = 0; | 1925 | int tmp, hangup_needed = 0; |
| 1926 | struct ocfs2_super *osb = NULL; | 1926 | struct ocfs2_super *osb = NULL; |
| 1927 | char nodestr[8]; | 1927 | char nodestr[12]; |
| 1928 | 1928 | ||
| 1929 | trace_ocfs2_dismount_volume(sb); | 1929 | trace_ocfs2_dismount_volume(sb); |
| 1930 | 1930 | ||
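The ocfs2 fix simply enlarges the buffer that holds the printable node number used in the dismount message. Assuming the node number is a 32-bit unsigned value formatted in decimal (an assumption about the surrounding code, which is not visible in this hunk), the worst case is 4294967295: ten digits plus a terminating NUL need 11 bytes, so char[8] can truncate while char[12] always fits. A minimal sketch of that arithmetic:

```c
/*
 * Sketch only: worst-case width of a u32 printed in decimal.  Formatting
 * the node number with "%u" is an assumption, not shown in the hunk above.
 */
#include <limits.h>
#include <stdio.h>

int main(void)
{
	char small[8], big[12];
	unsigned int node = UINT_MAX;   /* 4294967295, the widest case */

	int need = snprintf(small, sizeof(small), "%u", node);
	printf("needs %d chars + NUL; 8-byte buffer truncates to \"%s\"\n",
	       need, small);

	snprintf(big, sizeof(big), "%u", node);
	printf("12-byte buffer holds \"%s\"\n", big);
	return 0;
}
```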
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 60e95872da29..ecc82b37c4cc 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
| @@ -53,23 +53,6 @@ struct mem_cgroup_reclaim_cookie { | |||
| 53 | unsigned int generation; | 53 | unsigned int generation; |
| 54 | }; | 54 | }; |
| 55 | 55 | ||
| 56 | enum mem_cgroup_filter_t { | ||
| 57 | VISIT, /* visit current node */ | ||
| 58 | SKIP, /* skip the current node and continue traversal */ | ||
| 59 | SKIP_TREE, /* skip the whole subtree and continue traversal */ | ||
| 60 | }; | ||
| 61 | |||
| 62 | /* | ||
| 63 | * mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to | ||
| 64 | * iterate through the hierarchy tree. Each tree element is checked by the | ||
| 65 | * predicate before it is returned by the iterator. If a filter returns | ||
| 66 | * SKIP or SKIP_TREE then the iterator code continues traversal (with the | ||
| 67 | * next node down the hierarchy or the next node that doesn't belong under the | ||
| 68 | * memcg's subtree). | ||
| 69 | */ | ||
| 70 | typedef enum mem_cgroup_filter_t | ||
| 71 | (*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root); | ||
| 72 | |||
| 73 | #ifdef CONFIG_MEMCG | 56 | #ifdef CONFIG_MEMCG |
| 74 | /* | 57 | /* |
| 75 | * All "charge" functions with gfp_mask should use GFP_KERNEL or | 58 | * All "charge" functions with gfp_mask should use GFP_KERNEL or |
| @@ -137,18 +120,9 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | |||
| 137 | extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, | 120 | extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, |
| 138 | struct page *oldpage, struct page *newpage, bool migration_ok); | 121 | struct page *oldpage, struct page *newpage, bool migration_ok); |
| 139 | 122 | ||
| 140 | struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, | 123 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, |
| 141 | struct mem_cgroup *prev, | 124 | struct mem_cgroup *, |
| 142 | struct mem_cgroup_reclaim_cookie *reclaim, | 125 | struct mem_cgroup_reclaim_cookie *); |
| 143 | mem_cgroup_iter_filter cond); | ||
| 144 | |||
| 145 | static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | ||
| 146 | struct mem_cgroup *prev, | ||
| 147 | struct mem_cgroup_reclaim_cookie *reclaim) | ||
| 148 | { | ||
| 149 | return mem_cgroup_iter_cond(root, prev, reclaim, NULL); | ||
| 150 | } | ||
| 151 | |||
| 152 | void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); | 126 | void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); |
| 153 | 127 | ||
| 154 | /* | 128 | /* |
| @@ -260,9 +234,9 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, | |||
| 260 | mem_cgroup_update_page_stat(page, idx, -1); | 234 | mem_cgroup_update_page_stat(page, idx, -1); |
| 261 | } | 235 | } |
| 262 | 236 | ||
| 263 | enum mem_cgroup_filter_t | 237 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, |
| 264 | mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, | 238 | gfp_t gfp_mask, |
| 265 | struct mem_cgroup *root); | 239 | unsigned long *total_scanned); |
| 266 | 240 | ||
| 267 | void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); | 241 | void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); |
| 268 | static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, | 242 | static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, |
| @@ -376,15 +350,6 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
| 376 | struct page *oldpage, struct page *newpage, bool migration_ok) | 350 | struct page *oldpage, struct page *newpage, bool migration_ok) |
| 377 | { | 351 | { |
| 378 | } | 352 | } |
| 379 | static inline struct mem_cgroup * | ||
| 380 | mem_cgroup_iter_cond(struct mem_cgroup *root, | ||
| 381 | struct mem_cgroup *prev, | ||
| 382 | struct mem_cgroup_reclaim_cookie *reclaim, | ||
| 383 | mem_cgroup_iter_filter cond) | ||
| 384 | { | ||
| 385 | /* first call must return non-NULL, second return NULL */ | ||
| 386 | return (struct mem_cgroup *)(unsigned long)!prev; | ||
| 387 | } | ||
| 388 | 353 | ||
| 389 | static inline struct mem_cgroup * | 354 | static inline struct mem_cgroup * |
| 390 | mem_cgroup_iter(struct mem_cgroup *root, | 355 | mem_cgroup_iter(struct mem_cgroup *root, |
| @@ -471,11 +436,11 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, | |||
| 471 | } | 436 | } |
| 472 | 437 | ||
| 473 | static inline | 438 | static inline |
| 474 | enum mem_cgroup_filter_t | 439 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, |
| 475 | mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, | 440 | gfp_t gfp_mask, |
| 476 | struct mem_cgroup *root) | 441 | unsigned long *total_scanned) |
| 477 | { | 442 | { |
| 478 | return VISIT; | 443 | return 0; |
| 479 | } | 444 | } |
| 480 | 445 | ||
| 481 | static inline void mem_cgroup_split_huge_fixup(struct page *head) | 446 | static inline void mem_cgroup_split_huge_fixup(struct page *head) |
diff --git a/include/linux/smp.h b/include/linux/smp.h
index cfb7ca094b38..731f5237d5f4 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
| @@ -155,6 +155,12 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, | |||
| 155 | 155 | ||
| 156 | static inline void kick_all_cpus_sync(void) { } | 156 | static inline void kick_all_cpus_sync(void) { } |
| 157 | 157 | ||
| 158 | static inline void __smp_call_function_single(int cpuid, | ||
| 159 | struct call_single_data *data, int wait) | ||
| 160 | { | ||
| 161 | on_each_cpu(data->func, data->info, wait); | ||
| 162 | } | ||
| 163 | |||
| 158 | #endif /* !SMP */ | 164 | #endif /* !SMP */ |
| 159 | 165 | ||
| 160 | /* | 166 | /* |
diff --git a/kernel/audit.c b/kernel/audit.c
index 91e53d04b6a9..7b0e23a740ce 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
| @@ -1117,9 +1117,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, | |||
| 1117 | 1117 | ||
| 1118 | sleep_time = timeout_start + audit_backlog_wait_time - | 1118 | sleep_time = timeout_start + audit_backlog_wait_time - |
| 1119 | jiffies; | 1119 | jiffies; |
| 1120 | if ((long)sleep_time > 0) | 1120 | if ((long)sleep_time > 0) { |
| 1121 | wait_for_auditd(sleep_time); | 1121 | wait_for_auditd(sleep_time); |
| 1122 | continue; | 1122 | continue; |
| 1123 | } | ||
| 1123 | } | 1124 | } |
| 1124 | if (audit_rate_check() && printk_ratelimit()) | 1125 | if (audit_rate_check() && printk_ratelimit()) |
| 1125 | printk(KERN_WARNING | 1126 | printk(KERN_WARNING |
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 269ed9384cc4..f813b3474646 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
| @@ -32,7 +32,14 @@ EXPORT_SYMBOL(cad_pid); | |||
| 32 | #endif | 32 | #endif |
| 33 | enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; | 33 | enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; |
| 34 | 34 | ||
| 35 | int reboot_default; | 35 | /* |
| 36 | * This variable is used privately to keep track of whether or not | ||
| 37 | * reboot_type is still set to its default value (i.e., reboot= hasn't | ||
| 38 | * been set on the command line). This is needed so that we can | ||
| 39 | * suppress DMI scanning for reboot quirks. Without it, it's | ||
| 40 | * impossible to override a faulty reboot quirk without recompiling. | ||
| 41 | */ | ||
| 42 | int reboot_default = 1; | ||
| 36 | int reboot_cpu; | 43 | int reboot_cpu; |
| 37 | enum reboot_type reboot_type = BOOT_ACPI; | 44 | enum reboot_type reboot_type = BOOT_ACPI; |
| 38 | int reboot_force; | 45 | int reboot_force; |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 51c4f34d258e..4431610f049a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
| @@ -486,7 +486,52 @@ static struct smp_hotplug_thread watchdog_threads = { | |||
| 486 | .unpark = watchdog_enable, | 486 | .unpark = watchdog_enable, |
| 487 | }; | 487 | }; |
| 488 | 488 | ||
| 489 | static int watchdog_enable_all_cpus(void) | 489 | static void restart_watchdog_hrtimer(void *info) |
| 490 | { | ||
| 491 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | ||
| 492 | int ret; | ||
| 493 | |||
| 494 | /* | ||
| 495 | * No need to cancel and restart hrtimer if it is currently executing | ||
| 496 | * because it will reprogram itself with the new period now. | ||
| 497 | * We should never see it unqueued here because we are running per-cpu | ||
| 498 | * with interrupts disabled. | ||
| 499 | */ | ||
| 500 | ret = hrtimer_try_to_cancel(hrtimer); | ||
| 501 | if (ret == 1) | ||
| 502 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), | ||
| 503 | HRTIMER_MODE_REL_PINNED); | ||
| 504 | } | ||
| 505 | |||
| 506 | static void update_timers(int cpu) | ||
| 507 | { | ||
| 508 | struct call_single_data data = {.func = restart_watchdog_hrtimer}; | ||
| 509 | /* | ||
| 510 | * Make sure that perf event counter will adopt to a new | ||
| 511 | * sampling period. Updating the sampling period directly would | ||
| 512 | * be much nicer but we do not have an API for that now so | ||
| 513 | * let's use a big hammer. | ||
| 514 | * Hrtimer will adopt the new period on the next tick but this | ||
| 515 | * might be late already so we have to restart the timer as well. | ||
| 516 | */ | ||
| 517 | watchdog_nmi_disable(cpu); | ||
| 518 | __smp_call_function_single(cpu, &data, 1); | ||
| 519 | watchdog_nmi_enable(cpu); | ||
| 520 | } | ||
| 521 | |||
| 522 | static void update_timers_all_cpus(void) | ||
| 523 | { | ||
| 524 | int cpu; | ||
| 525 | |||
| 526 | get_online_cpus(); | ||
| 527 | preempt_disable(); | ||
| 528 | for_each_online_cpu(cpu) | ||
| 529 | update_timers(cpu); | ||
| 530 | preempt_enable(); | ||
| 531 | put_online_cpus(); | ||
| 532 | } | ||
| 533 | |||
| 534 | static int watchdog_enable_all_cpus(bool sample_period_changed) | ||
| 490 | { | 535 | { |
| 491 | int err = 0; | 536 | int err = 0; |
| 492 | 537 | ||
| @@ -496,6 +541,8 @@ static int watchdog_enable_all_cpus(void) | |||
| 496 | pr_err("Failed to create watchdog threads, disabled\n"); | 541 | pr_err("Failed to create watchdog threads, disabled\n"); |
| 497 | else | 542 | else |
| 498 | watchdog_running = 1; | 543 | watchdog_running = 1; |
| 544 | } else if (sample_period_changed) { | ||
| 545 | update_timers_all_cpus(); | ||
| 499 | } | 546 | } |
| 500 | 547 | ||
| 501 | return err; | 548 | return err; |
| @@ -520,13 +567,15 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
| 520 | void __user *buffer, size_t *lenp, loff_t *ppos) | 567 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 521 | { | 568 | { |
| 522 | int err, old_thresh, old_enabled; | 569 | int err, old_thresh, old_enabled; |
| 570 | static DEFINE_MUTEX(watchdog_proc_mutex); | ||
| 523 | 571 | ||
| 572 | mutex_lock(&watchdog_proc_mutex); | ||
| 524 | old_thresh = ACCESS_ONCE(watchdog_thresh); | 573 | old_thresh = ACCESS_ONCE(watchdog_thresh); |
| 525 | old_enabled = ACCESS_ONCE(watchdog_user_enabled); | 574 | old_enabled = ACCESS_ONCE(watchdog_user_enabled); |
| 526 | 575 | ||
| 527 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 576 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
| 528 | if (err || !write) | 577 | if (err || !write) |
| 529 | return err; | 578 | goto out; |
| 530 | 579 | ||
| 531 | set_sample_period(); | 580 | set_sample_period(); |
| 532 | /* | 581 | /* |
| @@ -535,7 +584,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
| 535 | * watchdog_*_all_cpus() function takes care of this. | 584 | * watchdog_*_all_cpus() function takes care of this. |
| 536 | */ | 585 | */ |
| 537 | if (watchdog_user_enabled && watchdog_thresh) | 586 | if (watchdog_user_enabled && watchdog_thresh) |
| 538 | err = watchdog_enable_all_cpus(); | 587 | err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); |
| 539 | else | 588 | else |
| 540 | watchdog_disable_all_cpus(); | 589 | watchdog_disable_all_cpus(); |
| 541 | 590 | ||
| @@ -544,7 +593,8 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
| 544 | watchdog_thresh = old_thresh; | 593 | watchdog_thresh = old_thresh; |
| 545 | watchdog_user_enabled = old_enabled; | 594 | watchdog_user_enabled = old_enabled; |
| 546 | } | 595 | } |
| 547 | 596 | out: | |
| 597 | mutex_unlock(&watchdog_proc_mutex); | ||
| 548 | return err; | 598 | return err; |
| 549 | } | 599 | } |
| 550 | #endif /* CONFIG_SYSCTL */ | 600 | #endif /* CONFIG_SYSCTL */ |
| @@ -554,5 +604,5 @@ void __init lockup_detector_init(void) | |||
| 554 | set_sample_period(); | 604 | set_sample_period(); |
| 555 | 605 | ||
| 556 | if (watchdog_user_enabled) | 606 | if (watchdog_user_enabled) |
| 557 | watchdog_enable_all_cpus(); | 607 | watchdog_enable_all_cpus(false); |
| 558 | } | 608 | } |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce13029..1c52ddbc839b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
| @@ -39,6 +39,7 @@ | |||
| 39 | #include <linux/limits.h> | 39 | #include <linux/limits.h> |
| 40 | #include <linux/export.h> | 40 | #include <linux/export.h> |
| 41 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
| 42 | #include <linux/rbtree.h> | ||
| 42 | #include <linux/slab.h> | 43 | #include <linux/slab.h> |
| 43 | #include <linux/swap.h> | 44 | #include <linux/swap.h> |
| 44 | #include <linux/swapops.h> | 45 | #include <linux/swapops.h> |
| @@ -160,6 +161,10 @@ struct mem_cgroup_per_zone { | |||
| 160 | 161 | ||
| 161 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | 162 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; |
| 162 | 163 | ||
| 164 | struct rb_node tree_node; /* RB tree node */ | ||
| 165 | unsigned long long usage_in_excess;/* Set to the value by which */ | ||
| 166 | /* the soft limit is exceeded*/ | ||
| 167 | bool on_tree; | ||
| 163 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ | 168 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ |
| 164 | /* use container_of */ | 169 | /* use container_of */ |
| 165 | }; | 170 | }; |
| @@ -168,6 +173,26 @@ struct mem_cgroup_per_node { | |||
| 168 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | 173 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; |
| 169 | }; | 174 | }; |
| 170 | 175 | ||
| 176 | /* | ||
| 177 | * Cgroups above their limits are maintained in a RB-Tree, independent of | ||
| 178 | * their hierarchy representation | ||
| 179 | */ | ||
| 180 | |||
| 181 | struct mem_cgroup_tree_per_zone { | ||
| 182 | struct rb_root rb_root; | ||
| 183 | spinlock_t lock; | ||
| 184 | }; | ||
| 185 | |||
| 186 | struct mem_cgroup_tree_per_node { | ||
| 187 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; | ||
| 188 | }; | ||
| 189 | |||
| 190 | struct mem_cgroup_tree { | ||
| 191 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; | ||
| 192 | }; | ||
| 193 | |||
| 194 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | ||
| 195 | |||
| 171 | struct mem_cgroup_threshold { | 196 | struct mem_cgroup_threshold { |
| 172 | struct eventfd_ctx *eventfd; | 197 | struct eventfd_ctx *eventfd; |
| 173 | u64 threshold; | 198 | u64 threshold; |
| @@ -303,22 +328,6 @@ struct mem_cgroup { | |||
| 303 | atomic_t numainfo_events; | 328 | atomic_t numainfo_events; |
| 304 | atomic_t numainfo_updating; | 329 | atomic_t numainfo_updating; |
| 305 | #endif | 330 | #endif |
| 306 | /* | ||
| 307 | * Protects soft_contributed transitions. | ||
| 308 | * See mem_cgroup_update_soft_limit | ||
| 309 | */ | ||
| 310 | spinlock_t soft_lock; | ||
| 311 | |||
| 312 | /* | ||
| 313 | * If true then this group has increased parents' children_in_excess | ||
| 314 | * when it got over the soft limit. | ||
| 315 | * When a group falls bellow the soft limit, parents' children_in_excess | ||
| 316 | * is decreased and soft_contributed changed to false. | ||
| 317 | */ | ||
| 318 | bool soft_contributed; | ||
| 319 | |||
| 320 | /* Number of children that are in soft limit excess */ | ||
| 321 | atomic_t children_in_excess; | ||
| 322 | 331 | ||
| 323 | struct mem_cgroup_per_node *nodeinfo[0]; | 332 | struct mem_cgroup_per_node *nodeinfo[0]; |
| 324 | /* WARNING: nodeinfo must be the last member here */ | 333 | /* WARNING: nodeinfo must be the last member here */ |
| @@ -422,6 +431,7 @@ static bool move_file(void) | |||
| 422 | * limit reclaim to prevent infinite loops, if they ever occur. | 431 | * limit reclaim to prevent infinite loops, if they ever occur. |
| 423 | */ | 432 | */ |
| 424 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 | 433 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 |
| 434 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 | ||
| 425 | 435 | ||
| 426 | enum charge_type { | 436 | enum charge_type { |
| 427 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 437 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
| @@ -648,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) | |||
| 648 | return mem_cgroup_zoneinfo(memcg, nid, zid); | 658 | return mem_cgroup_zoneinfo(memcg, nid, zid); |
| 649 | } | 659 | } |
| 650 | 660 | ||
| 661 | static struct mem_cgroup_tree_per_zone * | ||
| 662 | soft_limit_tree_node_zone(int nid, int zid) | ||
| 663 | { | ||
| 664 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
| 665 | } | ||
| 666 | |||
| 667 | static struct mem_cgroup_tree_per_zone * | ||
| 668 | soft_limit_tree_from_page(struct page *page) | ||
| 669 | { | ||
| 670 | int nid = page_to_nid(page); | ||
| 671 | int zid = page_zonenum(page); | ||
| 672 | |||
| 673 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
| 674 | } | ||
| 675 | |||
| 676 | static void | ||
| 677 | __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, | ||
| 678 | struct mem_cgroup_per_zone *mz, | ||
| 679 | struct mem_cgroup_tree_per_zone *mctz, | ||
| 680 | unsigned long long new_usage_in_excess) | ||
| 681 | { | ||
| 682 | struct rb_node **p = &mctz->rb_root.rb_node; | ||
| 683 | struct rb_node *parent = NULL; | ||
| 684 | struct mem_cgroup_per_zone *mz_node; | ||
| 685 | |||
| 686 | if (mz->on_tree) | ||
| 687 | return; | ||
| 688 | |||
| 689 | mz->usage_in_excess = new_usage_in_excess; | ||
| 690 | if (!mz->usage_in_excess) | ||
| 691 | return; | ||
| 692 | while (*p) { | ||
| 693 | parent = *p; | ||
| 694 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, | ||
| 695 | tree_node); | ||
| 696 | if (mz->usage_in_excess < mz_node->usage_in_excess) | ||
| 697 | p = &(*p)->rb_left; | ||
| 698 | /* | ||
| 699 | * We can't avoid mem cgroups that are over their soft | ||
| 700 | * limit by the same amount | ||
| 701 | */ | ||
| 702 | else if (mz->usage_in_excess >= mz_node->usage_in_excess) | ||
| 703 | p = &(*p)->rb_right; | ||
| 704 | } | ||
| 705 | rb_link_node(&mz->tree_node, parent, p); | ||
| 706 | rb_insert_color(&mz->tree_node, &mctz->rb_root); | ||
| 707 | mz->on_tree = true; | ||
| 708 | } | ||
| 709 | |||
| 710 | static void | ||
| 711 | __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, | ||
| 712 | struct mem_cgroup_per_zone *mz, | ||
| 713 | struct mem_cgroup_tree_per_zone *mctz) | ||
| 714 | { | ||
| 715 | if (!mz->on_tree) | ||
| 716 | return; | ||
| 717 | rb_erase(&mz->tree_node, &mctz->rb_root); | ||
| 718 | mz->on_tree = false; | ||
| 719 | } | ||
| 720 | |||
| 721 | static void | ||
| 722 | mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, | ||
| 723 | struct mem_cgroup_per_zone *mz, | ||
| 724 | struct mem_cgroup_tree_per_zone *mctz) | ||
| 725 | { | ||
| 726 | spin_lock(&mctz->lock); | ||
| 727 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
| 728 | spin_unlock(&mctz->lock); | ||
| 729 | } | ||
| 730 | |||
| 731 | |||
| 732 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) | ||
| 733 | { | ||
| 734 | unsigned long long excess; | ||
| 735 | struct mem_cgroup_per_zone *mz; | ||
| 736 | struct mem_cgroup_tree_per_zone *mctz; | ||
| 737 | int nid = page_to_nid(page); | ||
| 738 | int zid = page_zonenum(page); | ||
| 739 | mctz = soft_limit_tree_from_page(page); | ||
| 740 | |||
| 741 | /* | ||
| 742 | * Necessary to update all ancestors when hierarchy is used. | ||
| 743 | * because their event counter is not touched. | ||
| 744 | */ | ||
| 745 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { | ||
| 746 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); | ||
| 747 | excess = res_counter_soft_limit_excess(&memcg->res); | ||
| 748 | /* | ||
| 749 | * We have to update the tree if mz is on RB-tree or | ||
| 750 | * mem is over its softlimit. | ||
| 751 | */ | ||
| 752 | if (excess || mz->on_tree) { | ||
| 753 | spin_lock(&mctz->lock); | ||
| 754 | /* if on-tree, remove it */ | ||
| 755 | if (mz->on_tree) | ||
| 756 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
| 757 | /* | ||
| 758 | * Insert again. mz->usage_in_excess will be updated. | ||
| 759 | * If excess is 0, no tree ops. | ||
| 760 | */ | ||
| 761 | __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); | ||
| 762 | spin_unlock(&mctz->lock); | ||
| 763 | } | ||
| 764 | } | ||
| 765 | } | ||
| 766 | |||
| 767 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) | ||
| 768 | { | ||
| 769 | int node, zone; | ||
| 770 | struct mem_cgroup_per_zone *mz; | ||
| 771 | struct mem_cgroup_tree_per_zone *mctz; | ||
| 772 | |||
| 773 | for_each_node(node) { | ||
| 774 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
| 775 | mz = mem_cgroup_zoneinfo(memcg, node, zone); | ||
| 776 | mctz = soft_limit_tree_node_zone(node, zone); | ||
| 777 | mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
| 778 | } | ||
| 779 | } | ||
| 780 | } | ||
| 781 | |||
| 782 | static struct mem_cgroup_per_zone * | ||
| 783 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
| 784 | { | ||
| 785 | struct rb_node *rightmost = NULL; | ||
| 786 | struct mem_cgroup_per_zone *mz; | ||
| 787 | |||
| 788 | retry: | ||
| 789 | mz = NULL; | ||
| 790 | rightmost = rb_last(&mctz->rb_root); | ||
| 791 | if (!rightmost) | ||
| 792 | goto done; /* Nothing to reclaim from */ | ||
| 793 | |||
| 794 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); | ||
| 795 | /* | ||
| 796 | * Remove the node now but someone else can add it back, | ||
| 797 | * we will to add it back at the end of reclaim to its correct | ||
| 798 | * position in the tree. | ||
| 799 | */ | ||
| 800 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); | ||
| 801 | if (!res_counter_soft_limit_excess(&mz->memcg->res) || | ||
| 802 | !css_tryget(&mz->memcg->css)) | ||
| 803 | goto retry; | ||
| 804 | done: | ||
| 805 | return mz; | ||
| 806 | } | ||
| 807 | |||
| 808 | static struct mem_cgroup_per_zone * | ||
| 809 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
| 810 | { | ||
| 811 | struct mem_cgroup_per_zone *mz; | ||
| 812 | |||
| 813 | spin_lock(&mctz->lock); | ||
| 814 | mz = __mem_cgroup_largest_soft_limit_node(mctz); | ||
| 815 | spin_unlock(&mctz->lock); | ||
| 816 | return mz; | ||
| 817 | } | ||
| 818 | |||
| 651 | /* | 819 | /* |
| 652 | * Implementation Note: reading percpu statistics for memcg. | 820 | * Implementation Note: reading percpu statistics for memcg. |
| 653 | * | 821 | * |
| @@ -822,48 +990,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, | |||
| 822 | } | 990 | } |
| 823 | 991 | ||
| 824 | /* | 992 | /* |
| 825 | * Called from rate-limited memcg_check_events when enough | ||
| 826 | * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure | ||
| 827 | * that all the parents up the hierarchy will be notified that this group | ||
| 828 | * is in excess or that it is not in excess anymore. mmecg->soft_contributed | ||
| 829 | * makes the transition a single action whenever the state flips from one to | ||
| 830 | * the other. | ||
| 831 | */ | ||
| 832 | static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) | ||
| 833 | { | ||
| 834 | unsigned long long excess = res_counter_soft_limit_excess(&memcg->res); | ||
| 835 | struct mem_cgroup *parent = memcg; | ||
| 836 | int delta = 0; | ||
| 837 | |||
| 838 | spin_lock(&memcg->soft_lock); | ||
| 839 | if (excess) { | ||
| 840 | if (!memcg->soft_contributed) { | ||
| 841 | delta = 1; | ||
| 842 | memcg->soft_contributed = true; | ||
| 843 | } | ||
| 844 | } else { | ||
| 845 | if (memcg->soft_contributed) { | ||
| 846 | delta = -1; | ||
| 847 | memcg->soft_contributed = false; | ||
| 848 | } | ||
| 849 | } | ||
| 850 | |||
| 851 | /* | ||
| 852 | * Necessary to update all ancestors when hierarchy is used | ||
| 853 | * because their event counter is not touched. | ||
| 854 | * We track children even outside the hierarchy for the root | ||
| 855 | * cgroup because tree walk starting at root should visit | ||
| 856 | * all cgroups and we want to prevent from pointless tree | ||
| 857 | * walk if no children is below the limit. | ||
| 858 | */ | ||
| 859 | while (delta && (parent = parent_mem_cgroup(parent))) | ||
| 860 | atomic_add(delta, &parent->children_in_excess); | ||
| 861 | if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) | ||
| 862 | atomic_add(delta, &root_mem_cgroup->children_in_excess); | ||
| 863 | spin_unlock(&memcg->soft_lock); | ||
| 864 | } | ||
| 865 | |||
| 866 | /* | ||
| 867 | * Check events in order. | 993 | * Check events in order. |
| 868 | * | 994 | * |
| 869 | */ | 995 | */ |
| @@ -886,7 +1012,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) | |||
| 886 | 1012 | ||
| 887 | mem_cgroup_threshold(memcg); | 1013 | mem_cgroup_threshold(memcg); |
| 888 | if (unlikely(do_softlimit)) | 1014 | if (unlikely(do_softlimit)) |
| 889 | mem_cgroup_update_soft_limit(memcg); | 1015 | mem_cgroup_update_tree(memcg, page); |
| 890 | #if MAX_NUMNODES > 1 | 1016 | #if MAX_NUMNODES > 1 |
| 891 | if (unlikely(do_numainfo)) | 1017 | if (unlikely(do_numainfo)) |
| 892 | atomic_inc(&memcg->numainfo_events); | 1018 | atomic_inc(&memcg->numainfo_events); |
| @@ -929,15 +1055,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
| 929 | return memcg; | 1055 | return memcg; |
| 930 | } | 1056 | } |
| 931 | 1057 | ||
| 932 | static enum mem_cgroup_filter_t | ||
| 933 | mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, | ||
| 934 | mem_cgroup_iter_filter cond) | ||
| 935 | { | ||
| 936 | if (!cond) | ||
| 937 | return VISIT; | ||
| 938 | return cond(memcg, root); | ||
| 939 | } | ||
| 940 | |||
| 941 | /* | 1058 | /* |
| 942 | * Returns a next (in a pre-order walk) alive memcg (with elevated css | 1059 | * Returns a next (in a pre-order walk) alive memcg (with elevated css |
| 943 | * ref. count) or NULL if the whole root's subtree has been visited. | 1060 | * ref. count) or NULL if the whole root's subtree has been visited. |
| @@ -945,7 +1062,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, | |||
| 945 | * helper function to be used by mem_cgroup_iter | 1062 | * helper function to be used by mem_cgroup_iter |
| 946 | */ | 1063 | */ |
| 947 | static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, | 1064 | static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, |
| 948 | struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) | 1065 | struct mem_cgroup *last_visited) |
| 949 | { | 1066 | { |
| 950 | struct cgroup_subsys_state *prev_css, *next_css; | 1067 | struct cgroup_subsys_state *prev_css, *next_css; |
| 951 | 1068 | ||
| @@ -963,31 +1080,11 @@ skip_node: | |||
| 963 | if (next_css) { | 1080 | if (next_css) { |
| 964 | struct mem_cgroup *mem = mem_cgroup_from_css(next_css); | 1081 | struct mem_cgroup *mem = mem_cgroup_from_css(next_css); |
| 965 | 1082 | ||
| 966 | switch (mem_cgroup_filter(mem, root, cond)) { | 1083 | if (css_tryget(&mem->css)) |
| 967 | case SKIP: | 1084 | return mem; |
| 1085 | else { | ||
| 968 | prev_css = next_css; | 1086 | prev_css = next_css; |
| 969 | goto skip_node; | 1087 | goto skip_node; |
| 970 | case SKIP_TREE: | ||
| 971 | if (mem == root) | ||
| 972 | return NULL; | ||
| 973 | /* | ||
| 974 | * css_rightmost_descendant is not an optimal way to | ||
| 975 | * skip through a subtree (especially for imbalanced | ||
| 976 | * trees leaning to right) but that's what we have right | ||
| 977 | * now. More effective solution would be traversing | ||
| 978 | * right-up for first non-NULL without calling | ||
| 979 | * css_next_descendant_pre afterwards. | ||
| 980 | */ | ||
| 981 | prev_css = css_rightmost_descendant(next_css); | ||
| 982 | goto skip_node; | ||
| 983 | case VISIT: | ||
| 984 | if (css_tryget(&mem->css)) | ||
| 985 | return mem; | ||
| 986 | else { | ||
| 987 | prev_css = next_css; | ||
| 988 | goto skip_node; | ||
| 989 | } | ||
| 990 | break; | ||
| 991 | } | 1088 | } |
| 992 | } | 1089 | } |
| 993 | 1090 | ||
| @@ -1051,7 +1148,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | |||
| 1051 | * @root: hierarchy root | 1148 | * @root: hierarchy root |
| 1052 | * @prev: previously returned memcg, NULL on first invocation | 1149 | * @prev: previously returned memcg, NULL on first invocation |
| 1053 | * @reclaim: cookie for shared reclaim walks, NULL for full walks | 1150 | * @reclaim: cookie for shared reclaim walks, NULL for full walks |
| 1054 | * @cond: filter for visited nodes, NULL for no filter | ||
| 1055 | * | 1151 | * |
| 1056 | * Returns references to children of the hierarchy below @root, or | 1152 | * Returns references to children of the hierarchy below @root, or |
| 1057 | * @root itself, or %NULL after a full round-trip. | 1153 | * @root itself, or %NULL after a full round-trip. |
| @@ -1064,18 +1160,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | |||
| 1064 | * divide up the memcgs in the hierarchy among all concurrent | 1160 | * divide up the memcgs in the hierarchy among all concurrent |
| 1065 | * reclaimers operating on the same zone and priority. | 1161 | * reclaimers operating on the same zone and priority. |
| 1066 | */ | 1162 | */ |
| 1067 | struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, | 1163 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, |
| 1068 | struct mem_cgroup *prev, | 1164 | struct mem_cgroup *prev, |
| 1069 | struct mem_cgroup_reclaim_cookie *reclaim, | 1165 | struct mem_cgroup_reclaim_cookie *reclaim) |
| 1070 | mem_cgroup_iter_filter cond) | ||
| 1071 | { | 1166 | { |
| 1072 | struct mem_cgroup *memcg = NULL; | 1167 | struct mem_cgroup *memcg = NULL; |
| 1073 | struct mem_cgroup *last_visited = NULL; | 1168 | struct mem_cgroup *last_visited = NULL; |
| 1074 | 1169 | ||
| 1075 | if (mem_cgroup_disabled()) { | 1170 | if (mem_cgroup_disabled()) |
| 1076 | /* first call must return non-NULL, second return NULL */ | 1171 | return NULL; |
| 1077 | return (struct mem_cgroup *)(unsigned long)!prev; | ||
| 1078 | } | ||
| 1079 | 1172 | ||
| 1080 | if (!root) | 1173 | if (!root) |
| 1081 | root = root_mem_cgroup; | 1174 | root = root_mem_cgroup; |
| @@ -1086,9 +1179,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, | |||
| 1086 | if (!root->use_hierarchy && root != root_mem_cgroup) { | 1179 | if (!root->use_hierarchy && root != root_mem_cgroup) { |
| 1087 | if (prev) | 1180 | if (prev) |
| 1088 | goto out_css_put; | 1181 | goto out_css_put; |
| 1089 | if (mem_cgroup_filter(root, root, cond) == VISIT) | 1182 | return root; |
| 1090 | return root; | ||
| 1091 | return NULL; | ||
| 1092 | } | 1183 | } |
| 1093 | 1184 | ||
| 1094 | rcu_read_lock(); | 1185 | rcu_read_lock(); |
| @@ -1111,7 +1202,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, | |||
| 1111 | last_visited = mem_cgroup_iter_load(iter, root, &seq); | 1202 | last_visited = mem_cgroup_iter_load(iter, root, &seq); |
| 1112 | } | 1203 | } |
| 1113 | 1204 | ||
| 1114 | memcg = __mem_cgroup_iter_next(root, last_visited, cond); | 1205 | memcg = __mem_cgroup_iter_next(root, last_visited); |
| 1115 | 1206 | ||
| 1116 | if (reclaim) { | 1207 | if (reclaim) { |
| 1117 | mem_cgroup_iter_update(iter, last_visited, memcg, seq); | 1208 | mem_cgroup_iter_update(iter, last_visited, memcg, seq); |
| @@ -1122,11 +1213,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, | |||
| 1122 | reclaim->generation = iter->generation; | 1213 | reclaim->generation = iter->generation; |
| 1123 | } | 1214 | } |
| 1124 | 1215 | ||
| 1125 | /* | 1216 | if (prev && !memcg) |
| 1126 | * We have finished the whole tree walk or no group has been | ||
| 1127 | * visited because filter told us to skip the root node. | ||
| 1128 | */ | ||
| 1129 | if (!memcg && (prev || (cond && !last_visited))) | ||
| 1130 | goto out_unlock; | 1217 | goto out_unlock; |
| 1131 | } | 1218 | } |
| 1132 | out_unlock: | 1219 | out_unlock: |
| @@ -1767,7 +1854,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | |||
| 1767 | return total; | 1854 | return total; |
| 1768 | } | 1855 | } |
| 1769 | 1856 | ||
| 1770 | #if MAX_NUMNODES > 1 | ||
| 1771 | /** | 1857 | /** |
| 1772 | * test_mem_cgroup_node_reclaimable | 1858 | * test_mem_cgroup_node_reclaimable |
| 1773 | * @memcg: the target memcg | 1859 | * @memcg: the target memcg |
| @@ -1790,6 +1876,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, | |||
| 1790 | return false; | 1876 | return false; |
| 1791 | 1877 | ||
| 1792 | } | 1878 | } |
| 1879 | #if MAX_NUMNODES > 1 | ||
| 1793 | 1880 | ||
| 1794 | /* | 1881 | /* |
| 1795 | * Always updating the nodemask is not very good - even if we have an empty | 1882 | * Always updating the nodemask is not very good - even if we have an empty |
| @@ -1857,50 +1944,104 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | |||
| 1857 | return node; | 1944 | return node; |
| 1858 | } | 1945 | } |
| 1859 | 1946 | ||
| 1947 | /* | ||
| 1948 | * Check all nodes whether it contains reclaimable pages or not. | ||
| 1949 | * For quick scan, we make use of scan_nodes. This will allow us to skip | ||
| 1950 | * unused nodes. But scan_nodes is lazily updated and may not cotain | ||
| 1951 | * enough new information. We need to do double check. | ||
| 1952 | */ | ||
| 1953 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | ||
| 1954 | { | ||
| 1955 | int nid; | ||
| 1956 | |||
| 1957 | /* | ||
| 1958 | * quick check...making use of scan_node. | ||
| 1959 | * We can skip unused nodes. | ||
| 1960 | */ | ||
| 1961 | if (!nodes_empty(memcg->scan_nodes)) { | ||
| 1962 | for (nid = first_node(memcg->scan_nodes); | ||
| 1963 | nid < MAX_NUMNODES; | ||
| 1964 | nid = next_node(nid, memcg->scan_nodes)) { | ||
| 1965 | |||
| 1966 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
| 1967 | return true; | ||
| 1968 | } | ||
| 1969 | } | ||
| 1970 | /* | ||
| 1971 | * Check rest of nodes. | ||
| 1972 | */ | ||
| 1973 | for_each_node_state(nid, N_MEMORY) { | ||
| 1974 | if (node_isset(nid, memcg->scan_nodes)) | ||
| 1975 | continue; | ||
| 1976 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
| 1977 | return true; | ||
| 1978 | } | ||
| 1979 | return false; | ||
| 1980 | } | ||
| 1981 | |||
| 1860 | #else | 1982 | #else |
| 1861 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | 1983 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) |
| 1862 | { | 1984 | { |
| 1863 | return 0; | 1985 | return 0; |
| 1864 | } | 1986 | } |
| 1865 | 1987 | ||
| 1866 | #endif | 1988 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) |
| 1867 | |||
| 1868 | /* | ||
| 1869 | * A group is eligible for the soft limit reclaim under the given root | ||
| 1870 | * hierarchy if | ||
| 1871 | * a) it is over its soft limit | ||
| 1872 | * b) any parent up the hierarchy is over its soft limit | ||
| 1873 | * | ||
| 1874 | * If the given group doesn't have any children over the limit then it | ||
| 1875 | * doesn't make any sense to iterate its subtree. | ||
| 1876 | */ | ||
| 1877 | enum mem_cgroup_filter_t | ||
| 1878 | mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, | ||
| 1879 | struct mem_cgroup *root) | ||
| 1880 | { | 1989 | { |
| 1881 | struct mem_cgroup *parent; | 1990 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); |
| 1882 | 1991 | } | |
| 1883 | if (!memcg) | 1992 | #endif |
| 1884 | memcg = root_mem_cgroup; | ||
| 1885 | parent = memcg; | ||
| 1886 | |||
| 1887 | if (res_counter_soft_limit_excess(&memcg->res)) | ||
| 1888 | return VISIT; | ||
| 1889 | 1993 | ||
| 1890 | /* | 1994 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, |
| 1891 | * If any parent up to the root in the hierarchy is over its soft limit | 1995 | struct zone *zone, |
| 1892 | * then we have to obey and reclaim from this group as well. | 1996 | gfp_t gfp_mask, |
| 1893 | */ | 1997 | unsigned long *total_scanned) |
| 1894 | while ((parent = parent_mem_cgroup(parent))) { | 1998 | { |
| 1895 | if (res_counter_soft_limit_excess(&parent->res)) | 1999 | struct mem_cgroup *victim = NULL; |
| 1896 | return VISIT; | 2000 | int total = 0; |
| 1897 | if (parent == root) | 2001 | int loop = 0; |
| 2002 | unsigned long excess; | ||
| 2003 | unsigned long nr_scanned; | ||
| 2004 | struct mem_cgroup_reclaim_cookie reclaim = { | ||
| 2005 | .zone = zone, | ||
| 2006 | .priority = 0, | ||
| 2007 | }; | ||
| 2008 | |||
| 2009 | excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; | ||
| 2010 | |||
| 2011 | while (1) { | ||
| 2012 | victim = mem_cgroup_iter(root_memcg, victim, &reclaim); | ||
| 2013 | if (!victim) { | ||
| 2014 | loop++; | ||
| 2015 | if (loop >= 2) { | ||
| 2016 | /* | ||
| 2017 | * If we have not been able to reclaim | ||
| 2018 | * anything, it might because there are | ||
| 2019 | * no reclaimable pages under this hierarchy | ||
| 2020 | */ | ||
| 2021 | if (!total) | ||
| 2022 | break; | ||
| 2023 | /* | ||
| 2024 | * We want to do more targeted reclaim. | ||
| 2025 | * excess >> 2 is not to excessive so as to | ||
| 2026 | * reclaim too much, nor too less that we keep | ||
| 2027 | * coming back to reclaim from this cgroup | ||
| 2028 | */ | ||
| 2029 | if (total >= (excess >> 2) || | ||
| 2030 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) | ||
| 2031 | break; | ||
| 2032 | } | ||
| 2033 | continue; | ||
| 2034 | } | ||
| 2035 | if (!mem_cgroup_reclaimable(victim, false)) | ||
| 2036 | continue; | ||
| 2037 | total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, | ||
| 2038 | zone, &nr_scanned); | ||
| 2039 | *total_scanned += nr_scanned; | ||
| 2040 | if (!res_counter_soft_limit_excess(&root_memcg->res)) | ||
| 1898 | break; | 2041 | break; |
| 1899 | } | 2042 | } |
| 1900 | 2043 | mem_cgroup_iter_break(root_memcg, victim); | |
| 1901 | if (!atomic_read(&memcg->children_in_excess)) | 2044 | return total; |
| 1902 | return SKIP_TREE; | ||
| 1903 | return SKIP; | ||
| 1904 | } | 2045 | } |
| 1905 | 2046 | ||
| 1906 | static DEFINE_SPINLOCK(memcg_oom_lock); | 2047 | static DEFINE_SPINLOCK(memcg_oom_lock); |
| @@ -2812,7 +2953,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
| 2812 | unlock_page_cgroup(pc); | 2953 | unlock_page_cgroup(pc); |
| 2813 | 2954 | ||
| 2814 | /* | 2955 | /* |
| 2815 | * "charge_statistics" updated event counter. | 2956 | * "charge_statistics" updated event counter. Then, check it. |
| 2957 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
| 2958 | * if they exceeds softlimit. | ||
| 2816 | */ | 2959 | */ |
| 2817 | memcg_check_events(memcg, page); | 2960 | memcg_check_events(memcg, page); |
| 2818 | } | 2961 | } |
| @@ -4647,6 +4790,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
| 4647 | return ret; | 4790 | return ret; |
| 4648 | } | 4791 | } |
| 4649 | 4792 | ||
| 4793 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | ||
| 4794 | gfp_t gfp_mask, | ||
| 4795 | unsigned long *total_scanned) | ||
| 4796 | { | ||
| 4797 | unsigned long nr_reclaimed = 0; | ||
| 4798 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | ||
| 4799 | unsigned long reclaimed; | ||
| 4800 | int loop = 0; | ||
| 4801 | struct mem_cgroup_tree_per_zone *mctz; | ||
| 4802 | unsigned long long excess; | ||
| 4803 | unsigned long nr_scanned; | ||
| 4804 | |||
| 4805 | if (order > 0) | ||
| 4806 | return 0; | ||
| 4807 | |||
| 4808 | mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); | ||
| 4809 | /* | ||
| 4810 | * This loop can run a while, specially if mem_cgroup's continuously | ||
| 4811 | * keep exceeding their soft limit and putting the system under | ||
| 4812 | * pressure | ||
| 4813 | */ | ||
| 4814 | do { | ||
| 4815 | if (next_mz) | ||
| 4816 | mz = next_mz; | ||
| 4817 | else | ||
| 4818 | mz = mem_cgroup_largest_soft_limit_node(mctz); | ||
| 4819 | if (!mz) | ||
| 4820 | break; | ||
| 4821 | |||
| 4822 | nr_scanned = 0; | ||
| 4823 | reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, | ||
| 4824 | gfp_mask, &nr_scanned); | ||
| 4825 | nr_reclaimed += reclaimed; | ||
| 4826 | *total_scanned += nr_scanned; | ||
| 4827 | spin_lock(&mctz->lock); | ||
| 4828 | |||
| 4829 | /* | ||
| 4830 | * If we failed to reclaim anything from this memory cgroup | ||
| 4831 | * it is time to move on to the next cgroup | ||
| 4832 | */ | ||
| 4833 | next_mz = NULL; | ||
| 4834 | if (!reclaimed) { | ||
| 4835 | do { | ||
| 4836 | /* | ||
| 4837 | * Loop until we find yet another one. | ||
| 4838 | * | ||
| 4839 | * By the time we get the soft_limit lock | ||
| 4840 | * again, someone might have aded the | ||
| 4841 | * group back on the RB tree. Iterate to | ||
| 4842 | * make sure we get a different mem. | ||
| 4843 | * mem_cgroup_largest_soft_limit_node returns | ||
| 4844 | * NULL if no other cgroup is present on | ||
| 4845 | * the tree | ||
| 4846 | */ | ||
| 4847 | next_mz = | ||
| 4848 | __mem_cgroup_largest_soft_limit_node(mctz); | ||
| 4849 | if (next_mz == mz) | ||
| 4850 | css_put(&next_mz->memcg->css); | ||
| 4851 | else /* next_mz == NULL or other memcg */ | ||
| 4852 | break; | ||
| 4853 | } while (1); | ||
| 4854 | } | ||
| 4855 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); | ||
| 4856 | excess = res_counter_soft_limit_excess(&mz->memcg->res); | ||
| 4857 | /* | ||
| 4858 | * One school of thought says that we should not add | ||
| 4859 | * back the node to the tree if reclaim returns 0. | ||
| 4860 | * But our reclaim could return 0, simply because due | ||
| 4861 | * to priority we are exposing a smaller subset of | ||
| 4862 | * memory to reclaim from. Consider this as a longer | ||
| 4863 | * term TODO. | ||
| 4864 | */ | ||
| 4865 | /* If excess == 0, no tree ops */ | ||
| 4866 | __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); | ||
| 4867 | spin_unlock(&mctz->lock); | ||
| 4868 | css_put(&mz->memcg->css); | ||
| 4869 | loop++; | ||
| 4870 | /* | ||
| 4871 | * Could not reclaim anything and there are no more | ||
| 4872 | * mem cgroups to try or we seem to be looping without | ||
| 4873 | * reclaiming anything. | ||
| 4874 | */ | ||
| 4875 | if (!nr_reclaimed && | ||
| 4876 | (next_mz == NULL || | ||
| 4877 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) | ||
| 4878 | break; | ||
| 4879 | } while (!nr_reclaimed); | ||
| 4880 | if (next_mz) | ||
| 4881 | css_put(&next_mz->memcg->css); | ||
| 4882 | return nr_reclaimed; | ||
| 4883 | } | ||
| 4884 | |||
| 4650 | /** | 4885 | /** |
| 4651 | * mem_cgroup_force_empty_list - clears LRU of a group | 4886 | * mem_cgroup_force_empty_list - clears LRU of a group |
| 4652 | * @memcg: group to clear | 4887 | * @memcg: group to clear |
| @@ -5911,6 +6146,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
| 5911 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 6146 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
| 5912 | mz = &pn->zoneinfo[zone]; | 6147 | mz = &pn->zoneinfo[zone]; |
| 5913 | lruvec_init(&mz->lruvec); | 6148 | lruvec_init(&mz->lruvec); |
| 6149 | mz->usage_in_excess = 0; | ||
| 6150 | mz->on_tree = false; | ||
| 5914 | mz->memcg = memcg; | 6151 | mz->memcg = memcg; |
| 5915 | } | 6152 | } |
| 5916 | memcg->nodeinfo[node] = pn; | 6153 | memcg->nodeinfo[node] = pn; |
| @@ -5966,6 +6203,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
| 5966 | int node; | 6203 | int node; |
| 5967 | size_t size = memcg_size(); | 6204 | size_t size = memcg_size(); |
| 5968 | 6205 | ||
| 6206 | mem_cgroup_remove_from_trees(memcg); | ||
| 5969 | free_css_id(&mem_cgroup_subsys, &memcg->css); | 6207 | free_css_id(&mem_cgroup_subsys, &memcg->css); |
| 5970 | 6208 | ||
| 5971 | for_each_node(node) | 6209 | for_each_node(node) |
| @@ -6002,6 +6240,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
| 6002 | } | 6240 | } |
| 6003 | EXPORT_SYMBOL(parent_mem_cgroup); | 6241 | EXPORT_SYMBOL(parent_mem_cgroup); |
| 6004 | 6242 | ||
| 6243 | static void __init mem_cgroup_soft_limit_tree_init(void) | ||
| 6244 | { | ||
| 6245 | struct mem_cgroup_tree_per_node *rtpn; | ||
| 6246 | struct mem_cgroup_tree_per_zone *rtpz; | ||
| 6247 | int tmp, node, zone; | ||
| 6248 | |||
| 6249 | for_each_node(node) { | ||
| 6250 | tmp = node; | ||
| 6251 | if (!node_state(node, N_NORMAL_MEMORY)) | ||
| 6252 | tmp = -1; | ||
| 6253 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | ||
| 6254 | BUG_ON(!rtpn); | ||
| 6255 | |||
| 6256 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
| 6257 | |||
| 6258 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
| 6259 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
| 6260 | rtpz->rb_root = RB_ROOT; | ||
| 6261 | spin_lock_init(&rtpz->lock); | ||
| 6262 | } | ||
| 6263 | } | ||
| 6264 | } | ||
| 6265 | |||
| 6005 | static struct cgroup_subsys_state * __ref | 6266 | static struct cgroup_subsys_state * __ref |
| 6006 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | 6267 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) |
| 6007 | { | 6268 | { |
| @@ -6031,7 +6292,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
| 6031 | mutex_init(&memcg->thresholds_lock); | 6292 | mutex_init(&memcg->thresholds_lock); |
| 6032 | spin_lock_init(&memcg->move_lock); | 6293 | spin_lock_init(&memcg->move_lock); |
| 6033 | vmpressure_init(&memcg->vmpressure); | 6294 | vmpressure_init(&memcg->vmpressure); |
| 6034 | spin_lock_init(&memcg->soft_lock); | ||
| 6035 | 6295 | ||
| 6036 | return &memcg->css; | 6296 | return &memcg->css; |
| 6037 | 6297 | ||
| @@ -6109,13 +6369,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
| 6109 | 6369 | ||
| 6110 | mem_cgroup_invalidate_reclaim_iterators(memcg); | 6370 | mem_cgroup_invalidate_reclaim_iterators(memcg); |
| 6111 | mem_cgroup_reparent_charges(memcg); | 6371 | mem_cgroup_reparent_charges(memcg); |
| 6112 | if (memcg->soft_contributed) { | ||
| 6113 | while ((memcg = parent_mem_cgroup(memcg))) | ||
| 6114 | atomic_dec(&memcg->children_in_excess); | ||
| 6115 | |||
| 6116 | if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) | ||
| 6117 | atomic_dec(&root_mem_cgroup->children_in_excess); | ||
| 6118 | } | ||
| 6119 | mem_cgroup_destroy_all_caches(memcg); | 6372 | mem_cgroup_destroy_all_caches(memcg); |
| 6120 | vmpressure_cleanup(&memcg->vmpressure); | 6373 | vmpressure_cleanup(&memcg->vmpressure); |
| 6121 | } | 6374 | } |
| @@ -6790,6 +7043,7 @@ static int __init mem_cgroup_init(void) | |||
| 6790 | { | 7043 | { |
| 6791 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | 7044 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); |
| 6792 | enable_swap_cgroup(); | 7045 | enable_swap_cgroup(); |
| 7046 | mem_cgroup_soft_limit_tree_init(); | ||
| 6793 | memcg_stock_init(); | 7047 | memcg_stock_init(); |
| 6794 | return 0; | 7048 | return 0; |
| 6795 | } | 7049 | } |
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ed1b775bdc9..beb35778c69f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
| @@ -139,23 +139,11 @@ static bool global_reclaim(struct scan_control *sc) | |||
| 139 | { | 139 | { |
| 140 | return !sc->target_mem_cgroup; | 140 | return !sc->target_mem_cgroup; |
| 141 | } | 141 | } |
| 142 | |||
| 143 | static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) | ||
| 144 | { | ||
| 145 | struct mem_cgroup *root = sc->target_mem_cgroup; | ||
| 146 | return !mem_cgroup_disabled() && | ||
| 147 | mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE; | ||
| 148 | } | ||
| 149 | #else | 142 | #else |
| 150 | static bool global_reclaim(struct scan_control *sc) | 143 | static bool global_reclaim(struct scan_control *sc) |
| 151 | { | 144 | { |
| 152 | return true; | 145 | return true; |
| 153 | } | 146 | } |
| 154 | |||
| 155 | static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) | ||
| 156 | { | ||
| 157 | return false; | ||
| 158 | } | ||
| 159 | #endif | 147 | #endif |
| 160 | 148 | ||
| 161 | unsigned long zone_reclaimable_pages(struct zone *zone) | 149 | unsigned long zone_reclaimable_pages(struct zone *zone) |
| @@ -2176,11 +2164,9 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
| 2176 | } | 2164 | } |
| 2177 | } | 2165 | } |
| 2178 | 2166 | ||
| 2179 | static int | 2167 | static void shrink_zone(struct zone *zone, struct scan_control *sc) |
| 2180 | __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) | ||
| 2181 | { | 2168 | { |
| 2182 | unsigned long nr_reclaimed, nr_scanned; | 2169 | unsigned long nr_reclaimed, nr_scanned; |
| 2183 | int groups_scanned = 0; | ||
| 2184 | 2170 | ||
| 2185 | do { | 2171 | do { |
| 2186 | struct mem_cgroup *root = sc->target_mem_cgroup; | 2172 | struct mem_cgroup *root = sc->target_mem_cgroup; |
| @@ -2188,17 +2174,15 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) | |||
| 2188 | .zone = zone, | 2174 | .zone = zone, |
| 2189 | .priority = sc->priority, | 2175 | .priority = sc->priority, |
| 2190 | }; | 2176 | }; |
| 2191 | struct mem_cgroup *memcg = NULL; | 2177 | struct mem_cgroup *memcg; |
| 2192 | mem_cgroup_iter_filter filter = (soft_reclaim) ? | ||
| 2193 | mem_cgroup_soft_reclaim_eligible : NULL; | ||
| 2194 | 2178 | ||
| 2195 | nr_reclaimed = sc->nr_reclaimed; | 2179 | nr_reclaimed = sc->nr_reclaimed; |
| 2196 | nr_scanned = sc->nr_scanned; | 2180 | nr_scanned = sc->nr_scanned; |
| 2197 | 2181 | ||
| 2198 | while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { | 2182 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
| 2183 | do { | ||
| 2199 | struct lruvec *lruvec; | 2184 | struct lruvec *lruvec; |
| 2200 | 2185 | ||
| 2201 | groups_scanned++; | ||
| 2202 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2186 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
| 2203 | 2187 | ||
| 2204 | shrink_lruvec(lruvec, sc); | 2188 | shrink_lruvec(lruvec, sc); |
| @@ -2218,7 +2202,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) | |||
| 2218 | mem_cgroup_iter_break(root, memcg); | 2202 | mem_cgroup_iter_break(root, memcg); |
| 2219 | break; | 2203 | break; |
| 2220 | } | 2204 | } |
| 2221 | } | 2205 | memcg = mem_cgroup_iter(root, memcg, &reclaim); |
| 2206 | } while (memcg); | ||
| 2222 | 2207 | ||
| 2223 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, | 2208 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, |
| 2224 | sc->nr_scanned - nr_scanned, | 2209 | sc->nr_scanned - nr_scanned, |
| @@ -2226,37 +2211,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) | |||
| 2226 | 2211 | ||
| 2227 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, | 2212 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, |
| 2228 | sc->nr_scanned - nr_scanned, sc)); | 2213 | sc->nr_scanned - nr_scanned, sc)); |
| 2229 | |||
| 2230 | return groups_scanned; | ||
| 2231 | } | ||
| 2232 | |||
| 2233 | |||
| 2234 | static void shrink_zone(struct zone *zone, struct scan_control *sc) | ||
| 2235 | { | ||
| 2236 | bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); | ||
| 2237 | unsigned long nr_scanned = sc->nr_scanned; | ||
| 2238 | int scanned_groups; | ||
| 2239 | |||
| 2240 | scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim); | ||
| 2241 | /* | ||
| 2242 | * memcg iterator might race with other reclaimer or start from | ||
| 2243 | * a incomplete tree walk so the tree walk in __shrink_zone | ||
| 2244 | * might have missed groups that are above the soft limit. Try | ||
| 2245 | * another loop to catch up with others. Do it just once to | ||
| 2246 | * prevent from reclaim latencies when other reclaimers always | ||
| 2247 | * preempt this one. | ||
| 2248 | */ | ||
| 2249 | if (do_soft_reclaim && !scanned_groups) | ||
| 2250 | __shrink_zone(zone, sc, do_soft_reclaim); | ||
| 2251 | |||
| 2252 | /* | ||
| 2253 | * No group is over the soft limit or those that are do not have | ||
| 2254 | * pages in the zone we are reclaiming so we have to reclaim everybody | ||
| 2255 | */ | ||
| 2256 | if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) { | ||
| 2257 | __shrink_zone(zone, sc, false); | ||
| 2258 | return; | ||
| 2259 | } | ||
| 2260 | } | 2214 | } |
| 2261 | 2215 | ||
| 2262 | /* Returns true if compaction should go ahead for a high-order request */ | 2216 | /* Returns true if compaction should go ahead for a high-order request */ |
| @@ -2320,6 +2274,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
| 2320 | { | 2274 | { |
| 2321 | struct zoneref *z; | 2275 | struct zoneref *z; |
| 2322 | struct zone *zone; | 2276 | struct zone *zone; |
| 2277 | unsigned long nr_soft_reclaimed; | ||
| 2278 | unsigned long nr_soft_scanned; | ||
| 2323 | bool aborted_reclaim = false; | 2279 | bool aborted_reclaim = false; |
| 2324 | 2280 | ||
| 2325 | /* | 2281 | /* |
| @@ -2359,6 +2315,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
| 2359 | continue; | 2315 | continue; |
| 2360 | } | 2316 | } |
| 2361 | } | 2317 | } |
| 2318 | /* | ||
| 2319 | * This steals pages from memory cgroups over softlimit | ||
| 2320 | * and returns the number of reclaimed pages and | ||
| 2321 | * scanned pages. This works for global memory pressure | ||
| 2322 | * and balancing, not for a memcg's limit. | ||
| 2323 | */ | ||
| 2324 | nr_soft_scanned = 0; | ||
| 2325 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
| 2326 | sc->order, sc->gfp_mask, | ||
| 2327 | &nr_soft_scanned); | ||
| 2328 | sc->nr_reclaimed += nr_soft_reclaimed; | ||
| 2329 | sc->nr_scanned += nr_soft_scanned; | ||
| 2362 | /* need some check for avoid more shrink_zone() */ | 2330 | /* need some check for avoid more shrink_zone() */ |
| 2363 | } | 2331 | } |
| 2364 | 2332 | ||
| @@ -2952,6 +2920,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
| 2952 | { | 2920 | { |
| 2953 | int i; | 2921 | int i; |
| 2954 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2922 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
| 2923 | unsigned long nr_soft_reclaimed; | ||
| 2924 | unsigned long nr_soft_scanned; | ||
| 2955 | struct scan_control sc = { | 2925 | struct scan_control sc = { |
| 2956 | .gfp_mask = GFP_KERNEL, | 2926 | .gfp_mask = GFP_KERNEL, |
| 2957 | .priority = DEF_PRIORITY, | 2927 | .priority = DEF_PRIORITY, |
| @@ -3066,6 +3036,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
| 3066 | 3036 | ||
| 3067 | sc.nr_scanned = 0; | 3037 | sc.nr_scanned = 0; |
| 3068 | 3038 | ||
| 3039 | nr_soft_scanned = 0; | ||
| 3040 | /* | ||
| 3041 | * Call soft limit reclaim before calling shrink_zone. | ||
| 3042 | */ | ||
| 3043 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
| 3044 | order, sc.gfp_mask, | ||
| 3045 | &nr_soft_scanned); | ||
| 3046 | sc.nr_reclaimed += nr_soft_reclaimed; | ||
| 3047 | |||
| 3069 | /* | 3048 | /* |
| 3070 | * There should be no need to raise the scanning | 3049 | * There should be no need to raise the scanning |
| 3071 | * priority if enough pages are already being scanned | 3050 | * priority if enough pages are already being scanned |
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 47016c304c84..66cad506b8a2 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
| @@ -3975,8 +3975,8 @@ sub string_find_replace { | |||
| 3975 | # check for new externs in .h files. | 3975 | # check for new externs in .h files. |
| 3976 | if ($realfile =~ /\.h$/ && | 3976 | if ($realfile =~ /\.h$/ && |
| 3977 | $line =~ /^\+\s*(extern\s+)$Type\s*$Ident\s*\(/s) { | 3977 | $line =~ /^\+\s*(extern\s+)$Type\s*$Ident\s*\(/s) { |
| 3978 | if (WARN("AVOID_EXTERNS", | 3978 | if (CHK("AVOID_EXTERNS", |
| 3979 | "extern prototypes should be avoided in .h files\n" . $herecurr) && | 3979 | "extern prototypes should be avoided in .h files\n" . $herecurr) && |
| 3980 | $fix) { | 3980 | $fix) { |
| 3981 | $fixed[$linenr - 1] =~ s/(.*)\bextern\b\s*(.*)/$1$2/; | 3981 | $fixed[$linenr - 1] =~ s/(.*)\bextern\b\s*(.*)/$1$2/; |
| 3982 | } | 3982 | } |
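The checkpatch change demotes the AVOID_EXTERNS message from WARN to CHK, which in checkpatch terms means it is reported as a "CHECK" and, in the usual configuration, only when --strict is given. A made-up header fragment (not taken from any real kernel header) showing the kind of declaration the regex matches and what the fixer rewrites it to:

```c
/* foo.h -- hypothetical header, used only to illustrate the check */
struct foo_device;

/* Matches checkpatch's AVOID_EXTERNS test (now a CHK rather than a WARN): */
extern int foo_init(struct foo_device *dev);

/* With the fix applied, the 'extern' keyword is simply dropped: */
int foo_init(struct foo_device *dev);
```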
