author		Linus Torvalds <torvalds@linux-foundation.org>	2013-09-24 20:00:35 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-24 20:00:35 -0400
commit		a153e67bda3639a46edac6205610ae63c0fdea4c (patch)
tree		4169223a18acde55f624666ad3e836d957fc60d1
parent		e288e931c12e77982aa582e2380cd072becfe488 (diff)
parent		497a045d13dcd7a00f5535ded1ebb49313d4a211 (diff)
Merge branch 'akpm' (patches from Andrew Morton)
Merge fixes from Andrew Morton:
 "Bunch of fixes.  And a reversion of mhocko's "Soft limit rework" patch
  series.

  This is actually your fault for opening the merge window when I was
  off racing ;) I didn't read the email thread before sending everything
  off.

  Johannes Weiner raised significant issues:

    http://www.spinics.net/lists/cgroups/msg08813.html

  and we agreed to back it all out"

I clearly need to be more aware of Andrew's racing schedule.

* akpm:
  MAINTAINERS: update mach-bcm related email address
  checkpatch: make extern in .h prototypes quieter
  cciss: fix info leak in cciss_ioctl32_passthru()
  cpqarray: fix info leak in ida_locked_ioctl()
  kernel/reboot.c: re-enable the function of variable reboot_default
  audit: fix endless wait in audit_log_start()
  revert "memcg, vmscan: integrate soft reclaim tighter with zone shrinking code"
  revert "memcg: get rid of soft-limit tree infrastructure"
  revert "vmscan, memcg: do softlimit reclaim also for targeted reclaim"
  revert "memcg: enhance memcg iterator to support predicates"
  revert "memcg: track children in soft limit excess to improve soft limit"
  revert "memcg, vmscan: do not attempt soft limit reclaim if it would not scan anything"
  revert "memcg: track all children over limit in the root"
  revert "memcg, vmscan: do not fall into reclaim-all pass too quickly"
  fs/ocfs2/super.c: use a bigger nodestr in ocfs2_dismount_volume
  watchdog: update watchdog_thresh properly
  watchdog: update watchdog attributes atomically
-rw-r--r--	MAINTAINERS	3
-rw-r--r--	drivers/block/cciss.c	1
-rw-r--r--	drivers/block/cpqarray.c	1
-rw-r--r--	fs/ocfs2/super.c	2
-rw-r--r--	include/linux/memcontrol.h	55
-rw-r--r--	include/linux/smp.h	6
-rw-r--r--	kernel/audit.c	5
-rw-r--r--	kernel/reboot.c	9
-rw-r--r--	kernel/watchdog.c	60
-rw-r--r--	mm/memcontrol.c	560
-rw-r--r--	mm/vmscan.c	83
-rwxr-xr-x	scripts/checkpatch.pl	4
12 files changed, 527 insertions(+), 262 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index e61c2e83fc2b..b2f857c7ecf6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1812,7 +1812,8 @@ S: Supported
 F:	drivers/net/ethernet/broadcom/bnx2x/
 
 BROADCOM BCM281XX/BCM11XXX ARM ARCHITECTURE
-M:	Christian Daudt <csd@broadcom.com>
+M:	Christian Daudt <bcm@fixthebug.org>
+L:	bcm-kernel-feedback-list@broadcom.com
 T:	git git://git.github.com/broadcom/bcm11351
 S:	Maintained
 F:	arch/arm/mach-bcm/
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index d2d95ff5353b..edfa2515bc86 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1189,6 +1189,7 @@ static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode,
 	int err;
 	u32 cp;
 
+	memset(&arg64, 0, sizeof(arg64));
 	err = 0;
 	err |=
 	    copy_from_user(&arg64.LUN_info, &arg32->LUN_info,
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 639d26b90b91..2b9440384536 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -1193,6 +1193,7 @@ out_passthru:
 	ida_pci_info_struct pciinfo;
 
 	if (!arg) return -EINVAL;
+	memset(&pciinfo, 0, sizeof(pciinfo));
 	pciinfo.bus = host->pci_dev->bus->number;
 	pciinfo.dev_fn = host->pci_dev->devfn;
 	pciinfo.board_id = host->board_id;
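
Both block-driver fixes above close the same class of bug: an ioctl reply struct is built on the kernel stack and later copied out with copy_to_user(), so padding bytes and any fields the handler never assigns would leak stale kernel stack contents to user space. Zeroing the whole struct before filling it in is the standard remedy. A minimal userspace analogue (the struct below is a hypothetical stand-in, not the drivers' real layout):

	#include <stdio.h>
	#include <string.h>

	/* Stand-in for a driver ioctl reply; the implicit padding after
	 * "bus" models the bytes that leaked in cciss/cpqarray. */
	struct pci_info_reply {
		unsigned char bus;	/* 3 padding bytes typically follow */
		unsigned int dev_fn;
		unsigned int board_id;
	};

	int main(void)
	{
		struct pci_info_reply r;

		/* Without this memset the padding holds whatever was on the
		 * stack, and the write below (standing in for copy_to_user())
		 * would hand those bytes to user space. */
		memset(&r, 0, sizeof(r));
		r.bus = 1;
		r.dev_fn = 0x20;
		r.board_id = 0x40800E11;
		fwrite(&r, sizeof(r), 1, stdout);
		return 0;
	}
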
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 121da2dc3be8..d4e81e4a9b04 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1924,7 +1924,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 {
 	int tmp, hangup_needed = 0;
 	struct ocfs2_super *osb = NULL;
-	char nodestr[8];
+	char nodestr[12];
 
 	trace_ocfs2_dismount_volume(sb);
 
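
The dismount message formats the cluster node number into nodestr, and a 32-bit node number can need up to ten decimal digits plus the terminating NUL, which does not fit in eight bytes; twelve covers the worst case. A quick check of the arithmetic (assuming the number is printed with "%u"; the exact format ocfs2 uses may differ):

	#include <stdio.h>

	int main(void)
	{
		char nodestr[12];
		unsigned int node_num = 4294967295u;	/* UINT_MAX, 10 digits */

		/* 10 digits + '\0' = 11 bytes; snprintf into an 8-byte
		 * buffer would truncate any node number >= 10000000. */
		int n = snprintf(nodestr, sizeof(nodestr), "%u", node_num);
		printf("%s (%d bytes needed)\n", nodestr, n + 1);
		return 0;
	}
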
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 60e95872da29..ecc82b37c4cc 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -53,23 +53,6 @@ struct mem_cgroup_reclaim_cookie {
 	unsigned int generation;
 };
 
-enum mem_cgroup_filter_t {
-	VISIT,		/* visit current node */
-	SKIP,		/* skip the current node and continue traversal */
-	SKIP_TREE,	/* skip the whole subtree and continue traversal */
-};
-
-/*
- * mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to
- * iterate through the hierarchy tree. Each tree element is checked by the
- * predicate before it is returned by the iterator. If a filter returns
- * SKIP or SKIP_TREE then the iterator code continues traversal (with the
- * next node down the hierarchy or the next node that doesn't belong under the
- * memcg's subtree).
- */
-typedef enum mem_cgroup_filter_t
-(*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root);
-
 #ifdef CONFIG_MEMCG
 /*
  * All "charge" functions with gfp_mask should use GFP_KERNEL or
@@ -137,18 +120,9 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
 extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 	struct page *oldpage, struct page *newpage, bool migration_ok);
 
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
-				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim,
-				   mem_cgroup_iter_filter cond);
-
-static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
-				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim)
-{
-	return mem_cgroup_iter_cond(root, prev, reclaim, NULL);
-}
-
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
+				   struct mem_cgroup *,
+				   struct mem_cgroup_reclaim_cookie *);
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
 
 /*
@@ -260,9 +234,9 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 	mem_cgroup_update_page_stat(page, idx, -1);
 }
 
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-		struct mem_cgroup *root);
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+						gfp_t gfp_mask,
+						unsigned long *total_scanned);
 
 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
@@ -376,15 +350,6 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 		struct page *oldpage, struct page *newpage, bool migration_ok)
 {
 }
-static inline struct mem_cgroup *
-mem_cgroup_iter_cond(struct mem_cgroup *root,
-		struct mem_cgroup *prev,
-		struct mem_cgroup_reclaim_cookie *reclaim,
-		mem_cgroup_iter_filter cond)
-{
-	/* first call must return non-NULL, second return NULL */
-	return (struct mem_cgroup *)(unsigned long)!prev;
-}
 
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
@@ -471,11 +436,11 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 }
 
 static inline
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-		struct mem_cgroup *root)
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
 {
-	return VISIT;
+	return 0;
 }
 
 static inline void mem_cgroup_split_huge_fixup(struct page *head)
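
The header keeps the usual memcg convention: each entry point is declared once under CONFIG_MEMCG and again as a static inline no-op for !CONFIG_MEMCG builds, so callers such as mm/vmscan.c never need #ifdefs; the revert restores mem_cgroup_soft_limit_reclaim() in both halves, with the stub reporting zero pages reclaimed. A stripped-down sketch of the idiom (names here are hypothetical, not the real API):

	/* foo.h -- config-gated API with a no-op fallback */
	#ifdef CONFIG_FOO
	unsigned long foo_reclaim(int order, unsigned long *total_scanned);
	#else
	static inline unsigned long foo_reclaim(int order,
						unsigned long *total_scanned)
	{
		return 0;	/* nothing reclaimed; counter untouched */
	}
	#endif
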
diff --git a/include/linux/smp.h b/include/linux/smp.h
index cfb7ca094b38..731f5237d5f4 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -155,6 +155,12 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func,
 
 static inline void kick_all_cpus_sync(void) {  }
 
+static inline void __smp_call_function_single(int cpuid,
+		struct call_single_data *data, int wait)
+{
+	on_each_cpu(data->func, data->info, wait);
+}
+
 #endif /* !SMP */
 
 /*
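
On a uniprocessor build there is only one CPU, so the new stub lets SMP-oriented callers such as kernel/watchdog.c below say "run this function on CPU n" without an #ifdef: on_each_cpu() simply runs the callback locally. A sketch of a caller, assuming hypothetical my_func/my_arg:

	/* On SMP, csd.func runs on the target CPU; on !SMP it runs here. */
	struct call_single_data csd = { .func = my_func, .info = my_arg };

	__smp_call_function_single(cpu, &csd, 1);	/* 1 == wait for it */
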
diff --git a/kernel/audit.c b/kernel/audit.c
index 91e53d04b6a9..7b0e23a740ce 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1117,9 +1117,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 
 			sleep_time = timeout_start + audit_backlog_wait_time -
 					jiffies;
-			if ((long)sleep_time > 0)
+			if ((long)sleep_time > 0) {
 				wait_for_auditd(sleep_time);
-			continue;
+				continue;
+			}
 		}
 		if (audit_rate_check() && printk_ratelimit())
 			printk(KERN_WARNING
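
Before this fix the `continue` sat outside the `if`, so once the backlog timeout had fully expired the loop still jumped straight back to waiting instead of falling through to the rate-limited warning and failure path, and a caller could block forever. With the braces, the retry happens only while sleep time remains. A runnable toy model of the control-flow bug (illustrative; none of this is the real audit code):

	#include <stdio.h>

	int main(void)
	{
		int sleep_time = 3;

		while (1) {
			/* Buggy shape: unbraced if plus unconditional continue:
			 *
			 *	if (sleep_time > 0)
			 *		wait(sleep_time);
			 *	continue;		<-- loops forever
			 *
			 * Fixed shape, as in the patch: */
			if (sleep_time > 0) {
				printf("waiting (%d left)\n", sleep_time--);
				continue;
			}
			break;	/* timeout expired: fall through and give up */
		}
		printf("gave up instead of waiting endlessly\n");
		return 0;
	}
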
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 269ed9384cc4..f813b3474646 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -32,7 +32,14 @@ EXPORT_SYMBOL(cad_pid);
 #endif
 enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
 
-int reboot_default;
+/*
+ * This variable is used privately to keep track of whether or not
+ * reboot_type is still set to its default value (i.e., reboot= hasn't
+ * been set on the command line).  This is needed so that we can
+ * suppress DMI scanning for reboot quirks.  Without it, it's
+ * impossible to override a faulty reboot quirk without recompiling.
+ */
+int reboot_default = 1;
 int reboot_cpu;
 enum reboot_type reboot_type = BOOT_ACPI;
 int reboot_force;
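
reboot_default now starts as 1 and is cleared once reboot= is parsed from the command line, so DMI reboot quirks can be applied only while the reboot type is still the compiled-in default, and a user can override a faulty quirk at boot. A sketch of the intended guard (illustrative; the real quirk callbacks live in the arch code and may differ):

	/* hypothetical DMI quirk callback */
	static int quirk_set_bios_reboot(const struct dmi_system_id *d)
	{
		if (!reboot_default)
			return 0;		/* user passed reboot=, obey it */
		reboot_type = BOOT_BIOS;	/* apply the board quirk */
		return 0;
	}
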
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 51c4f34d258e..4431610f049a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -486,7 +486,52 @@ static struct smp_hotplug_thread watchdog_threads = {
 	.unpark			= watchdog_enable,
 };
 
-static int watchdog_enable_all_cpus(void)
+static void restart_watchdog_hrtimer(void *info)
+{
+	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+	int ret;
+
+	/*
+	 * No need to cancel and restart hrtimer if it is currently executing
+	 * because it will reprogram itself with the new period now.
+	 * We should never see it unqueued here because we are running per-cpu
+	 * with interrupts disabled.
+	 */
+	ret = hrtimer_try_to_cancel(hrtimer);
+	if (ret == 1)
+		hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+				HRTIMER_MODE_REL_PINNED);
+}
+
+static void update_timers(int cpu)
+{
+	struct call_single_data data = {.func = restart_watchdog_hrtimer};
+	/*
+	 * Make sure that perf event counter will adopt to a new
+	 * sampling period. Updating the sampling period directly would
+	 * be much nicer but we do not have an API for that now so
+	 * let's use a big hammer.
+	 * Hrtimer will adopt the new period on the next tick but this
+	 * might be late already so we have to restart the timer as well.
+	 */
+	watchdog_nmi_disable(cpu);
+	__smp_call_function_single(cpu, &data, 1);
+	watchdog_nmi_enable(cpu);
+}
+
+static void update_timers_all_cpus(void)
+{
+	int cpu;
+
+	get_online_cpus();
+	preempt_disable();
+	for_each_online_cpu(cpu)
+		update_timers(cpu);
+	preempt_enable();
+	put_online_cpus();
+}
+
+static int watchdog_enable_all_cpus(bool sample_period_changed)
 {
 	int err = 0;
 
@@ -496,6 +541,8 @@ static int watchdog_enable_all_cpus(void)
 		pr_err("Failed to create watchdog threads, disabled\n");
 	else
 		watchdog_running = 1;
+	} else if (sample_period_changed) {
+		update_timers_all_cpus();
 	}
 
 	return err;
@@ -520,13 +567,15 @@ int proc_dowatchdog(struct ctl_table *table, int write,
 		       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int err, old_thresh, old_enabled;
+	static DEFINE_MUTEX(watchdog_proc_mutex);
 
+	mutex_lock(&watchdog_proc_mutex);
 	old_thresh = ACCESS_ONCE(watchdog_thresh);
 	old_enabled = ACCESS_ONCE(watchdog_user_enabled);
 
 	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (err || !write)
-		return err;
+		goto out;
 
 	set_sample_period();
 	/*
@@ -535,7 +584,7 @@ int proc_dowatchdog(struct ctl_table *table, int write,
 	 * watchdog_*_all_cpus() function takes care of this.
 	 */
 	if (watchdog_user_enabled && watchdog_thresh)
-		err = watchdog_enable_all_cpus();
+		err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
 	else
 		watchdog_disable_all_cpus();
 
@@ -544,7 +593,8 @@ int proc_dowatchdog(struct ctl_table *table, int write,
 		watchdog_thresh = old_thresh;
 		watchdog_user_enabled = old_enabled;
 	}
-
+out:
+	mutex_unlock(&watchdog_proc_mutex);
 	return err;
 }
 #endif /* CONFIG_SYSCTL */
@@ -554,5 +604,5 @@ void __init lockup_detector_init(void)
 	set_sample_period();
 
 	if (watchdog_user_enabled)
-		watchdog_enable_all_cpus();
+		watchdog_enable_all_cpus(false);
 }
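
Taken together, the two watchdog patches make a sysctl write both take effect immediately, since the perf NMI event and the per-CPU hrtimer are torn down and restarted with the new sample period on every online CPU, and safe against concurrent writers, since the whole read-modify-write of watchdog_thresh and watchdog_user_enabled now runs under watchdog_proc_mutex. For example, lowering the soft-lockup threshold at runtime:

	# echo 5 > /proc/sys/kernel/watchdog_thresh

now reprograms every CPU's timer right away instead of letting each timer finish one more tick at the old, possibly much longer, period.
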
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce13029..1c52ddbc839b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -160,6 +161,10 @@ struct mem_cgroup_per_zone {
 
 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
+	struct rb_node		tree_node;	/* RB tree node */
+	unsigned long long	usage_in_excess;/* Set to the value by which */
+						/* the soft limit is exceeded*/
+	bool			on_tree;
 	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
 						/* use container_of	   */
 };
@@ -168,6 +173,26 @@ struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
 struct mem_cgroup_threshold {
 	struct eventfd_ctx *eventfd;
 	u64 threshold;
@@ -303,22 +328,6 @@ struct mem_cgroup {
 	atomic_t	numainfo_events;
 	atomic_t	numainfo_updating;
 #endif
-	/*
-	 * Protects soft_contributed transitions.
-	 * See mem_cgroup_update_soft_limit
-	 */
-	spinlock_t soft_lock;
-
-	/*
-	 * If true then this group has increased parents' children_in_excess
-	 * when it got over the soft limit.
-	 * When a group falls bellow the soft limit, parents' children_in_excess
-	 * is decreased and soft_contributed changed to false.
-	 */
-	bool soft_contributed;
-
-	/* Number of children that are in soft limit excess */
-	atomic_t children_in_excess;
 
 	struct mem_cgroup_per_node *nodeinfo[0];
 	/* WARNING: nodeinfo must be the last member here */
@@ -422,6 +431,7 @@ static bool move_file(void)
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
+#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -648,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 	return mem_cgroup_zoneinfo(memcg, nid, zid);
 }
 
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz,
+				unsigned long long new_usage_in_excess)
+{
+	struct rb_node **p = &mctz->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup_per_zone *mz_node;
+
+	if (mz->on_tree)
+		return;
+
+	mz->usage_in_excess = new_usage_in_excess;
+	if (!mz->usage_in_excess)
+		return;
+	while (*p) {
+		parent = *p;
+		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+					tree_node);
+		if (mz->usage_in_excess < mz_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount
+		 */
+		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mz->tree_node, parent, p);
+	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+	spin_unlock(&mctz->lock);
+}
+
+
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+	unsigned long long excess;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+	mctz = soft_limit_tree_from_page(page);
+
+	/*
+	 * Necessary to update all ancestors when hierarchy is used.
+	 * because their event counter is not touched.
+	 */
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+		excess = res_counter_soft_limit_excess(&memcg->res);
+		/*
+		 * We have to update the tree if mz is on RB-tree or
+		 * mem is over its softlimit.
+		 */
+		if (excess || mz->on_tree) {
+			spin_lock(&mctz->lock);
+			/* if on-tree, remove it */
+			if (mz->on_tree)
+				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+			/*
+			 * Insert again. mz->usage_in_excess will be updated.
+			 * If excess is 0, no tree ops.
+			 */
+			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+			spin_unlock(&mctz->lock);
+		}
+	}
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+	int node, zone;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	for_each_node(node) {
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			mz = mem_cgroup_zoneinfo(memcg, node, zone);
+			mctz = soft_limit_tree_node_zone(node, zone);
+			mem_cgroup_remove_exceeded(memcg, mz, mctz);
+		}
+	}
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz;
+
+retry:
+	mz = NULL;
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back,
+	 * we will to add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+		!css_tryget(&mz->memcg->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
+
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
@@ -822,48 +990,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 }
 
 /*
- * Called from rate-limited memcg_check_events when enough
- * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
- * that all the parents up the hierarchy will be notified that this group
- * is in excess or that it is not in excess anymore. mmecg->soft_contributed
- * makes the transition a single action whenever the state flips from one to
- * the other.
- */
-static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
-{
-	unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
-	struct mem_cgroup *parent = memcg;
-	int delta = 0;
-
-	spin_lock(&memcg->soft_lock);
-	if (excess) {
-		if (!memcg->soft_contributed) {
-			delta = 1;
-			memcg->soft_contributed = true;
-		}
-	} else {
-		if (memcg->soft_contributed) {
-			delta = -1;
-			memcg->soft_contributed = false;
-		}
-	}
-
-	/*
-	 * Necessary to update all ancestors when hierarchy is used
-	 * because their event counter is not touched.
-	 * We track children even outside the hierarchy for the root
-	 * cgroup because tree walk starting at root should visit
-	 * all cgroups and we want to prevent from pointless tree
-	 * walk if no children is below the limit.
-	 */
-	while (delta && (parent = parent_mem_cgroup(parent)))
-		atomic_add(delta, &parent->children_in_excess);
-	if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-		atomic_add(delta, &root_mem_cgroup->children_in_excess);
-	spin_unlock(&memcg->soft_lock);
-}
-
-/*
  * Check events in order.
  *
  */
@@ -886,7 +1012,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 
 	mem_cgroup_threshold(memcg);
 	if (unlikely(do_softlimit))
-		mem_cgroup_update_soft_limit(memcg);
+		mem_cgroup_update_tree(memcg, page);
 #if MAX_NUMNODES > 1
 	if (unlikely(do_numainfo))
 		atomic_inc(&memcg->numainfo_events);
@@ -929,15 +1055,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
-static enum mem_cgroup_filter_t
-mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
-		mem_cgroup_iter_filter cond)
-{
-	if (!cond)
-		return VISIT;
-	return cond(memcg, root);
-}
-
 /*
  * Returns a next (in a pre-order walk) alive memcg (with elevated css
  * ref. count) or NULL if the whole root's subtree has been visited.
@@ -945,7 +1062,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
  * helper function to be used by mem_cgroup_iter
  */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
+		struct mem_cgroup *last_visited)
 {
 	struct cgroup_subsys_state *prev_css, *next_css;
 
@@ -963,31 +1080,11 @@ skip_node:
 	if (next_css) {
 		struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
 
-		switch (mem_cgroup_filter(mem, root, cond)) {
-		case SKIP:
+		if (css_tryget(&mem->css))
+			return mem;
+		else {
 			prev_css = next_css;
 			goto skip_node;
-		case SKIP_TREE:
-			if (mem == root)
-				return NULL;
-			/*
-			 * css_rightmost_descendant is not an optimal way to
-			 * skip through a subtree (especially for imbalanced
-			 * trees leaning to right) but that's what we have right
-			 * now. More effective solution would be traversing
-			 * right-up for first non-NULL without calling
-			 * css_next_descendant_pre afterwards.
-			 */
-			prev_css = css_rightmost_descendant(next_css);
-			goto skip_node;
-		case VISIT:
-			if (css_tryget(&mem->css))
-				return mem;
-			else {
-				prev_css = next_css;
-				goto skip_node;
-			}
-			break;
 		}
 	}
 
@@ -1051,7 +1148,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
- * @cond: filter for visited nodes, NULL for no filter
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
@@ -1064,18 +1160,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  */
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim,
-				   mem_cgroup_iter_filter cond)
+				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
 
-	if (mem_cgroup_disabled()) {
-		/* first call must return non-NULL, second return NULL */
-		return (struct mem_cgroup *)(unsigned long)!prev;
-	}
+	if (mem_cgroup_disabled())
+		return NULL;
 
 	if (!root)
 		root = root_mem_cgroup;
@@ -1086,9 +1179,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
 			goto out_css_put;
-		if (mem_cgroup_filter(root, root, cond) == VISIT)
-			return root;
-		return NULL;
+		return root;
 	}
 
 	rcu_read_lock();
@@ -1111,7 +1202,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 			last_visited = mem_cgroup_iter_load(iter, root, &seq);
 		}
 
-		memcg = __mem_cgroup_iter_next(root, last_visited, cond);
+		memcg = __mem_cgroup_iter_next(root, last_visited);
 
 		if (reclaim) {
 			mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1122,11 +1213,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 			reclaim->generation = iter->generation;
 		}
 
-		/*
-		 * We have finished the whole tree walk or no group has been
-		 * visited because filter told us to skip the root node.
-		 */
-		if (!memcg && (prev || (cond && !last_visited)))
+		if (prev && !memcg)
 			goto out_unlock;
 	}
 out_unlock:
@@ -1767,7 +1854,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	return total;
 }
 
-#if MAX_NUMNODES > 1
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1790,6 +1876,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 		return false;
 
 }
+#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -1857,50 +1944,104 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	return node;
 }
 
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_node.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(memcg->scan_nodes)) {
+		for (nid = first_node(memcg->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, memcg->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check rest of nodes.
+	 */
+	for_each_node_state(nid, N_MEMORY) {
+		if (node_isset(nid, memcg->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	return 0;
 }
 
-#endif
-
-/*
- * A group is eligible for the soft limit reclaim under the given root
- * hierarchy if
- *	a) it is over its soft limit
- *	b) any parent up the hierarchy is over its soft limit
- *
- * If the given group doesn't have any children over the limit then it
- * doesn't make any sense to iterate its subtree.
- */
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-		struct mem_cgroup *root)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
-	struct mem_cgroup *parent;
-
-	if (!memcg)
-		memcg = root_mem_cgroup;
-	parent = memcg;
-
-	if (res_counter_soft_limit_excess(&memcg->res))
-		return VISIT;
+	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
+#endif
 
-	/*
-	 * If any parent up to the root in the hierarchy is over its soft limit
-	 * then we have to obey and reclaim from this group as well.
-	 */
-	while ((parent = parent_mem_cgroup(parent))) {
-		if (res_counter_soft_limit_excess(&parent->res))
-			return VISIT;
-		if (parent == root)
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+				   struct zone *zone,
+				   gfp_t gfp_mask,
+				   unsigned long *total_scanned)
+{
+	struct mem_cgroup *victim = NULL;
+	int total = 0;
+	int loop = 0;
+	unsigned long excess;
+	unsigned long nr_scanned;
+	struct mem_cgroup_reclaim_cookie reclaim = {
+		.zone = zone,
+		.priority = 0,
+	};
+
+	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+
+	while (1) {
+		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+		if (!victim) {
+			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might because there are
+				 * no reclaimable pages under this hierarchy
+				 */
+				if (!total)
+					break;
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is not to excessive so as to
+				 * reclaim too much, nor too less that we keep
+				 * coming back to reclaim from this cgroup
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+					break;
+			}
+			continue;
+		}
+		if (!mem_cgroup_reclaimable(victim, false))
+			continue;
+		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+						     zone, &nr_scanned);
+		*total_scanned += nr_scanned;
+		if (!res_counter_soft_limit_excess(&root_memcg->res))
 			break;
 	}
-
-	if (!atomic_read(&memcg->children_in_excess))
-		return SKIP_TREE;
-	return SKIP;
+	mem_cgroup_iter_break(root_memcg, victim);
+	return total;
 }
 
 static DEFINE_SPINLOCK(memcg_oom_lock);
@@ -2812,7 +2953,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	unlock_page_cgroup(pc);
 
 	/*
-	 * "charge_statistics" updated event counter.
+	 * "charge_statistics" updated event counter. Then, check it.
+	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+	 * if they exceeds softlimit.
 	 */
 	memcg_check_events(memcg, page);
 }
@@ -4647,6 +4790,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_zone *mctz;
+	unsigned long long excess;
+	unsigned long nr_scanned;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+	/*
+	 * This loop can run a while, specially if mem_cgroup's continuously
+	 * keep exceeding their soft limit and putting the system under
+	 * pressure
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		nr_scanned = 0;
+		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+						    gfp_mask, &nr_scanned);
+		nr_reclaimed += reclaimed;
+		*total_scanned += nr_scanned;
+		spin_lock(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup
+		 * it is time to move on to the next cgroup
+		 */
+		next_mz = NULL;
+		if (!reclaimed) {
+			do {
+				/*
+				 * Loop until we find yet another one.
+				 *
+				 * By the time we get the soft_limit lock
+				 * again, someone might have aded the
+				 * group back on the RB tree. Iterate to
+				 * make sure we get a different mem.
+				 * mem_cgroup_largest_soft_limit_node returns
+				 * NULL if no other cgroup is present on
+				 * the tree
+				 */
+				next_mz =
+				__mem_cgroup_largest_soft_limit_node(mctz);
+				if (next_mz == mz)
+					css_put(&next_mz->memcg->css);
+				else /* next_mz == NULL or other memcg */
+					break;
+			} while (1);
+		}
+		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+		excess = res_counter_soft_limit_excess(&mz->memcg->res);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0, simply because due
+		 * to priority we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		/* If excess == 0, no tree ops */
+		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+		spin_unlock(&mctz->lock);
+		css_put(&mz->memcg->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+			(next_mz == NULL ||
+			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->memcg->css);
+	return nr_reclaimed;
+}
+
 /**
  * mem_cgroup_force_empty_list - clears LRU of a group
  * @memcg: group to clear
@@ -5911,6 +6146,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		lruvec_init(&mz->lruvec);
+		mz->usage_in_excess = 0;
+		mz->on_tree = false;
 		mz->memcg = memcg;
 	}
 	memcg->nodeinfo[node] = pn;
@@ -5966,6 +6203,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	int node;
 	size_t size = memcg_size();
 
+	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);
 
 	for_each_node(node)
@@ -6002,6 +6240,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
+static void __init mem_cgroup_soft_limit_tree_init(void)
+{
+	struct mem_cgroup_tree_per_node *rtpn;
+	struct mem_cgroup_tree_per_zone *rtpz;
+	int tmp, node, zone;
+
+	for_each_node(node) {
+		tmp = node;
+		if (!node_state(node, N_NORMAL_MEMORY))
+			tmp = -1;
+		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+		BUG_ON(!rtpn);
+
+		soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			rtpz = &rtpn->rb_tree_per_zone[zone];
+			rtpz->rb_root = RB_ROOT;
+			spin_lock_init(&rtpz->lock);
+		}
+	}
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -6031,7 +6292,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
 	vmpressure_init(&memcg->vmpressure);
-	spin_lock_init(&memcg->soft_lock);
 
 	return &memcg->css;
 
@@ -6109,13 +6369,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	mem_cgroup_invalidate_reclaim_iterators(memcg);
 	mem_cgroup_reparent_charges(memcg);
-	if (memcg->soft_contributed) {
-		while ((memcg = parent_mem_cgroup(memcg)))
-			atomic_dec(&memcg->children_in_excess);
-
-		if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-			atomic_dec(&root_mem_cgroup->children_in_excess);
-	}
 	mem_cgroup_destroy_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
@@ -6790,6 +7043,7 @@ static int __init mem_cgroup_init(void)
 {
 	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	enable_swap_cgroup();
+	mem_cgroup_soft_limit_tree_init();
 	memcg_stock_init();
 	return 0;
 }
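
The restored infrastructure keeps every memcg that exceeds its soft limit on a per-node, per-zone red-black tree keyed by usage_in_excess, so rb_last() hands back the worst offender at O(log n) maintenance cost, and reclaim repeatedly squeezes that group, reinserting it with its updated excess. A small self-contained model of the victim-selection policy (plain array instead of an rbtree, purely illustrative):

	#include <stdio.h>

	struct group { const char *name; unsigned long long excess; };

	/* stands in for __mem_cgroup_largest_soft_limit_node(): pick the
	 * group that is furthest over its soft limit */
	static struct group *largest(struct group *g, int n)
	{
		struct group *best = NULL;
		int i;

		for (i = 0; i < n; i++)
			if (g[i].excess && (!best || g[i].excess > best->excess))
				best = &g[i];
		return best;
	}

	int main(void)
	{
		struct group groups[] = {
			{ "a", 4096 }, { "b", 1 << 20 }, { "c", 0 },
		};
		struct group *victim;

		/* reclaim loop: always squeeze the biggest offender first,
		 * re-evaluating after every pass, as soft limit reclaim does */
		while ((victim = largest(groups, 3))) {
			printf("reclaiming from %s (excess %llu)\n",
			       victim->name, victim->excess);
			victim->excess /= 2;	/* model a successful pass */
			if (victim->excess < 4096)
				victim->excess = 0;	/* back under limit */
		}
		return 0;
	}
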
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ed1b775bdc9..beb35778c69f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -139,23 +139,11 @@ static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
 }
-
-static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
-{
-	struct mem_cgroup *root = sc->target_mem_cgroup;
-	return !mem_cgroup_disabled() &&
-		mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE;
-}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
 }
-
-static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
-{
-	return false;
-}
 #endif
 
 unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -2176,11 +2164,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }
 
-static int
-__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
 {
 	unsigned long nr_reclaimed, nr_scanned;
-	int groups_scanned = 0;
 
 	do {
 		struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2188,17 +2174,15 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 			.zone = zone,
 			.priority = sc->priority,
 		};
-		struct mem_cgroup *memcg = NULL;
-		mem_cgroup_iter_filter filter = (soft_reclaim) ?
-			mem_cgroup_soft_reclaim_eligible : NULL;
+		struct mem_cgroup *memcg;
 
 		nr_reclaimed = sc->nr_reclaimed;
 		nr_scanned = sc->nr_scanned;
 
-		while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
+		memcg = mem_cgroup_iter(root, NULL, &reclaim);
+		do {
 			struct lruvec *lruvec;
 
-			groups_scanned++;
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 			shrink_lruvec(lruvec, sc);
@@ -2218,7 +2202,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 				mem_cgroup_iter_break(root, memcg);
 				break;
 			}
-		}
+			memcg = mem_cgroup_iter(root, memcg, &reclaim);
+		} while (memcg);
 
 		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
 			   sc->nr_scanned - nr_scanned,
@@ -2226,37 +2211,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 
 	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
 					 sc->nr_scanned - nr_scanned, sc));
-
-	return groups_scanned;
-}
-
-
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
-{
-	bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
-	unsigned long nr_scanned = sc->nr_scanned;
-	int scanned_groups;
-
-	scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim);
-	/*
-	 * memcg iterator might race with other reclaimer or start from
-	 * a incomplete tree walk so the tree walk in __shrink_zone
-	 * might have missed groups that are above the soft limit. Try
-	 * another loop to catch up with others. Do it just once to
-	 * prevent from reclaim latencies when other reclaimers always
-	 * preempt this one.
-	 */
-	if (do_soft_reclaim && !scanned_groups)
-		__shrink_zone(zone, sc, do_soft_reclaim);
-
-	/*
-	 * No group is over the soft limit or those that are do not have
-	 * pages in the zone we are reclaiming so we have to reclaim everybody
-	 */
-	if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) {
-		__shrink_zone(zone, sc, false);
-		return;
-	}
 }
 
 /* Returns true if compaction should go ahead for a high-order request */
@@ -2320,6 +2274,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
+	unsigned long nr_soft_reclaimed;
+	unsigned long nr_soft_scanned;
 	bool aborted_reclaim = false;
 
 	/*
@@ -2359,6 +2315,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 				continue;
 				}
 			}
+			/*
+			 * This steals pages from memory cgroups over softlimit
+			 * and returns the number of reclaimed pages and
+			 * scanned pages. This works for global memory pressure
+			 * and balancing, not for a memcg's limit.
+			 */
+			nr_soft_scanned = 0;
+			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+						sc->order, sc->gfp_mask,
+						&nr_soft_scanned);
+			sc->nr_reclaimed += nr_soft_reclaimed;
+			sc->nr_scanned += nr_soft_scanned;
 			/* need some check for avoid more shrink_zone() */
 		}
 
@@ -2952,6 +2920,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 {
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
+	unsigned long nr_soft_reclaimed;
+	unsigned long nr_soft_scanned;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.priority = DEF_PRIORITY,
@@ -3066,6 +3036,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 
 			sc.nr_scanned = 0;
 
+			nr_soft_scanned = 0;
+			/*
+			 * Call soft limit reclaim before calling shrink_zone.
+			 */
+			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+							order, sc.gfp_mask,
+							&nr_soft_scanned);
+			sc.nr_reclaimed += nr_soft_reclaimed;
+
 			/*
 			 * There should be no need to raise the scanning
 			 * priority if enough pages are already being scanned
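
With the reverts applied, global reclaim once again runs soft limit reclaim as an explicit first pass per zone, before the fair shrink_zone() walk, in both direct reclaim (shrink_zones) and kswapd (balance_pgdat). Reduced to its shape (illustrative pseudocode, not a literal excerpt):

	/* per zone, under global memory pressure: */
	nr_soft_scanned = 0;
	nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, order,
					gfp_mask, &nr_soft_scanned);
	sc->nr_reclaimed += nr_soft_reclaimed;	/* squeeze offenders first */
	sc->nr_scanned += nr_soft_scanned;
	shrink_zone(zone, sc);			/* then everyone, fairly */
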
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 47016c304c84..66cad506b8a2 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -3975,8 +3975,8 @@ sub string_find_replace {
 # check for new externs in .h files.
 	if ($realfile =~ /\.h$/ &&
 	    $line =~ /^\+\s*(extern\s+)$Type\s*$Ident\s*\(/s) {
-		if (WARN("AVOID_EXTERNS",
+		if (CHK("AVOID_EXTERNS",
 			 "extern prototypes should be avoided in .h files\n" . $herecurr) &&
 		    $fix) {
 			$fixed[$linenr - 1] =~ s/(.*)\bextern\b\s*(.*)/$1$2/;
 		}
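
Downgrading AVOID_EXTERNS from WARN to CHK means the message no longer appears in a default checkpatch run; CHK-level output is printed only when the script is invoked with --strict, for example (example.h is a placeholder):

	$ ./scripts/checkpatch.pl --strict --file include/linux/example.h

The advice stays available without every existing header tripping a warning.
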