 -rw-r--r--  fs/fs-writeback.c                |   4
 -rw-r--r--  fs/super.c                       |   2
 -rw-r--r--  include/linux/backing-dev.h      |   4
 -rw-r--r--  include/linux/flex_proportions.h | 101
 -rw-r--r--  lib/Makefile                     |   2
 -rw-r--r--  lib/flex_proportions.c           | 272
 -rw-r--r--  mm/backing-dev.c                 |   6
 -rw-r--r--  mm/page-writeback.c              | 107
 8 files changed, 448 insertions, 50 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8f660dd6137a..50d0b78130a1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -628,8 +628,8 @@ static long writeback_sb_inodes(struct super_block *sb,
                 }
 
                 /*
-                 * Don't bother with new inodes or inodes beeing freed, first
-                 * kind does not need peridic writeout yet, and for the latter
+                 * Don't bother with new inodes or inodes being freed, first
+                 * kind does not need periodic writeout yet, and for the latter
                  * kind writeout is handled by the freer.
                  */
                 spin_lock(&inode->i_lock);
diff --git a/fs/super.c b/fs/super.c
index c743fb3be4b8..4c5d82f56ec4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -320,7 +320,7 @@ static int grab_super(struct super_block *s) __releases(sb_lock)
 
 /*
  * grab_super_passive - acquire a passive reference
- * @s: reference we are trying to grab
+ * @sb: reference we are trying to grab
  *
  * Tries to acquire a passive reference. This is used in places where we
  * cannot take an active reference but we need to ensure that the
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b1038bd686ac..489de625cd25 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -10,7 +10,7 @@
 
 #include <linux/percpu_counter.h>
 #include <linux/log2.h>
-#include <linux/proportions.h>
+#include <linux/flex_proportions.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
@@ -89,7 +89,7 @@ struct backing_dev_info {
         unsigned long dirty_ratelimit;
         unsigned long balanced_dirty_ratelimit;
 
-        struct prop_local_percpu completions;
+        struct fprop_local_percpu completions;
         int dirty_exceeded;
 
         unsigned int min_ratio;
diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h
new file mode 100644
index 000000000000..4ebc49fae391
--- /dev/null
+++ b/include/linux/flex_proportions.h
@@ -0,0 +1,101 @@
+/*
+ * Floating proportions with flexible aging period
+ *
+ * Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz>
+ */
+
+#ifndef _LINUX_FLEX_PROPORTIONS_H
+#define _LINUX_FLEX_PROPORTIONS_H
+
+#include <linux/percpu_counter.h>
+#include <linux/spinlock.h>
+#include <linux/seqlock.h>
+
+/*
+ * When maximum proportion of some event type is specified, this is the
+ * precision with which we allow limiting. Note that this creates an upper
+ * bound on the number of events per period like
+ * ULLONG_MAX >> FPROP_FRAC_SHIFT.
+ */
+#define FPROP_FRAC_SHIFT 10
+#define FPROP_FRAC_BASE (1UL << FPROP_FRAC_SHIFT)
+
+/*
+ * ---- Global proportion definitions ----
+ */
+struct fprop_global {
+        /* Number of events in the current period */
+        struct percpu_counter events;
+        /* Current period */
+        unsigned int period;
+        /* Synchronization with period transitions */
+        seqcount_t sequence;
+};
+
+int fprop_global_init(struct fprop_global *p);
+void fprop_global_destroy(struct fprop_global *p);
+bool fprop_new_period(struct fprop_global *p, int periods);
+
+/*
+ * ---- SINGLE ----
+ */
+struct fprop_local_single {
+        /* the local events counter */
+        unsigned long events;
+        /* Period in which we last updated events */
+        unsigned int period;
+        raw_spinlock_t lock;    /* Protect period and numerator */
+};
+
+#define INIT_FPROP_LOCAL_SINGLE(name)                   \
+{       .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock),    \
+}
+
+int fprop_local_init_single(struct fprop_local_single *pl);
+void fprop_local_destroy_single(struct fprop_local_single *pl);
+void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl);
+void fprop_fraction_single(struct fprop_global *p,
+        struct fprop_local_single *pl, unsigned long *numerator,
+        unsigned long *denominator);
+
+static inline
+void fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl)
+{
+        unsigned long flags;
+
+        local_irq_save(flags);
+        __fprop_inc_single(p, pl);
+        local_irq_restore(flags);
+}
+
+/*
+ * ---- PERCPU ----
+ */
+struct fprop_local_percpu {
+        /* the local events counter */
+        struct percpu_counter events;
+        /* Period in which we last updated events */
+        unsigned int period;
+        raw_spinlock_t lock;    /* Protect period and numerator */
+};
+
+int fprop_local_init_percpu(struct fprop_local_percpu *pl);
+void fprop_local_destroy_percpu(struct fprop_local_percpu *pl);
+void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl);
+void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl,
+                            int max_frac);
+void fprop_fraction_percpu(struct fprop_global *p,
+        struct fprop_local_percpu *pl, unsigned long *numerator,
+        unsigned long *denominator);
+
+static inline
+void fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
+{
+        unsigned long flags;
+
+        local_irq_save(flags);
+        __fprop_inc_percpu(p, pl);
+        local_irq_restore(flags);
+}
+
+#endif
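
For orientation, here is a minimal sketch of how a user of this interface might wire it up: one fprop_global supplies the shared denominator and period, and each event type owns a local numerator. This is not part of the patch; the foo_* names are made up, kernel context is assumed, and only functions declared in the header above are used.

/* Illustrative sketch only -- not part of this patch; assumes kernel context. */
#include <linux/flex_proportions.h>

static struct fprop_global foo_events;          /* denominator: all events */
static struct fprop_local_percpu foo_read;      /* numerator for "read" events */
static struct fprop_local_percpu foo_write;     /* numerator for "write" events */

static int foo_init(void)
{
        int err = fprop_global_init(&foo_events);

        if (err)
                return err;
        err = fprop_local_init_percpu(&foo_read);
        if (err)
                goto out_global;
        err = fprop_local_init_percpu(&foo_write);
        if (err)
                goto out_read;
        return 0;
out_read:
        fprop_local_destroy_percpu(&foo_read);
out_global:
        fprop_global_destroy(&foo_events);
        return err;
}

/* Count one event of each type; fprop_inc_percpu() disables interrupts itself. */
static void foo_account_read(void)
{
        fprop_inc_percpu(&foo_events, &foo_read);
}

static void foo_account_write(void)
{
        fprop_inc_percpu(&foo_events, &foo_write);
}

/* Fraction of recent events that were reads, returned as *num / *den. */
static void foo_read_fraction(unsigned long *num, unsigned long *den)
{
        fprop_fraction_percpu(&foo_events, &foo_read, num, den);
}

/* Somebody (e.g. a timer) must declare new periods so that history decays. */
static void foo_age(void)
{
        fprop_new_period(&foo_events, 1);
}

The same pattern works with the *_single variants when a plain unsigned long counter is enough and per-CPU batching is not needed.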
diff --git a/lib/Makefile b/lib/Makefile
index 9cb4104f47d9..42d283edc4d3 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -11,7 +11,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
          rbtree.o radix-tree.o dump_stack.o timerqueue.o\
          idr.o int_sqrt.o extable.o prio_tree.o \
          sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
-         proportions.o prio_heap.o ratelimit.o show_mem.o \
+         proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
          is_single_threaded.o plist.o decompress.o
 
 lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c
new file mode 100644
index 000000000000..c785554f9523
--- /dev/null
+++ b/lib/flex_proportions.c
@@ -0,0 +1,272 @@
+/*
+ * Floating proportions with flexible aging period
+ *
+ * Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz>
+ *
+ * The goal of this code is: Given different types of event, measure proportion
+ * of each type of event over time. The proportions are measured with
+ * exponentially decaying history to give smooth transitions. A formula
+ * expressing proportion of event of type 'j' is:
+ *
+ *   p_{j} = (\Sum_{i>=0} x_{i,j}/2^{i+1})/(\Sum_{i>=0} x_i/2^{i+1})
+ *
+ * Where x_{i,j} is j's number of events in i-th last time period and x_i is
+ * total number of events in i-th last time period.
+ *
+ * Note that p_{j}'s are normalised, i.e.
+ *
+ *   \Sum_{j} p_{j} = 1,
+ *
+ * This formula can be straightforwardly computed by maintaining denominator
+ * (let's call it 'd') and for each event type its numerator (let's call it
+ * 'n_j'). When an event of type 'j' happens, we simply need to do:
+ *   n_j++; d++;
+ *
+ * When a new period is declared, we could do:
+ *   d /= 2
+ *   for each j
+ *     n_j /= 2
+ *
+ * To avoid iteration over all event types, we instead shift numerator of event
+ * j lazily when someone asks for a proportion of event j or when event j
+ * occurs. This can be trivially implemented by remembering the last period in
+ * which something happened with proportion of type j.
+ */
+#include <linux/flex_proportions.h>
+
+int fprop_global_init(struct fprop_global *p)
+{
+        int err;
+
+        p->period = 0;
+        /* Use 1 to avoid dealing with periods with 0 events... */
+        err = percpu_counter_init(&p->events, 1);
+        if (err)
+                return err;
+        seqcount_init(&p->sequence);
+        return 0;
+}
+
+void fprop_global_destroy(struct fprop_global *p)
+{
+        percpu_counter_destroy(&p->events);
+}
+
+/*
+ * Declare @periods new periods. It is up to the caller to make sure period
+ * transitions cannot happen in parallel.
+ *
+ * The function returns true if the proportions are still defined and false
+ * if aging zeroed out all events. This can be used to detect whether declaring
+ * further periods has any effect.
+ */
+bool fprop_new_period(struct fprop_global *p, int periods)
+{
+        u64 events;
+        unsigned long flags;
+
+        local_irq_save(flags);
+        events = percpu_counter_sum(&p->events);
+        /*
+         * Don't do anything if there are no events.
+         */
+        if (events <= 1) {
+                local_irq_restore(flags);
+                return false;
+        }
+        write_seqcount_begin(&p->sequence);
+        if (periods < 64)
+                events -= events >> periods;
+        /* Use addition to avoid losing events happening between sum and set */
+        percpu_counter_add(&p->events, -events);
+        p->period += periods;
+        write_seqcount_end(&p->sequence);
+        local_irq_restore(flags);
+
+        return true;
+}
+
+/*
+ * ---- SINGLE ----
+ */
+
+int fprop_local_init_single(struct fprop_local_single *pl)
+{
+        pl->events = 0;
+        pl->period = 0;
+        raw_spin_lock_init(&pl->lock);
+        return 0;
+}
+
+void fprop_local_destroy_single(struct fprop_local_single *pl)
+{
+}
+
+static void fprop_reflect_period_single(struct fprop_global *p,
+                                        struct fprop_local_single *pl)
+{
+        unsigned int period = p->period;
+        unsigned long flags;
+
+        /* Fast path - period didn't change */
+        if (pl->period == period)
+                return;
+        raw_spin_lock_irqsave(&pl->lock, flags);
+        /* Someone updated pl->period while we were spinning? */
+        if (pl->period >= period) {
+                raw_spin_unlock_irqrestore(&pl->lock, flags);
+                return;
+        }
+        /* Aging zeroed our fraction? */
+        if (period - pl->period < BITS_PER_LONG)
+                pl->events >>= period - pl->period;
+        else
+                pl->events = 0;
+        pl->period = period;
+        raw_spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/* Event of type pl happened */
+void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl)
+{
+        fprop_reflect_period_single(p, pl);
+        pl->events++;
+        percpu_counter_add(&p->events, 1);
+}
+
+/* Return fraction of events of type pl */
+void fprop_fraction_single(struct fprop_global *p,
+                           struct fprop_local_single *pl,
+                           unsigned long *numerator, unsigned long *denominator)
+{
+        unsigned int seq;
+        s64 num, den;
+
+        do {
+                seq = read_seqcount_begin(&p->sequence);
+                fprop_reflect_period_single(p, pl);
+                num = pl->events;
+                den = percpu_counter_read_positive(&p->events);
+        } while (read_seqcount_retry(&p->sequence, seq));
+
+        /*
+         * Make fraction <= 1 and denominator > 0 even in presence of percpu
+         * counter errors
+         */
+        if (den <= num) {
+                if (num)
+                        den = num;
+                else
+                        den = 1;
+        }
+        *denominator = den;
+        *numerator = num;
+}
+
+/*
+ * ---- PERCPU ----
+ */
+#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
+int fprop_local_init_percpu(struct fprop_local_percpu *pl)
+{
+        int err;
+
+        err = percpu_counter_init(&pl->events, 0);
+        if (err)
+                return err;
+        pl->period = 0;
+        raw_spin_lock_init(&pl->lock);
+        return 0;
+}
+
+void fprop_local_destroy_percpu(struct fprop_local_percpu *pl)
+{
+        percpu_counter_destroy(&pl->events);
+}
+
+static void fprop_reflect_period_percpu(struct fprop_global *p,
+                                        struct fprop_local_percpu *pl)
+{
+        unsigned int period = p->period;
+        unsigned long flags;
+
+        /* Fast path - period didn't change */
+        if (pl->period == period)
+                return;
+        raw_spin_lock_irqsave(&pl->lock, flags);
+        /* Someone updated pl->period while we were spinning? */
+        if (pl->period >= period) {
+                raw_spin_unlock_irqrestore(&pl->lock, flags);
+                return;
+        }
+        /* Aging zeroed our fraction? */
+        if (period - pl->period < BITS_PER_LONG) {
+                s64 val = percpu_counter_read(&pl->events);
+
+                if (val < (nr_cpu_ids * PROP_BATCH))
+                        val = percpu_counter_sum(&pl->events);
+
+                __percpu_counter_add(&pl->events,
+                        -val + (val >> (period-pl->period)), PROP_BATCH);
+        } else
+                percpu_counter_set(&pl->events, 0);
+        pl->period = period;
+        raw_spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/* Event of type pl happened */
+void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
+{
+        fprop_reflect_period_percpu(p, pl);
+        __percpu_counter_add(&pl->events, 1, PROP_BATCH);
+        percpu_counter_add(&p->events, 1);
+}
+
+void fprop_fraction_percpu(struct fprop_global *p,
+                           struct fprop_local_percpu *pl,
+                           unsigned long *numerator, unsigned long *denominator)
+{
+        unsigned int seq;
+        s64 num, den;
+
+        do {
+                seq = read_seqcount_begin(&p->sequence);
+                fprop_reflect_period_percpu(p, pl);
+                num = percpu_counter_read_positive(&pl->events);
+                den = percpu_counter_read_positive(&p->events);
+        } while (read_seqcount_retry(&p->sequence, seq));
+
+        /*
+         * Make fraction <= 1 and denominator > 0 even in presence of percpu
+         * counter errors
+         */
+        if (den <= num) {
+                if (num)
+                        den = num;
+                else
+                        den = 1;
+        }
+        *denominator = den;
+        *numerator = num;
+}
+
+/*
+ * Like __fprop_inc_percpu() except that event is counted only if the given
+ * type has fraction smaller than @max_frac/FPROP_FRAC_BASE
+ */
+void __fprop_inc_percpu_max(struct fprop_global *p,
+                            struct fprop_local_percpu *pl, int max_frac)
+{
+        if (unlikely(max_frac < FPROP_FRAC_BASE)) {
+                unsigned long numerator, denominator;
+
+                fprop_fraction_percpu(p, pl, &numerator, &denominator);
+                if (numerator >
+                    (((u64)denominator) * max_frac) >> FPROP_FRAC_SHIFT)
+                        return;
+        } else
+                fprop_reflect_period_percpu(p, pl);
+        __percpu_counter_add(&pl->events, 1, PROP_BATCH);
+        percpu_counter_add(&p->events, 1);
+}
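
The lazy aging in fprop_reflect_period_*() above is equivalent to the straightforward "halve everything at each period end" scheme described in the comment at the top of the file: the denominator is aged eagerly in fprop_new_period(), and each numerator is simply right-shifted by the number of periods it missed the next time it is read or incremented. A small standalone illustration of that arithmetic (plain userspace C with made-up numbers, not kernel code):

/* Illustration only: the lazy per-type shift gives the same proportions as
 * halving every counter at each period boundary.
 */
#include <stdio.h>

int main(void)
{
        unsigned long d = 0;                    /* global events (denominator) */
        unsigned long n_a = 0, n_b = 0;         /* per-type events (numerators) */
        unsigned int period = 0;                /* global period number */
        unsigned int seen_a = 0, seen_b = 0;    /* period each type last reflected */

        /* Period 0: type A is hot -- 80 events versus 20 for B. */
        n_a += 80; seen_a = period;
        n_b += 20; seen_b = period;
        d += 100;

        /* Declare one new period: only the global counter is aged now. */
        d -= d >> 1;                            /* as in fprop_new_period() */
        period++;

        /* Period 1: only B is active; its counter is aged lazily on access. */
        n_b >>= period - seen_b;                /* 20 -> 10 */
        seen_b = period;
        n_b += 50;
        d += 50;

        /* A is aged lazily when its fraction is next asked for. */
        n_a >>= period - seen_a;                /* 80 -> 40 */
        seen_a = period;

        /* Prints "A: 40/100  B: 60/100"; the numerators again sum to the denominator. */
        printf("A: %lu/%lu  B: %lu/%lu\n", n_a, d, n_b, d);
        return 0;
}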
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dd8e2aafb07e..3387aea11209 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -677,7 +677,7 @@ int bdi_init(struct backing_dev_info *bdi)
 
         bdi->min_ratio = 0;
         bdi->max_ratio = 100;
-        bdi->max_prop_frac = PROP_FRAC_BASE;
+        bdi->max_prop_frac = FPROP_FRAC_BASE;
         spin_lock_init(&bdi->wb_lock);
         INIT_LIST_HEAD(&bdi->bdi_list);
         INIT_LIST_HEAD(&bdi->work_list);
@@ -700,7 +700,7 @@ int bdi_init(struct backing_dev_info *bdi)
         bdi->write_bandwidth = INIT_BW;
         bdi->avg_write_bandwidth = INIT_BW;
 
-        err = prop_local_init_percpu(&bdi->completions);
+        err = fprop_local_init_percpu(&bdi->completions);
 
         if (err) {
 err:
@@ -744,7 +744,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
         for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
                 percpu_counter_destroy(&bdi->bdi_stat[i]);
 
-        prop_local_destroy_percpu(&bdi->completions);
+        fprop_local_destroy_percpu(&bdi->completions);
 }
 EXPORT_SYMBOL(bdi_destroy);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 93d8d2f7108c..e5363f34e025 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>  /* __set_page_dirty_buffers */
 #include <linux/pagevec.h>
+#include <linux/timer.h>
 #include <trace/events/writeback.h>
 
 /*
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit;
  * measured in page writeback completions.
  *
  */
-static struct prop_descriptor vm_completions;
+static struct fprop_global writeout_completions;
+
+static void writeout_period(unsigned long t);
+/* Timer for aging of writeout_completions */
+static struct timer_list writeout_period_timer =
+                TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
+static unsigned long writeout_period_time = 0;
+
+/*
+ * Length of period for aging writeout fractions of bdis. This is an
+ * arbitrarily chosen number. The longer the period, the slower fractions will
+ * reflect changes in current writeout rate.
+ */
+#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
 
 /*
  * Work out the current dirty-memory clamping and background writeout
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone)
                 zone_page_state(zone, NR_WRITEBACK) <= limit;
 }
 
-/*
- * couple the period to the dirty_ratio:
- *
- *   period/2 ~ roundup_pow_of_two(dirty limit)
- */
-static int calc_period_shift(void)
-{
-        unsigned long dirty_total;
-
-        if (vm_dirty_bytes)
-                dirty_total = vm_dirty_bytes / PAGE_SIZE;
-        else
-                dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
-                                100;
-        return 2 + ilog2(dirty_total - 1);
-}
-
-/*
- * update the period when the dirty threshold changes.
- */
-static void update_completion_period(void)
-{
-        int shift = calc_period_shift();
-        prop_change_shift(&vm_completions, shift);
-
-        writeback_set_ratelimit();
-}
-
 int dirty_background_ratio_handler(struct ctl_table *table, int write,
                 void __user *buffer, size_t *lenp,
                 loff_t *ppos)
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
 
         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
         if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
-                update_completion_period();
+                writeback_set_ratelimit();
                 vm_dirty_bytes = 0;
         }
         return ret;
@@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
 
         ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
         if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
-                update_completion_period();
+                writeback_set_ratelimit();
                 vm_dirty_ratio = 0;
         }
         return ret;
 }
 
+static unsigned long wp_next_time(unsigned long cur_time)
+{
+        cur_time += VM_COMPLETIONS_PERIOD_LEN;
+        /* 0 has a special meaning... */
+        if (!cur_time)
+                return 1;
+        return cur_time;
+}
+
 /*
  * Increment the BDI's writeout completion count and the global writeout
  * completion count. Called from test_clear_page_writeback().
@@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
 static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
 {
         __inc_bdi_stat(bdi, BDI_WRITTEN);
-        __prop_inc_percpu_max(&vm_completions, &bdi->completions,
-                              bdi->max_prop_frac);
+        __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
+                               bdi->max_prop_frac);
+        /* First event after period switching was turned off? */
+        if (!unlikely(writeout_period_time)) {
+                /*
+                 * We can race with other __bdi_writeout_inc calls here but
+                 * it does not cause any harm since the resulting time when
+                 * timer will fire and what is in writeout_period_time will be
+                 * roughly the same.
+                 */
+                writeout_period_time = wp_next_time(jiffies);
+                mod_timer(&writeout_period_timer, writeout_period_time);
+        }
 }
 
 void bdi_writeout_inc(struct backing_dev_info *bdi)
@@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
 static void bdi_writeout_fraction(struct backing_dev_info *bdi,
                                   long *numerator, long *denominator)
 {
-        prop_fraction_percpu(&vm_completions, &bdi->completions,
-                                numerator, denominator);
+        fprop_fraction_percpu(&writeout_completions, &bdi->completions,
+                                numerator, denominator);
 }
 
 /*
+ * On idle system, we can be called long after we scheduled because we use
+ * deferred timers so count with missed periods.
+ */
+static void writeout_period(unsigned long t)
+{
+        int miss_periods = (jiffies - writeout_period_time) /
+                                 VM_COMPLETIONS_PERIOD_LEN;
+
+        if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
+                writeout_period_time = wp_next_time(writeout_period_time +
+                                miss_periods * VM_COMPLETIONS_PERIOD_LEN);
+                mod_timer(&writeout_period_timer, writeout_period_time);
+        } else {
+                /*
+                 * Aging has zeroed all fractions. Stop wasting CPU on period
+                 * updates.
+                 */
+                writeout_period_time = 0;
+        }
+}
+
+/*
  * bdi_min_ratio keeps the sum of the minimum dirty shares of all
  * registered backing devices, which, for obvious reasons, can not
  * exceed 100%.
@@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
                 ret = -EINVAL;
         } else {
                 bdi->max_ratio = max_ratio;
-                bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
+                bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
         }
         spin_unlock_bh(&bdi_lock);
 
@@ -918,7 +946,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
          * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
          *
          * However to get a more stable dirty_ratelimit, the below elaborated
-         * code makes use of task_ratelimit to filter out sigular points and
+         * code makes use of task_ratelimit to filter out singular points and
          * limit the step size.
          *
          * The below code essentially only uses the relative value of
@@ -941,7 +969,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
          * feel and care are stable dirty rate and small position error.
          *
          * |task_ratelimit - dirty_ratelimit| is used to limit the step size
-         * and filter out the sigular points of balanced_dirty_ratelimit. Which
+         * and filter out the singular points of balanced_dirty_ratelimit. Which
          * keeps jumping around randomly and can even leap far away at times
          * due to the small 200ms estimation period of dirty_rate (we want to
          * keep that period small to reduce time lags).
@@ -1606,13 +1634,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
  */
 void __init page_writeback_init(void)
 {
-        int shift;
-
         writeback_set_ratelimit();
         register_cpu_notifier(&ratelimit_nb);
 
-        shift = calc_period_shift();
-        prop_descriptor_init(&vm_completions, shift);
+        fprop_global_init(&writeout_completions);
 }
 
 /**
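
Because writeout_period_timer above is a deferred timer, writeout_period() may run long after its nominal expiry on an idle system; it therefore folds all missed periods into one fprop_new_period() call and re-anchors the next expiry on the original 3-second grid. A small standalone sketch of that arithmetic follows (plain userspace C, made-up values, not part of the patch):

/* Illustration only: how missed periods are folded in when the deferred timer
 * fires late. Assumes HZ = 1000, so the 3 * HZ period is 3000 jiffies.
 */
#include <stdio.h>

#define PERIOD_LEN 3000UL                       /* stand-in for 3 * HZ at HZ = 1000 */

int main(void)
{
        unsigned long writeout_period_time = 50000;     /* when the timer was due */
        unsigned long jiffies = 60000;                  /* it fired 10 seconds later */
        unsigned long miss_periods = (jiffies - writeout_period_time) / PERIOD_LEN;

        /* The handler declares miss_periods + 1 periods in one go... */
        printf("declare %lu periods\n", miss_periods + 1);      /* prints 4 */
        /* ...and schedules the next expiry back on the original grid. */
        printf("next expiry at %lu\n",
               writeout_period_time + (miss_periods + 1) * PERIOD_LEN);  /* 62000 */
        return 0;
}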
