author     Linus Torvalds <torvalds@linux-foundation.org>  2012-07-31 01:14:04 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-07-31 01:14:04 -0400
commit     2e3ee613480563a6d5c01b57d342e65cc58c06df (patch)
tree       b6b82d1ade41f137bdb9a5a18d8aa446e149c8b2
parent     1fad1e9a747687a7399bf58e87974f9b1bbcae06 (diff)
parent     331cbdeedeb2f4ef01ccb761513708af0fe77098 (diff)
Merge tag 'writeback-proportions' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux
Pull writeback updates from Wu Fengguang:
"Use time based periods to age the writeback proportions, which can
adapt equally well to fast/slow devices."
Fix up trivial conflict in comment in fs/sync.c
* tag 'writeback-proportions' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux:
writeback: Fix some comment errors
block: Convert BDI proportion calculations to flexible proportions
lib: Fix possible deadlock in flexible proportion code
lib: Proportions with flexible period
 fs/fs-writeback.c                |   4
 fs/super.c                       |   2
 include/linux/backing-dev.h      |   4
 include/linux/flex_proportions.h | 101
 lib/Makefile                     |   2
 lib/flex_proportions.c           | 272
 mm/backing-dev.c                 |   6
 mm/page-writeback.c              | 107
 8 files changed, 448 insertions(+), 50 deletions(-)
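The pull message above is terse, so before the diffs, here is a minimal hypothetical sketch of how the new API fits together. The fprop_* calls are the ones declared in include/linux/flex_proportions.h below; the demo_* names and the two event types are invented for illustration, and the real caller in this merge is mm/page-writeback.c, which drives the aging from a deferred timer.

/*
 * Illustrative sketch, not part of this merge: two event sources
 * sharing one aging domain via the flexible-proportions API.
 */
static struct fprop_global demo_prop;
static struct fprop_local_percpu demo_reads, demo_writes;

static int demo_init(void)
{
        int err;

        err = fprop_global_init(&demo_prop);
        if (err)
                return err;
        err = fprop_local_init_percpu(&demo_reads);
        if (err)
                goto out_global;
        err = fprop_local_init_percpu(&demo_writes);
        if (err)
                goto out_reads;
        return 0;
out_reads:
        fprop_local_destroy_percpu(&demo_reads);
out_global:
        fprop_global_destroy(&demo_prop);
        return err;
}

/* On each event, bump the local and global counters together */
static void demo_read_completed(void)
{
        fprop_inc_percpu(&demo_prop, &demo_reads);
}

/*
 * From a timer: halve all history once per elapsed period. Returns
 * false once aging has zeroed everything, so the caller can stop
 * re-arming (exactly what writeout_period() does below).
 */
static bool demo_age(int elapsed_periods)
{
        return fprop_new_period(&demo_prop, elapsed_periods);
}

/* Query the decaying share of reads among all recent events */
static void demo_read_fraction(unsigned long *num, unsigned long *den)
{
        fprop_fraction_percpu(&demo_prop, &demo_reads, num, den);
}

The point to notice is that fprop_new_period() ages only the global counter; each fprop_local_percpu catches up lazily the next time it is incremented or queried, so declaring a period costs O(1) regardless of the number of event types.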
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8f660dd6137a..50d0b78130a1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -628,8 +628,8 @@ static long writeback_sb_inodes(struct super_block *sb,
         }
 
         /*
-         * Don't bother with new inodes or inodes beeing freed, first
-         * kind does not need peridic writeout yet, and for the latter
+         * Don't bother with new inodes or inodes being freed, first
+         * kind does not need periodic writeout yet, and for the latter
          * kind writeout is handled by the freer.
          */
         spin_lock(&inode->i_lock);
diff --git a/fs/super.c b/fs/super.c
index c743fb3be4b8..4c5d82f56ec4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -320,7 +320,7 @@ static int grab_super(struct super_block *s) __releases(sb_lock)
 
 /*
  * grab_super_passive - acquire a passive reference
- * @s: reference we are trying to grab
+ * @sb: reference we are trying to grab
  *
  * Tries to acquire a passive reference. This is used in places where we
  * cannot take an active reference but we need to ensure that the
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b1038bd686ac..489de625cd25 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -10,7 +10,7 @@
 
 #include <linux/percpu_counter.h>
 #include <linux/log2.h>
-#include <linux/proportions.h>
+#include <linux/flex_proportions.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
@@ -89,7 +89,7 @@ struct backing_dev_info {
         unsigned long dirty_ratelimit;
         unsigned long balanced_dirty_ratelimit;
 
-        struct prop_local_percpu completions;
+        struct fprop_local_percpu completions;
         int dirty_exceeded;
 
         unsigned int min_ratio;
diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h
new file mode 100644
index 000000000000..4ebc49fae391
--- /dev/null
+++ b/include/linux/flex_proportions.h
@@ -0,0 +1,101 @@
+/*
+ * Floating proportions with flexible aging period
+ *
+ * Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz>
+ */
+
+#ifndef _LINUX_FLEX_PROPORTIONS_H
+#define _LINUX_FLEX_PROPORTIONS_H
+
+#include <linux/percpu_counter.h>
+#include <linux/spinlock.h>
+#include <linux/seqlock.h>
+
+/*
+ * When maximum proportion of some event type is specified, this is the
+ * precision with which we allow limiting. Note that this creates an upper
+ * bound on the number of events per period like
+ *   ULLONG_MAX >> FPROP_FRAC_SHIFT.
+ */
+#define FPROP_FRAC_SHIFT 10
+#define FPROP_FRAC_BASE (1UL << FPROP_FRAC_SHIFT)
+
+/*
+ * ---- Global proportion definitions ----
+ */
+struct fprop_global {
+        /* Number of events in the current period */
+        struct percpu_counter events;
+        /* Current period */
+        unsigned int period;
+        /* Synchronization with period transitions */
+        seqcount_t sequence;
+};
+
+int fprop_global_init(struct fprop_global *p);
+void fprop_global_destroy(struct fprop_global *p);
+bool fprop_new_period(struct fprop_global *p, int periods);
+
+/*
+ * ---- SINGLE ----
+ */
+struct fprop_local_single {
+        /* the local events counter */
+        unsigned long events;
+        /* Period in which we last updated events */
+        unsigned int period;
+        raw_spinlock_t lock;        /* Protect period and numerator */
+};
+
+#define INIT_FPROP_LOCAL_SINGLE(name)        \
+{        .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock),        \
+}
+
+int fprop_local_init_single(struct fprop_local_single *pl);
+void fprop_local_destroy_single(struct fprop_local_single *pl);
+void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl);
+void fprop_fraction_single(struct fprop_global *p,
+        struct fprop_local_single *pl, unsigned long *numerator,
+        unsigned long *denominator);
+
+static inline
+void fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl)
+{
+        unsigned long flags;
+
+        local_irq_save(flags);
+        __fprop_inc_single(p, pl);
+        local_irq_restore(flags);
+}
+
+/*
+ * ---- PERCPU ----
+ */
+struct fprop_local_percpu {
+        /* the local events counter */
+        struct percpu_counter events;
+        /* Period in which we last updated events */
+        unsigned int period;
+        raw_spinlock_t lock;        /* Protect period and numerator */
+};
+
+int fprop_local_init_percpu(struct fprop_local_percpu *pl);
+void fprop_local_destroy_percpu(struct fprop_local_percpu *pl);
+void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl);
+void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl,
+                            int max_frac);
+void fprop_fraction_percpu(struct fprop_global *p,
+        struct fprop_local_percpu *pl, unsigned long *numerator,
+        unsigned long *denominator);
+
+static inline
+void fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
+{
+        unsigned long flags;
+
+        local_irq_save(flags);
+        __fprop_inc_percpu(p, pl);
+        local_irq_restore(flags);
+}
+
+#endif
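Two usage notes on the header above, both illustrative rather than part of the merge. First, max_frac for __fprop_inc_percpu_max() is a fixed-point fraction with FPROP_FRAC_SHIFT bits: with the shift of 10 above, FPROP_FRAC_BASE is 1024, so a 25% cap is expressed as (FPROP_FRAC_BASE * 25) / 100 = 256, which is exactly the convention mm/page-writeback.c uses for bdi->max_prop_frac later in this merge. Second, the SINGLE variant can be initialised statically; a hypothetical sketch:

/*
 * Hypothetical example, not from this merge: a statically initialised
 * SINGLE counter. fprop_inc_single() is the irq-safe wrapper above.
 * Note demo_global still needs fprop_global_init() at runtime, since
 * its percpu counter cannot be allocated statically.
 */
static struct fprop_global demo_global;
static struct fprop_local_single demo_local =
        INIT_FPROP_LOCAL_SINGLE(demo_local);

static void demo_event(void)
{
        fprop_inc_single(&demo_global, &demo_local);
}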
diff --git a/lib/Makefile b/lib/Makefile
index 9cb4104f47d9..42d283edc4d3 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -11,7 +11,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
          rbtree.o radix-tree.o dump_stack.o timerqueue.o\
          idr.o int_sqrt.o extable.o prio_tree.o \
          sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
-         proportions.o prio_heap.o ratelimit.o show_mem.o \
+         proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
          is_single_threaded.o plist.o decompress.o
 
 lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c
new file mode 100644
index 000000000000..c785554f9523
--- /dev/null
+++ b/lib/flex_proportions.c
@@ -0,0 +1,272 @@
+/*
+ * Floating proportions with flexible aging period
+ *
+ * Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz>
+ *
+ * The goal of this code is: Given different types of event, measure proportion
+ * of each type of event over time. The proportions are measured with
+ * exponentially decaying history to give smooth transitions. A formula
+ * expressing proportion of event of type 'j' is:
+ *
+ *   p_{j} = (\Sum_{i>=0} x_{i,j}/2^{i+1})/(\Sum_{i>=0} x_i/2^{i+1})
+ *
+ * Where x_{i,j} is j's number of events in i-th last time period and x_i is
+ * total number of events in i-th last time period.
+ *
+ * Note that p_{j}'s are normalised, i.e.
+ *
+ *   \Sum_{j} p_{j} = 1,
+ *
+ * This formula can be straightforwardly computed by maintaining denominator
+ * (let's call it 'd') and for each event type its numerator (let's call it
+ * 'n_j'). When an event of type 'j' happens, we simply need to do:
+ *   n_j++; d++;
+ *
+ * When a new period is declared, we could do:
+ *   d /= 2
+ *   for each j
+ *     n_j /= 2
+ *
+ * To avoid iteration over all event types, we instead shift the numerator of
+ * event j lazily when someone asks for a proportion of event j or when event j
+ * occurs. This can be trivially implemented by remembering the last period in
+ * which something happened with proportion of type j.
+ */
+#include <linux/flex_proportions.h>
+
+int fprop_global_init(struct fprop_global *p)
+{
+        int err;
+
+        p->period = 0;
+        /* Use 1 to avoid dealing with periods with 0 events... */
+        err = percpu_counter_init(&p->events, 1);
+        if (err)
+                return err;
+        seqcount_init(&p->sequence);
+        return 0;
+}
+
+void fprop_global_destroy(struct fprop_global *p)
+{
+        percpu_counter_destroy(&p->events);
+}
+
+/*
+ * Declare @periods new periods. It is up to the caller to make sure period
+ * transitions cannot happen in parallel.
+ *
+ * The function returns true if the proportions are still defined and false
+ * if aging zeroed out all events. This can be used to detect whether declaring
+ * further periods has any effect.
+ */
+bool fprop_new_period(struct fprop_global *p, int periods)
+{
+        u64 events;
+        unsigned long flags;
+
+        local_irq_save(flags);
+        events = percpu_counter_sum(&p->events);
+        /*
+         * Don't do anything if there are no events.
+         */
+        if (events <= 1) {
+                local_irq_restore(flags);
+                return false;
+        }
+        write_seqcount_begin(&p->sequence);
+        if (periods < 64)
+                events -= events >> periods;
+        /* Use addition to avoid losing events happening between sum and set */
+        percpu_counter_add(&p->events, -events);
+        p->period += periods;
+        write_seqcount_end(&p->sequence);
+        local_irq_restore(flags);
+
+        return true;
+}
+
+/*
+ * ---- SINGLE ----
+ */
+
+int fprop_local_init_single(struct fprop_local_single *pl)
+{
+        pl->events = 0;
+        pl->period = 0;
+        raw_spin_lock_init(&pl->lock);
+        return 0;
+}
+
+void fprop_local_destroy_single(struct fprop_local_single *pl)
+{
+}
+
+static void fprop_reflect_period_single(struct fprop_global *p,
+                                        struct fprop_local_single *pl)
+{
+        unsigned int period = p->period;
+        unsigned long flags;
+
+        /* Fast path - period didn't change */
+        if (pl->period == period)
+                return;
+        raw_spin_lock_irqsave(&pl->lock, flags);
+        /* Someone updated pl->period while we were spinning? */
+        if (pl->period >= period) {
+                raw_spin_unlock_irqrestore(&pl->lock, flags);
+                return;
+        }
+        /* Aging zeroed our fraction? */
+        if (period - pl->period < BITS_PER_LONG)
+                pl->events >>= period - pl->period;
+        else
+                pl->events = 0;
+        pl->period = period;
+        raw_spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/* Event of type pl happened */
+void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl)
+{
+        fprop_reflect_period_single(p, pl);
+        pl->events++;
+        percpu_counter_add(&p->events, 1);
+}
+
+/* Return fraction of events of type pl */
+void fprop_fraction_single(struct fprop_global *p,
+                           struct fprop_local_single *pl,
+                           unsigned long *numerator, unsigned long *denominator)
+{
+        unsigned int seq;
+        s64 num, den;
+
+        do {
+                seq = read_seqcount_begin(&p->sequence);
+                fprop_reflect_period_single(p, pl);
+                num = pl->events;
+                den = percpu_counter_read_positive(&p->events);
+        } while (read_seqcount_retry(&p->sequence, seq));
+
+        /*
+         * Make fraction <= 1 and denominator > 0 even in presence of percpu
+         * counter errors
+         */
+        if (den <= num) {
+                if (num)
+                        den = num;
+                else
+                        den = 1;
+        }
+        *denominator = den;
+        *numerator = num;
+}
+
+/*
+ * ---- PERCPU ----
+ */
+#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
+int fprop_local_init_percpu(struct fprop_local_percpu *pl)
+{
+        int err;
+
+        err = percpu_counter_init(&pl->events, 0);
+        if (err)
+                return err;
+        pl->period = 0;
+        raw_spin_lock_init(&pl->lock);
+        return 0;
+}
+
+void fprop_local_destroy_percpu(struct fprop_local_percpu *pl)
+{
+        percpu_counter_destroy(&pl->events);
+}
+
+static void fprop_reflect_period_percpu(struct fprop_global *p,
+                                        struct fprop_local_percpu *pl)
+{
+        unsigned int period = p->period;
+        unsigned long flags;
+
+        /* Fast path - period didn't change */
+        if (pl->period == period)
+                return;
+        raw_spin_lock_irqsave(&pl->lock, flags);
+        /* Someone updated pl->period while we were spinning? */
+        if (pl->period >= period) {
+                raw_spin_unlock_irqrestore(&pl->lock, flags);
+                return;
+        }
+        /* Aging zeroed our fraction? */
+        if (period - pl->period < BITS_PER_LONG) {
+                s64 val = percpu_counter_read(&pl->events);
+
+                if (val < (nr_cpu_ids * PROP_BATCH))
+                        val = percpu_counter_sum(&pl->events);
+
+                __percpu_counter_add(&pl->events,
+                        -val + (val >> (period-pl->period)), PROP_BATCH);
+        } else
+                percpu_counter_set(&pl->events, 0);
+        pl->period = period;
+        raw_spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/* Event of type pl happened */
+void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
+{
+        fprop_reflect_period_percpu(p, pl);
+        __percpu_counter_add(&pl->events, 1, PROP_BATCH);
+        percpu_counter_add(&p->events, 1);
+}
+
+void fprop_fraction_percpu(struct fprop_global *p,
+                           struct fprop_local_percpu *pl,
+                           unsigned long *numerator, unsigned long *denominator)
+{
+        unsigned int seq;
+        s64 num, den;
+
+        do {
+                seq = read_seqcount_begin(&p->sequence);
+                fprop_reflect_period_percpu(p, pl);
+                num = percpu_counter_read_positive(&pl->events);
+                den = percpu_counter_read_positive(&p->events);
+        } while (read_seqcount_retry(&p->sequence, seq));
+
+        /*
+         * Make fraction <= 1 and denominator > 0 even in presence of percpu
+         * counter errors
+         */
+        if (den <= num) {
+                if (num)
+                        den = num;
+                else
+                        den = 1;
+        }
+        *denominator = den;
+        *numerator = num;
+}
+
+/*
+ * Like __fprop_inc_percpu() except that event is counted only if the given
+ * type has fraction smaller than @max_frac/FPROP_FRAC_BASE
+ */
+void __fprop_inc_percpu_max(struct fprop_global *p,
+                            struct fprop_local_percpu *pl, int max_frac)
+{
+        if (unlikely(max_frac < FPROP_FRAC_BASE)) {
+                unsigned long numerator, denominator;
+
+                fprop_fraction_percpu(p, pl, &numerator, &denominator);
+                if (numerator >
+                    (((u64)denominator) * max_frac) >> FPROP_FRAC_SHIFT)
+                        return;
+        } else
+                fprop_reflect_period_percpu(p, pl);
+        __percpu_counter_add(&pl->events, 1, PROP_BATCH);
+        percpu_counter_add(&p->events, 1);
+}
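A worked example of the lazy aging implemented above (the numbers are invented, and the one-event seed the global counter is initialised with is ignored for simplicity): suppose at the end of a period the global counter holds 400 events, 300 of them of type j. fprop_new_period(p, 1) does events -= events >> 1, leaving 200, and bumps p->period; nothing touches j. The next time j is incremented or queried, fprop_reflect_period_single() shifts its 300 down by the one missed period to 150, so the reported proportion stays 150/200 = 3/4 while the absolute history halves. A standalone model in plain userspace C, assuming nothing from the kernel:

#include <stdio.h>

int main(void)
{
        unsigned long den = 400, n_j = 300;     /* events at end of period 0 */
        unsigned int period = 0, pl_period = 0;

        /* fprop_new_period(p, 1): age the global counter eagerly */
        den -= den >> 1;                        /* 400 -> 200 */
        period += 1;

        /* fprop_reflect_period_single(): the local counter catches up
         * on its next access by shifting over the missed periods */
        n_j >>= period - pl_period;             /* 300 -> 150 */
        pl_period = period;

        printf("p_j = %lu/%lu\n", n_j, den);    /* 150/200, still 3/4 */
        return 0;
}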
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dd8e2aafb07e..3387aea11209 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -677,7 +677,7 @@ int bdi_init(struct backing_dev_info *bdi)
 
         bdi->min_ratio = 0;
         bdi->max_ratio = 100;
-        bdi->max_prop_frac = PROP_FRAC_BASE;
+        bdi->max_prop_frac = FPROP_FRAC_BASE;
         spin_lock_init(&bdi->wb_lock);
         INIT_LIST_HEAD(&bdi->bdi_list);
         INIT_LIST_HEAD(&bdi->work_list);
@@ -700,7 +700,7 @@ int bdi_init(struct backing_dev_info *bdi)
         bdi->write_bandwidth = INIT_BW;
         bdi->avg_write_bandwidth = INIT_BW;
 
-        err = prop_local_init_percpu(&bdi->completions);
+        err = fprop_local_init_percpu(&bdi->completions);
 
         if (err) {
 err:
@@ -744,7 +744,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
         for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
                 percpu_counter_destroy(&bdi->bdi_stat[i]);
 
-        prop_local_destroy_percpu(&bdi->completions);
+        fprop_local_destroy_percpu(&bdi->completions);
 }
 EXPORT_SYMBOL(bdi_destroy);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 93d8d2f7108c..e5363f34e025 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h> /* __set_page_dirty_buffers */
 #include <linux/pagevec.h>
+#include <linux/timer.h>
 #include <trace/events/writeback.h>
 
 /*
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit;
  * measured in page writeback completions.
  *
  */
-static struct prop_descriptor vm_completions;
+static struct fprop_global writeout_completions;
+
+static void writeout_period(unsigned long t);
+/* Timer for aging of writeout_completions */
+static struct timer_list writeout_period_timer =
+                TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
+static unsigned long writeout_period_time = 0;
+
+/*
+ * Length of period for aging writeout fractions of bdis. This is an
+ * arbitrarily chosen number. The longer the period, the slower fractions will
+ * reflect changes in current writeout rate.
+ */
+#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
 
 /*
  * Work out the current dirty-memory clamping and background writeout
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone)
                zone_page_state(zone, NR_WRITEBACK) <= limit;
 }
 
-/*
- * couple the period to the dirty_ratio:
- *
- *   period/2 ~ roundup_pow_of_two(dirty limit)
- */
-static int calc_period_shift(void)
-{
-        unsigned long dirty_total;
-
-        if (vm_dirty_bytes)
-                dirty_total = vm_dirty_bytes / PAGE_SIZE;
-        else
-                dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
-                                100;
-        return 2 + ilog2(dirty_total - 1);
-}
-
-/*
- * update the period when the dirty threshold changes.
- */
-static void update_completion_period(void)
-{
-        int shift = calc_period_shift();
-        prop_change_shift(&vm_completions, shift);
-
-        writeback_set_ratelimit();
-}
-
 int dirty_background_ratio_handler(struct ctl_table *table, int write,
                 void __user *buffer, size_t *lenp,
                 loff_t *ppos)
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
 
         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
         if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
-                update_completion_period();
+                writeback_set_ratelimit();
                 vm_dirty_bytes = 0;
         }
         return ret;
@@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
 
         ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
         if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
-                update_completion_period();
+                writeback_set_ratelimit();
                 vm_dirty_ratio = 0;
         }
         return ret;
 }
 
+static unsigned long wp_next_time(unsigned long cur_time)
+{
+        cur_time += VM_COMPLETIONS_PERIOD_LEN;
+        /* 0 has a special meaning... */
+        if (!cur_time)
+                return 1;
+        return cur_time;
+}
+
 /*
  * Increment the BDI's writeout completion count and the global writeout
  * completion count. Called from test_clear_page_writeback().
@@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
 static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
 {
         __inc_bdi_stat(bdi, BDI_WRITTEN);
-        __prop_inc_percpu_max(&vm_completions, &bdi->completions,
-                              bdi->max_prop_frac);
+        __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
+                               bdi->max_prop_frac);
+        /* First event after period switching was turned off? */
+        if (!unlikely(writeout_period_time)) {
+                /*
+                 * We can race with other __bdi_writeout_inc calls here but
+                 * it does not cause any harm since the resulting time when
+                 * timer will fire and what is in writeout_period_time will be
+                 * roughly the same.
+                 */
+                writeout_period_time = wp_next_time(jiffies);
+                mod_timer(&writeout_period_timer, writeout_period_time);
+        }
 }
 
 void bdi_writeout_inc(struct backing_dev_info *bdi)
@@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
 static void bdi_writeout_fraction(struct backing_dev_info *bdi,
                                   long *numerator, long *denominator)
 {
-        prop_fraction_percpu(&vm_completions, &bdi->completions,
+        fprop_fraction_percpu(&writeout_completions, &bdi->completions,
                                 numerator, denominator);
 }
 
 /*
+ * On idle system, we can be called long after we scheduled because we use
+ * deferred timers so count with missed periods.
+ */
+static void writeout_period(unsigned long t)
+{
+        int miss_periods = (jiffies - writeout_period_time) /
+                                 VM_COMPLETIONS_PERIOD_LEN;
+
+        if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
+                writeout_period_time = wp_next_time(writeout_period_time +
+                                miss_periods * VM_COMPLETIONS_PERIOD_LEN);
+                mod_timer(&writeout_period_timer, writeout_period_time);
+        } else {
+                /*
+                 * Aging has zeroed all fractions. Stop wasting CPU on period
+                 * updates.
+                 */
+                writeout_period_time = 0;
+        }
+}
+
+/*
  * bdi_min_ratio keeps the sum of the minimum dirty shares of all
  * registered backing devices, which, for obvious reasons, can not
  * exceed 100%.
@@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
                 ret = -EINVAL;
         } else {
                 bdi->max_ratio = max_ratio;
-                bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
+                bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
         }
         spin_unlock_bh(&bdi_lock);
 
@@ -918,7 +946,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
          *      bdi->dirty_ratelimit = balanced_dirty_ratelimit;
          *
          * However to get a more stable dirty_ratelimit, the below elaborated
-         * code makes use of task_ratelimit to filter out sigular points and
+         * code makes use of task_ratelimit to filter out singular points and
          * limit the step size.
          *
          * The below code essentially only uses the relative value of
@@ -941,7 +969,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
          * feel and care are stable dirty rate and small position error.
          *
          * |task_ratelimit - dirty_ratelimit| is used to limit the step size
-         * and filter out the sigular points of balanced_dirty_ratelimit. Which
+         * and filter out the singular points of balanced_dirty_ratelimit. Which
          * keeps jumping around randomly and can even leap far away at times
          * due to the small 200ms estimation period of dirty_rate (we want to
          * keep that period small to reduce time lags).
@@ -1606,13 +1634,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
  */
 void __init page_writeback_init(void)
 {
-        int shift;
-
         writeback_set_ratelimit();
         register_cpu_notifier(&ratelimit_nb);
 
-        shift = calc_period_shift();
-        prop_descriptor_init(&vm_completions, shift);
+        fprop_global_init(&writeout_completions);
 }
 
 /**
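One detail of the page-writeback wiring worth spelling out: because writeout_period_timer is a deferred timer, on an idle system the handler can run long after the period boundary, so writeout_period() folds all missed periods into a single fprop_new_period() call and re-arms relative to when it should have fired, not when it actually ran. A small standalone model of that arithmetic (the jiffies values are invented, HZ is assumed to be 1000):

#include <stdio.h>

#define HZ 1000                                 /* assumed tick rate */
#define VM_COMPLETIONS_PERIOD_LEN (3 * HZ)

int main(void)
{
        unsigned long scheduled = 5000;         /* writeout_period_time */
        /* the handler runs 7 full periods (plus a bit) late */
        unsigned long now = scheduled + 7 * VM_COMPLETIONS_PERIOD_LEN + 100;
        int miss_periods = (now - scheduled) / VM_COMPLETIONS_PERIOD_LEN;

        /* writeout_period() ages all elapsed periods in one call... */
        printf("fprop_new_period(..., %d)\n", miss_periods + 1);   /* 8 */
        /* ...and re-arms at the next boundary after the missed ones */
        printf("next fire at %lu\n",
               scheduled + (miss_periods + 1UL) * VM_COMPLETIONS_PERIOD_LEN);
        return 0;
}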