author		Vikas Shivappa <vikas.shivappa@linux.intel.com>	2017-08-15 21:00:43 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2017-08-16 06:05:41 -0400
commit		24247aeeabe99eab13b798ccccc2dec066dd6f07 (patch)
tree		5ae1fddc1d0ae510c0060783092a6a618add6103
parent		bbc4615e0b7df5e21d0991adb4b2798508354924 (diff)
x86/intel_rdt/cqm: Improve limbo list processing
During a mkdir, the entire limbo list is synchronously checked on each
package for free RMIDs by sending IPIs. With a large number of RMIDs
(SKL has 192) this creates an intolerable amount of work in IPIs.

Replace the IPI based checking of the limbo list with asynchronous
worker threads on each package which periodically scan the limbo list
and move the RMIDs that have:

	llc_occupancy < threshold_occupancy

on all packages to the free list.

mkdir now returns -ENOSPC if both the free list and the limbo list are
empty, or -EBUSY if there are RMIDs on the limbo list and the free list
is empty.

Getting rid of the IPIs also simplifies the data structures and the
serialization required for handling the lists.

[ tglx: Rewrote changelog ... ]

Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: ravi.v.shankar@intel.com
Cc: tony.luck@intel.com
Cc: fenghua.yu@intel.com
Cc: peterz@infradead.org
Cc: eranian@google.com
Cc: vikas.shivappa@intel.com
Cc: ak@linux.intel.com
Cc: davidcc@google.com
Link: http://lkml.kernel.org/r/1502845243-20454-3-git-send-email-vikas.shivappa@linux.intel.com
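For reference, the new allocation semantics described above reduce to the
following sketch of alloc_rmid(), simplified from the intel_rdt_monitor.c
hunk below (the caller holds rdtgroup_mutex):

	int alloc_rmid(void)
	{
		struct rmid_entry *entry;

		lockdep_assert_held(&rdtgroup_mutex);

		/*
		 * No free RMID: report -EBUSY if the limbo worker may still
		 * reclaim some, -ENOSPC if the limbo list is empty as well.
		 */
		if (list_empty(&rmid_free_lru))
			return rmid_limbo_count ? -EBUSY : -ENOSPC;

		entry = list_first_entry(&rmid_free_lru, struct rmid_entry, list);
		list_del(&entry->list);

		return entry->rmid;
	}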
-rw-r--r--	arch/x86/kernel/cpu/intel_rdt.c		|  31
-rw-r--r--	arch/x86/kernel/cpu/intel_rdt.h		|  14
-rw-r--r--	arch/x86/kernel/cpu/intel_rdt_monitor.c	| 210
3 files changed, 133 insertions(+), 122 deletions(-)
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index b8dc141896b6..6935c8ecad7f 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -426,6 +426,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
 				   GFP_KERNEL);
 		if (!d->rmid_busy_llc)
 			return -ENOMEM;
+		INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
 	}
 	if (is_mbm_total_enabled()) {
 		tsize = sizeof(*d->mbm_total);
@@ -536,11 +537,33 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
 		list_del(&d->list);
 		if (is_mbm_enabled())
 			cancel_delayed_work(&d->mbm_over);
+		if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) {
+			/*
+			 * When a package is going down, forcefully
+			 * decrement rmid->ebusy. There is no way to know
+			 * that the L3 was flushed and hence may lead to
+			 * incorrect counts in rare scenarios, but leaving
+			 * the RMID as busy creates RMID leaks if the
+			 * package never comes back.
+			 */
+			__check_limbo(d, true);
+			cancel_delayed_work(&d->cqm_limbo);
+		}
+
 		kfree(d);
-	} else if (r == &rdt_resources_all[RDT_RESOURCE_L3] &&
-		   cpu == d->mbm_work_cpu && is_mbm_enabled()) {
-		cancel_delayed_work(&d->mbm_over);
-		mbm_setup_overflow_handler(d, 0);
+		return;
+	}
+
+	if (r == &rdt_resources_all[RDT_RESOURCE_L3]) {
+		if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
+			cancel_delayed_work(&d->mbm_over);
+			mbm_setup_overflow_handler(d, 0);
+		}
+		if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
+		    has_busy_rmid(r, d)) {
+			cancel_delayed_work(&d->cqm_limbo);
+			cqm_setup_limbo_handler(d, 0);
+		}
 	}
 }
 
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 3e4869390603..ebaddaeef023 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -20,6 +20,8 @@
 #define QOS_L3_MBM_TOTAL_EVENT_ID	0x02
 #define QOS_L3_MBM_LOCAL_EVENT_ID	0x03
 
+#define CQM_LIMBOCHECK_INTERVAL	1000
+
 #define MBM_CNTR_WIDTH			24
 #define MBM_OVERFLOW_INTERVAL		1000
 
@@ -187,8 +189,11 @@ struct mbm_state {
  * @mbm_total:	saved state for MBM total bandwidth
  * @mbm_local:	saved state for MBM local bandwidth
  * @mbm_over:	worker to periodically read MBM h/w counters
+ * @cqm_limbo:	worker to periodically read CQM h/w counters
  * @mbm_work_cpu:
  *		worker cpu for MBM h/w counters
+ * @cqm_work_cpu:
+ *		worker cpu for CQM h/w counters
  * @ctrl_val:	array of cache or mem ctrl values (indexed by CLOSID)
  * @new_ctrl:	new ctrl value to be loaded
  * @have_new_ctrl: did user provide new_ctrl for this domain
@@ -201,7 +206,9 @@ struct rdt_domain {
 	struct mbm_state	*mbm_total;
 	struct mbm_state	*mbm_local;
 	struct delayed_work	mbm_over;
+	struct delayed_work	cqm_limbo;
 	int			mbm_work_cpu;
+	int			cqm_work_cpu;
 	u32			*ctrl_val;
 	u32			new_ctrl;
 	bool			have_new_ctrl;
@@ -422,7 +429,12 @@ void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
 				    struct rdt_domain *d);
 void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
 		    struct rdtgroup *rdtgrp, int evtid, int first);
-void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms);
+void mbm_setup_overflow_handler(struct rdt_domain *dom,
+				unsigned long delay_ms);
 void mbm_handle_overflow(struct work_struct *work);
+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
+void cqm_handle_limbo(struct work_struct *work);
+bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
+void __check_limbo(struct rdt_domain *d, bool force_free);
 
 #endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index 8378785883dc..30827510094b 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -33,7 +33,7 @@
 
 struct rmid_entry {
 	u32			rmid;
-	atomic_t		busy;
+	int			busy;
 	struct list_head	list;
 };
 
@@ -45,13 +45,13 @@ struct rmid_entry {
 static LIST_HEAD(rmid_free_lru);
 
 /**
- * @rmid_limbo_lru       list of currently unused but (potentially)
+ * @rmid_limbo_count     count of currently unused but (potentially)
  *     dirty RMIDs.
- *     This list contains RMIDs that no one is currently using but that
+ *     This counts RMIDs that no one is currently using but that
  *     may have a occupancy value > intel_cqm_threshold. User can change
  *     the threshold occupancy value.
  */
-static LIST_HEAD(rmid_limbo_lru);
+unsigned int rmid_limbo_count;
 
 /**
  * @rmid_entry - The entry in the limbo and free lists.
@@ -103,124 +103,53 @@ static u64 __rmid_read(u32 rmid, u32 eventid)
 	return val;
 }
 
-/*
- * Walk the limbo list looking at any RMIDs that are flagged in the
- * domain rmid_busy_llc bitmap as busy. If the reported LLC occupancy
- * is below the threshold clear the busy bit and decrement the count.
- * If the busy count gets to zero on an RMID we stop looking.
- * This can be called from an IPI.
- * We need an atomic for the busy count because multiple CPUs may check
- * the same RMID at the same time.
- */
-static bool __check_limbo(struct rdt_domain *d)
+static bool rmid_dirty(struct rmid_entry *entry)
 {
-	struct rmid_entry *entry;
-	u64 val;
-
-	list_for_each_entry(entry, &rmid_limbo_lru, list) {
-		if (!test_bit(entry->rmid, d->rmid_busy_llc))
-			continue;
-		val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
-		if (val <= intel_cqm_threshold) {
-			clear_bit(entry->rmid, d->rmid_busy_llc);
-			if (atomic_dec_and_test(&entry->busy))
-				return true;
-		}
-	}
-	return false;
-}
-
-static void check_limbo(void *arg)
-{
-	struct rdt_domain *d;
-
-	d = get_domain_from_cpu(smp_processor_id(),
-				&rdt_resources_all[RDT_RESOURCE_L3]);
-
-	if (d)
-		__check_limbo(d);
-}
+	u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
 
-static bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d)
-{
-	return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid;
+	return val >= intel_cqm_threshold;
 }
 
 /*
- * Scan the limbo list and move all entries that are below the
- * intel_cqm_threshold to the free list.
- * Return "true" if the limbo list is empty, "false" if there are
- * still some RMIDs there.
+ * Check the RMIDs that are marked as busy for this domain. If the
+ * reported LLC occupancy is below the threshold clear the busy bit and
+ * decrement the count. If the busy count gets to zero on an RMID, we
+ * free the RMID
  */
-static bool try_freeing_limbo_rmid(void)
+void __check_limbo(struct rdt_domain *d, bool force_free)
 {
-	struct rmid_entry *entry, *tmp;
+	struct rmid_entry *entry;
 	struct rdt_resource *r;
-	cpumask_var_t cpu_mask;
-	struct rdt_domain *d;
-	bool ret = true;
-	int cpu;
-
-	if (list_empty(&rmid_limbo_lru))
-		return ret;
+	u32 crmid = 1, nrmid;
 
 	r = &rdt_resources_all[RDT_RESOURCE_L3];
 
-	cpu = get_cpu();
-
 	/*
-	 * First see if we can free up an RMID by checking busy values
-	 * on the local package.
+	 * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
+	 * are marked as busy for occupancy < threshold. If the occupancy
+	 * is less than the threshold decrement the busy counter of the
+	 * RMID and move it to the free list when the counter reaches 0.
 	 */
-	d = get_domain_from_cpu(cpu, r);
-	if (d && has_busy_rmid(r, d) && __check_limbo(d)) {
-		list_for_each_entry_safe(entry, tmp, &rmid_limbo_lru, list) {
-			if (atomic_read(&entry->busy) == 0) {
-				list_del(&entry->list);
+	for (;;) {
+		nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid);
+		if (nrmid >= r->num_rmid)
+			break;
+
+		entry = __rmid_entry(nrmid);
+		if (force_free || !rmid_dirty(entry)) {
+			clear_bit(entry->rmid, d->rmid_busy_llc);
+			if (!--entry->busy) {
+				rmid_limbo_count--;
 				list_add_tail(&entry->list, &rmid_free_lru);
-				goto done;
 			}
 		}
+		crmid = nrmid + 1;
 	}
+}
 
-	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) {
-		ret = false;
-		goto done;
-	}
-
-	/*
-	 * Build a mask of other domains that have busy RMIDs
-	 */
-	list_for_each_entry(d, &r->domains, list) {
-		if (!cpumask_test_cpu(cpu, &d->cpu_mask) &&
-		    has_busy_rmid(r, d))
-			cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
-	}
-	if (cpumask_empty(cpu_mask)) {
-		ret = false;
-		goto free_mask;
-	}
-
-	/*
-	 * Scan domains with busy RMIDs to check if they still are busy
-	 */
-	on_each_cpu_mask(cpu_mask, check_limbo, NULL, true);
-
-	/* Walk limbo list moving all free RMIDs to the &rmid_free_lru list */
-	list_for_each_entry_safe(entry, tmp, &rmid_limbo_lru, list) {
-		if (atomic_read(&entry->busy) != 0) {
-			ret = false;
-			continue;
-		}
-		list_del(&entry->list);
-		list_add_tail(&entry->list, &rmid_free_lru);
-	}
-
-free_mask:
-	free_cpumask_var(cpu_mask);
-done:
-	put_cpu();
-	return ret;
+bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d)
+{
+	return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid;
 }
 
 /*
@@ -231,15 +160,11 @@ done:
 int alloc_rmid(void)
 {
 	struct rmid_entry *entry;
-	bool ret;
 
 	lockdep_assert_held(&rdtgroup_mutex);
 
-	if (list_empty(&rmid_free_lru)) {
-		ret = try_freeing_limbo_rmid();
-		if (list_empty(&rmid_free_lru))
-			return ret ? -ENOSPC : -EBUSY;
-	}
+	if (list_empty(&rmid_free_lru))
+		return rmid_limbo_count ? -EBUSY : -ENOSPC;
 
 	entry = list_first_entry(&rmid_free_lru,
 				 struct rmid_entry, list);
@@ -252,11 +177,12 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
 {
 	struct rdt_resource *r;
 	struct rdt_domain *d;
-	int cpu, nbusy = 0;
+	int cpu;
 	u64 val;
 
 	r = &rdt_resources_all[RDT_RESOURCE_L3];
 
+	entry->busy = 0;
 	cpu = get_cpu();
 	list_for_each_entry(d, &r->domains, list) {
 		if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
@@ -264,17 +190,22 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
 			if (val <= intel_cqm_threshold)
 				continue;
 		}
+
+		/*
+		 * For the first limbo RMID in the domain,
+		 * setup up the limbo worker.
+		 */
+		if (!has_busy_rmid(r, d))
+			cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL);
 		set_bit(entry->rmid, d->rmid_busy_llc);
-		nbusy++;
+		entry->busy++;
 	}
 	put_cpu();
 
-	if (nbusy) {
-		atomic_set(&entry->busy, nbusy);
-		list_add_tail(&entry->list, &rmid_limbo_lru);
-	} else {
+	if (entry->busy)
+		rmid_limbo_count++;
+	else
 		list_add_tail(&entry->list, &rmid_free_lru);
-	}
 }
 
 void free_rmid(u32 rmid)
@@ -387,6 +318,50 @@ static void mbm_update(struct rdt_domain *d, int rmid)
 	}
 }
 
+/*
+ * Handler to scan the limbo list and move the RMIDs
+ * to free list whose occupancy < threshold_occupancy.
+ */
+void cqm_handle_limbo(struct work_struct *work)
+{
+	unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
+	int cpu = smp_processor_id();
+	struct rdt_resource *r;
+	struct rdt_domain *d;
+
+	mutex_lock(&rdtgroup_mutex);
+
+	r = &rdt_resources_all[RDT_RESOURCE_L3];
+	d = get_domain_from_cpu(cpu, r);
+
+	if (!d) {
+		pr_warn_once("Failure to get domain for limbo worker\n");
+		goto out_unlock;
+	}
+
+	__check_limbo(d, false);
+
+	if (has_busy_rmid(r, d))
+		schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);
+
+out_unlock:
+	mutex_unlock(&rdtgroup_mutex);
+}
+
+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
+{
+	unsigned long delay = msecs_to_jiffies(delay_ms);
+	struct rdt_resource *r;
+	int cpu;
+
+	r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+	cpu = cpumask_any(&dom->cpu_mask);
+	dom->cqm_work_cpu = cpu;
+
+	schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
+}
+
 void mbm_handle_overflow(struct work_struct *work)
 {
 	unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
@@ -413,6 +388,7 @@ void mbm_handle_overflow(struct work_struct *work)
 	}
 
 	schedule_delayed_work_on(cpu, &d->mbm_over, delay);
+
 out_unlock:
 	mutex_unlock(&rdtgroup_mutex);
 }