author		Paul Turner <pjt@google.com>		2011-07-21 12:43:30 -0400
committer	Ingo Molnar <mingo@elte.hu>		2011-08-14 06:03:26 -0400
commit		ec12cb7f31e28854efae7dd6f9544e0a66379040
tree		30a7293a4f9d566043f524bb4c43d4ae8b0560db
parent		a790de99599a29ad3f18667530cf4b9f4b7e3234
sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth
Account bandwidth usage at the cfs_rq level rather than against the task_group
to which each cfs_rq belongs. Whether we are tracking bandwidth on a given
cfs_rq is maintained under cfs_rq->runtime_enabled.

cfs_rqs which belong to a bandwidth-constrained task_group have their runtime
accounted via the update_curr() path, which withdraws bandwidth from the global
pool as needed. Updates involving the global pool are currently protected by
cfs_bandwidth->lock; local runtime is protected by rq->lock.

This patch only assigns and tracks quota; no action is taken in the case that
cfs_rq->runtime_used exceeds cfs_rq->runtime_assigned.
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.179386821@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--	include/linux/sched.h	 4
-rw-r--r--	kernel/sched.c		 4
-rw-r--r--	kernel/sched_fair.c	79
-rw-r--r--	kernel/sysctl.c		10

4 files changed, 94 insertions(+), 3 deletions(-)
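For orientation, the two-level scheme the changelog describes (a global per-task_group pool drained in slice-sized chunks into per-cfs_rq local pools) reduces to the stand-alone, user-space sketch below. This is illustrative only and not part of the patch: locking, RUNTIME_INF, per-cpu runqueues and any throttling action are left out, and the names only loosely mirror the kernel ones.

/*
 * Toy model of the patch's accounting: a global pool (quota, runtime) per
 * task_group and one local pool (runtime_remaining) per cfs_rq, topped up
 * in slice-sized chunks.  Illustrative only.
 */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC	1000ULL

static uint64_t sysctl_slice_us = 5000;		/* 5 ms, the patch default */

struct global_pool {				/* stands in for cfs_bandwidth */
	uint64_t quota;				/* allowance per period (ns) */
	uint64_t runtime;			/* left this period (ns) */
};

struct local_pool {				/* stands in for one cfs_rq */
	int64_t runtime_remaining;		/* locally cached runtime (ns) */
};

static uint64_t slice_ns(void)
{
	return sysctl_slice_us * NSEC_PER_USEC;
}

/* analogue of assign_cfs_rq_runtime(): pull at most one slice's worth */
static void assign_runtime(struct global_pool *g, struct local_pool *l)
{
	/* positive request, since runtime_remaining <= 0 at this point */
	uint64_t want = slice_ns() - l->runtime_remaining;
	uint64_t grant = want < g->runtime ? want : g->runtime;

	g->runtime -= grant;
	l->runtime_remaining += grant;
}

/* analogue of __account_cfs_rq_runtime(): charge execution, refill on empty */
static void account_runtime(struct global_pool *g, struct local_pool *l,
			    uint64_t delta_exec)
{
	l->runtime_remaining -= (int64_t)delta_exec;
	if (l->runtime_remaining > 0)
		return;
	assign_runtime(g, l);
}

int main(void)
{
	struct global_pool g = { .quota = 20000000, .runtime = 20000000 };
	struct local_pool l = { .runtime_remaining = 0 };
	int i;

	account_runtime(&g, &l, 0);			/* initial top-up */
	for (i = 0; i < 10; i++)
		account_runtime(&g, &l, 3000000);	/* run 3 ms at a time */

	printf("global runtime left: %llu ns, local: %lld ns\n",
	       (unsigned long long)g.runtime, (long long)l.runtime_remaining);
	return 0;
}

Running it shows the local pool going negative once the 20 ms global pool is exhausted, which is the state this patch tracks but does not yet act on.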
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4ac2c0578e0f..bc6f5f2e24fa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2040,6 +2040,10 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { }
 static inline void sched_autogroup_exit(struct signal_struct *sig) { }
 #endif
 
+#ifdef CONFIG_CFS_BANDWIDTH
+extern unsigned int sysctl_sched_cfs_bandwidth_slice;
+#endif
+
 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(struct task_struct *p);
 extern void rt_mutex_setprio(struct task_struct *p, int prio);
diff --git a/kernel/sched.c b/kernel/sched.c
index ea6850d93b2a..35561c63a490 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -251,7 +251,7 @@ struct cfs_bandwidth {
 #ifdef CONFIG_CFS_BANDWIDTH
 	raw_spinlock_t lock;
 	ktime_t period;
-	u64 quota;
+	u64 quota, runtime;
 	s64 hierarchal_quota;
 #endif
 };
@@ -407,6 +407,7 @@ static inline u64 default_cfs_period(void);
 static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
 	raw_spin_lock_init(&cfs_b->lock);
+	cfs_b->runtime = 0;
 	cfs_b->quota = RUNTIME_INF;
 	cfs_b->period = ns_to_ktime(default_cfs_period());
 }
@@ -9107,6 +9108,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	raw_spin_lock_irq(&cfs_b->lock);
 	cfs_b->period = ns_to_ktime(period);
 	cfs_b->quota = quota;
+	cfs_b->runtime = quota;
 	raw_spin_unlock_irq(&cfs_b->lock);
 
 	for_each_possible_cpu(i) {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f24f4171019d..9502aa899f73 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  */
 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
+ * each time a cfs_rq requests quota.
+ *
+ * Note: in the case that the slice exceeds the runtime remaining (either due
+ * to consumption or the quota being specified to be smaller than the slice)
+ * we will always only issue the remaining available time.
+ *
+ * default: 5 msec, units: microseconds
+ */
+unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+#endif
+
 static const struct sched_class fair_sched_class;
 
 /**************************************************************
@@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				   unsigned long delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
 		cpuacct_charge(curtask, delta_exec);
 		account_group_exec_runtime(curtask, delta_exec);
 	}
+
+	account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
 static inline void
@@ -1248,6 +1266,58 @@ static inline u64 default_cfs_period(void)
 {
 	return 100000000ULL;
 }
+
+static inline u64 sched_cfs_bandwidth_slice(void)
+{
+	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
+}
+
+static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg = cfs_rq->tg;
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+	u64 amount = 0, min_amount;
+
+	/* note: this is a positive sum as runtime_remaining <= 0 */
+	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+
+	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->quota == RUNTIME_INF)
+		amount = min_amount;
+	else if (cfs_b->runtime > 0) {
+		amount = min(cfs_b->runtime, min_amount);
+		cfs_b->runtime -= amount;
+	}
+	raw_spin_unlock(&cfs_b->lock);
+
+	cfs_rq->runtime_remaining += amount;
+}
+
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				     unsigned long delta_exec)
+{
+	if (!cfs_rq->runtime_enabled)
+		return;
+
+	cfs_rq->runtime_remaining -= delta_exec;
+	if (cfs_rq->runtime_remaining > 0)
+		return;
+
+	assign_cfs_rq_runtime(cfs_rq);
+}
+
+static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+						   unsigned long delta_exec)
+{
+	if (!cfs_rq->runtime_enabled)
+		return;
+
+	__account_cfs_rq_runtime(cfs_rq, delta_exec);
+}
+
+#else
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				   unsigned long delta_exec) {}
 #endif
 
 /**************************************************
@@ -4266,8 +4336,13 @@ static void set_curr_task_fair(struct rq *rq)
 {
 	struct sched_entity *se = &rq->curr->se;
 
-	for_each_sched_entity(se)
-		set_next_entity(cfs_rq_of(se), se);
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		set_next_entity(cfs_rq, se);
+		/* ensure bandwidth has been allocated on our new cfs_rq */
+		account_cfs_rq_runtime(cfs_rq, 0);
+	}
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
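The slice comment in the sched_fair.c hunk above can be made concrete with a small stand-alone calculation (illustrative only, not part of the patch): with the default 5000 us slice, a group with, say, 12 ms of global runtime left is topped up in 5 ms, 5 ms and finally 2 ms grants, the last grant clamped to what remains in the pool.

/* Illustrative only: hand out a hypothetical 12 ms of global runtime in
 * slice-sized grants, clamping the final grant as the comment describes. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t runtime = 12000000ULL;			/* 12 ms, in ns */
	const uint64_t slice = 5000 * 1000ULL;		/* 5000 us slice, in ns */

	while (runtime > 0) {
		uint64_t grant = runtime < slice ? runtime : slice;

		runtime -= grant;
		printf("granted %llu ns, %llu ns left in the global pool\n",
		       (unsigned long long)grant, (unsigned long long)runtime);
	}
	return 0;
}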
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b531e50..2d2ecdcc8cdb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &one,
 	},
 #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.procname	= "sched_cfs_bandwidth_slice_us",
+		.data		= &sysctl_sched_cfs_bandwidth_slice,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
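Since the new ctl_table entry sits in kern_table, the knob should surface as /proc/sys/kernel/sched_cfs_bandwidth_slice_us on kernels built with CONFIG_CFS_BANDWIDTH (the path is inferred from the table placement, not stated by the patch), and with .extra1 = &one, proc_dointvec_minmax rejects writes below 1. A minimal user-space reader, assuming that path:

/* Reads the bandwidth slice sysctl; the /proc path below is an assumption
 * based on the kern_table placement shown in the hunk above. */
#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/kernel/sched_cfs_bandwidth_slice_us";
	unsigned int slice_us;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%u", &slice_us) != 1) {
		fprintf(stderr, "unexpected contents in %s\n", path);
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("CFS bandwidth slice: %u us\n", slice_us);
	return 0;
}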