aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2017-09-25 12:00:19 -0400
committerTejun Heo <tj@kernel.org>2017-09-29 17:30:37 -0400
commit0d5936344f30aba0f6ddb92b030cb6a05168efe6 (patch)
tree5eabe1efd54035cac1594286f60d44f2dc2df786
parenta1f7164c7b8b0d46f63bfb4ca0bb5971c760b921 (diff)
sched: Implement interface for cgroup unified hierarchy
There are a couple of interface issues which can be addressed in the cgroup2 interface.

* Stats from cpuacct being reported separately from the cpu stats.

* Use of different time units. Writable control knobs use microseconds, some stat fields use nanoseconds while other cpuacct stat fields use centiseconds.

* Control knobs which can't be used in the root cgroup still show up in the root.

* Control knob names and semantics aren't consistent with other controllers.

This patchset implements cpu controller's interface on cgroup2 which adheres to the controller file conventions described in Documentation/cgroup-v2.txt. Overall, the following changes are made.

* cpuacct is implicitly enabled and disabled by cpu and its information is reported through "cpu.stat" which now uses microseconds for all time durations. All time duration fields now have "_usec" appended to them for clarity. Note that cpuacct.usage_percpu is currently not included in "cpu.stat". If this information is actually called for, it will be added later.

* "cpu.shares" is replaced with "cpu.weight" and operates on the standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000). The weight is scaled to scheduler weight so that 100 maps to 1024 and the ratio relationship is preserved - if weight is W and its scaled value is S, W / 100 == S / 1024. While the mapped range is a bit smaller than the original scheduler weight range, the dead zones on both sides are relatively small and cover a wider range than the nice value mappings. This file doesn't make sense in the root cgroup and isn't created on root.

* "cpu.weight.nice" is added. When read, it reads back the nice value which is closest to the current "cpu.weight". When written, it sets "cpu.weight" to the weight value which matches the nice value. This makes it easy to configure cgroups when they're competing against threads in threaded subtrees.
* "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max" which contains both quota and period.

v4: - Use cgroup2 basic usage stat as the information source instead of cpuacct.

v3: - Added "cpu.weight.nice" to allow using nice values when configuring the weight. The feature is requested by PeterZ.
    - Merge the patch to enable threaded support on cpu and cpuacct.
    - Dropped the bits about getting rid of cpuacct from patch description as there is a pretty strong case for making cpuacct an implicit controller so that basic cpu usage stats are always available.
    - Documentation updated accordingly. "cpu.rt.max" section is dropped for now.

v2: - cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for CFS bandwidth stats and also using raw division for u64. Use CONFIG_CFS_BANDWIDTH and do_div() instead. "cpu.rt.max" is not included yet.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
-rw-r--r--Documentation/cgroup-v2.txt36
-rw-r--r--kernel/sched/core.c171
2 files changed, 183 insertions, 24 deletions
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 3f8216912df0..0bbdc720dd7c 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -902,10 +902,6 @@ Controllers
902CPU 902CPU
903--- 903---
904 904
905.. note::
906
907 The interface for the cpu controller hasn't been merged yet
908
909The "cpu" controllers regulates distribution of CPU cycles. This 905The "cpu" controllers regulates distribution of CPU cycles. This
910controller implements weight and absolute bandwidth limit models for 906controller implements weight and absolute bandwidth limit models for
911normal scheduling policy and absolute bandwidth allocation model for 907normal scheduling policy and absolute bandwidth allocation model for
@@ -935,6 +931,18 @@ All time durations are in microseconds.
935 931
936 The weight in the range [1, 10000]. 932 The weight in the range [1, 10000].
937 933
934 cpu.weight.nice
935 A read-write single value file which exists on non-root
936 cgroups. The default is "0".
937
938 The nice value is in the range [-20, 19].
939
940 This interface file is an alternative interface for
941 "cpu.weight" and allows reading and setting weight using the
942 same values used by nice(2). Because the range is smaller and
943 granularity is coarser for the nice values, the read value is
944 the closest approximation of the current weight.
945
938 cpu.max 946 cpu.max
939 A read-write two value file which exists on non-root cgroups. 947 A read-write two value file which exists on non-root cgroups.
940 The default is "max 100000". 948 The default is "max 100000".
@@ -947,26 +955,6 @@ All time durations are in microseconds.
947 $PERIOD duration. "max" for $MAX indicates no limit. If only 955 $PERIOD duration. "max" for $MAX indicates no limit. If only
948 one number is written, $MAX is updated. 956 one number is written, $MAX is updated.
949 957
950 cpu.rt.max
951 .. note::
952
953 The semantics of this file is still under discussion and the
954 interface hasn't been merged yet
955
956 A read-write two value file which exists on all cgroups.
957 The default is "0 100000".
958
959 The maximum realtime runtime allocation. Over-committing
960 configurations are disallowed and process migrations are
961 rejected if not enough bandwidth is available. It's in the
962 following format::
963
964 $MAX $PERIOD
965
966 which indicates that the group may consume upto $MAX in each
967 $PERIOD duration. If only one number is written, $MAX is
968 updated.
969
970 958
971Memory 959Memory
972------ 960------
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6815fa424a7a..ad255162a830 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6678,6 +6678,175 @@ static struct cftype cpu_legacy_files[] = {
6678 { } /* Terminate */ 6678 { } /* Terminate */
6679}; 6679};
6680 6680
/*
 * seq_file show handler for the cgroup2 "cpu.stat" file.
 *
 * Emits the cputime lines produced by cgroup_stat_show_cputime() and, when
 * CONFIG_CFS_BANDWIDTH is enabled, appends the period/throttle counters of
 * the group's cfs_bandwidth.  throttled_time is divided by NSEC_PER_USEC so
 * that it is reported as "throttled_usec".
 *
 * Always returns 0.
 */
static int cpu_stat_show(struct seq_file *sf, void *v)
{
#ifdef CONFIG_CFS_BANDWIDTH
	struct cfs_bandwidth *bw = &css_tg(seq_css(sf))->cfs_bandwidth;
	u64 usec;
#endif

	cgroup_stat_show_cputime(sf, "");

#ifdef CONFIG_CFS_BANDWIDTH
	/* u64 division: go through do_div() rather than a raw '/' */
	usec = bw->throttled_time;
	do_div(usec, NSEC_PER_USEC);

	seq_printf(sf,
		   "nr_periods %d\n"
		   "nr_throttled %d\n"
		   "throttled_usec %llu\n",
		   bw->nr_periods,
		   bw->nr_throttled,
		   usec);
#endif
	return 0;
}
6703
6704#ifdef CONFIG_FAIR_GROUP_SCHED
6705static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
6706 struct cftype *cft)
6707{
6708 struct task_group *tg = css_tg(css);
6709 u64 weight = scale_load_down(tg->shares);
6710
6711 return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
6712}
6713
6714static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
6715 struct cftype *cft, u64 weight)
6716{
6717 /*
6718 * cgroup weight knobs should use the common MIN, DFL and MAX
6719 * values which are 1, 100 and 10000 respectively. While it loses
6720 * a bit of range on both ends, it maps pretty well onto the shares
6721 * value used by scheduler and the round-trip conversions preserve
6722 * the original value over the entire range.
6723 */
6724 if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
6725 return -ERANGE;
6726
6727 weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
6728
6729 return sched_group_set_shares(css_tg(css), scale_load(weight));
6730}
6731
6732static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
6733 struct cftype *cft)
6734{
6735 unsigned long weight = scale_load_down(css_tg(css)->shares);
6736 int last_delta = INT_MAX;
6737 int prio, delta;
6738
6739 /* find the closest nice value to the current weight */
6740 for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
6741 delta = abs(sched_prio_to_weight[prio] - weight);
6742 if (delta >= last_delta)
6743 break;
6744 last_delta = delta;
6745 }
6746
6747 return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
6748}
6749
6750static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
6751 struct cftype *cft, s64 nice)
6752{
6753 unsigned long weight;
6754
6755 if (nice < MIN_NICE || nice > MAX_NICE)
6756 return -ERANGE;
6757
6758 weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO];
6759 return sched_group_set_shares(css_tg(css), scale_load(weight));
6760}
6761#endif
6762
6763static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
6764 long period, long quota)
6765{
6766 if (quota < 0)
6767 seq_puts(sf, "max");
6768 else
6769 seq_printf(sf, "%ld", quota);
6770
6771 seq_printf(sf, " %ld\n", period);
6772}
6773
6774/* caller should put the current value in *@periodp before calling */
6775static int __maybe_unused cpu_period_quota_parse(char *buf,
6776 u64 *periodp, u64 *quotap)
6777{
6778 char tok[21]; /* U64_MAX */
6779
6780 if (!sscanf(buf, "%s %llu", tok, periodp))
6781 return -EINVAL;
6782
6783 *periodp *= NSEC_PER_USEC;
6784
6785 if (sscanf(tok, "%llu", quotap))
6786 *quotap *= NSEC_PER_USEC;
6787 else if (!strcmp(tok, "max"))
6788 *quotap = RUNTIME_INF;
6789 else
6790 return -EINVAL;
6791
6792 return 0;
6793}
6794
6795#ifdef CONFIG_CFS_BANDWIDTH
/*
 * seq_file show handler for "cpu.max": prints the group's current CFS
 * quota and period through cpu_period_quota_print().  Always returns 0.
 */
static int cpu_max_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));
	long period = tg_get_cfs_period(tg);
	long quota = tg_get_cfs_quota(tg);

	cpu_period_quota_print(sf, period, quota);

	return 0;
}
6803
6804static ssize_t cpu_max_write(struct kernfs_open_file *of,
6805 char *buf, size_t nbytes, loff_t off)
6806{
6807 struct task_group *tg = css_tg(of_css(of));
6808 u64 period = tg_get_cfs_period(tg);
6809 u64 quota;
6810 int ret;
6811
6812 ret = cpu_period_quota_parse(buf, &period, &quota);
6813 if (!ret)
6814 ret = tg_set_cfs_bandwidth(tg, period, quota);
6815 return ret ?: nbytes;
6816}
6817#endif
6818
6819static struct cftype cpu_files[] = {
6820 {
6821 .name = "stat",
6822 .flags = CFTYPE_NOT_ON_ROOT,
6823 .seq_show = cpu_stat_show,
6824 },
6825#ifdef CONFIG_FAIR_GROUP_SCHED
6826 {
6827 .name = "weight",
6828 .flags = CFTYPE_NOT_ON_ROOT,
6829 .read_u64 = cpu_weight_read_u64,
6830 .write_u64 = cpu_weight_write_u64,
6831 },
6832 {
6833 .name = "weight.nice",
6834 .flags = CFTYPE_NOT_ON_ROOT,
6835 .read_s64 = cpu_weight_nice_read_s64,
6836 .write_s64 = cpu_weight_nice_write_s64,
6837 },
6838#endif
6839#ifdef CONFIG_CFS_BANDWIDTH
6840 {
6841 .name = "max",
6842 .flags = CFTYPE_NOT_ON_ROOT,
6843 .seq_show = cpu_max_show,
6844 .write = cpu_max_write,
6845 },
6846#endif
6847 { } /* terminate */
6848};
6849
6681struct cgroup_subsys cpu_cgrp_subsys = { 6850struct cgroup_subsys cpu_cgrp_subsys = {
6682 .css_alloc = cpu_cgroup_css_alloc, 6851 .css_alloc = cpu_cgroup_css_alloc,
6683 .css_online = cpu_cgroup_css_online, 6852 .css_online = cpu_cgroup_css_online,
@@ -6687,7 +6856,9 @@ struct cgroup_subsys cpu_cgrp_subsys = {
6687 .can_attach = cpu_cgroup_can_attach, 6856 .can_attach = cpu_cgroup_can_attach,
6688 .attach = cpu_cgroup_attach, 6857 .attach = cpu_cgroup_attach,
6689 .legacy_cftypes = cpu_legacy_files, 6858 .legacy_cftypes = cpu_legacy_files,
6859 .dfl_cftypes = cpu_files,
6690 .early_init = true, 6860 .early_init = true,
6861 .threaded = true,
6691}; 6862};
6692 6863
6693#endif /* CONFIG_CGROUP_SCHED */ 6864#endif /* CONFIG_CGROUP_SCHED */