-rw-r--r--  Documentation/cgroups/cgroups.txt    92
-rw-r--r--  drivers/net/tun.c                      1
-rw-r--r--  fs/xattr.c                           180
-rw-r--r--  include/linux/cgroup.h                25
-rw-r--r--  include/linux/cgroup_subsys.h         24
-rw-r--r--  include/linux/shmem_fs.h               3
-rw-r--r--  include/linux/xattr.h                 48
-rw-r--r--  include/net/cls_cgroup.h              27
-rw-r--r--  include/net/netprio_cgroup.h          30
-rw-r--r--  include/net/sock.h                     8
-rw-r--r--  kernel/cgroup.c                      320
-rw-r--r--  mm/shmem.c                           171
-rw-r--r--  net/core/netprio_cgroup.c             11
-rw-r--r--  net/core/sock.c                       15
-rw-r--r--  net/sched/cls_cgroup.c                13
15 files changed, 575 insertions, 393 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 4a0b64c605fc..9e04196c4d78 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -29,7 +29,8 @@ CONTENTS:
29 3.1 Overview 29 3.1 Overview
30 3.2 Synchronization 30 3.2 Synchronization
31 3.3 Subsystem API 31 3.3 Subsystem API
324. Questions 324. Extended attribute usage
335. Questions
33 34
341. Control Groups 351. Control Groups
35================= 36=================
@@ -62,9 +63,9 @@ an instance of the cgroup virtual filesystem associated with it.
62At any one time there may be multiple active hierarchies of task 63At any one time there may be multiple active hierarchies of task
63cgroups. Each hierarchy is a partition of all tasks in the system. 64cgroups. Each hierarchy is a partition of all tasks in the system.
64 65
65User level code may create and destroy cgroups by name in an 66User-level code may create and destroy cgroups by name in an
66instance of the cgroup virtual file system, specify and query to 67instance of the cgroup virtual file system, specify and query to
67which cgroup a task is assigned, and list the task pids assigned to 68which cgroup a task is assigned, and list the task PIDs assigned to
68a cgroup. Those creations and assignments only affect the hierarchy 69a cgroup. Those creations and assignments only affect the hierarchy
69associated with that instance of the cgroup file system. 70associated with that instance of the cgroup file system.
70 71
@@ -72,7 +73,7 @@ On their own, the only use for cgroups is for simple job
72tracking. The intention is that other subsystems hook into the generic 73tracking. The intention is that other subsystems hook into the generic
73cgroup support to provide new attributes for cgroups, such as 74cgroup support to provide new attributes for cgroups, such as
74accounting/limiting the resources which processes in a cgroup can 75accounting/limiting the resources which processes in a cgroup can
75access. For example, cpusets (see Documentation/cgroups/cpusets.txt) allows 76access. For example, cpusets (see Documentation/cgroups/cpusets.txt) allow
76you to associate a set of CPUs and a set of memory nodes with the 77you to associate a set of CPUs and a set of memory nodes with the
77tasks in each cgroup. 78tasks in each cgroup.
78 79
@@ -80,11 +81,11 @@ tasks in each cgroup.
80---------------------------- 81----------------------------
81 82
82There are multiple efforts to provide process aggregations in the 83There are multiple efforts to provide process aggregations in the
83Linux kernel, mainly for resource tracking purposes. Such efforts 84Linux kernel, mainly for resource-tracking purposes. Such efforts
84include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server 85include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
85namespaces. These all require the basic notion of a 86namespaces. These all require the basic notion of a
86grouping/partitioning of processes, with newly forked processes ending 87grouping/partitioning of processes, with newly forked processes ending
87in the same group (cgroup) as their parent process. 88up in the same group (cgroup) as their parent process.
88 89
89The kernel cgroup patch provides the minimum essential kernel 90The kernel cgroup patch provides the minimum essential kernel
90mechanisms required to efficiently implement such groups. It has 91mechanisms required to efficiently implement such groups. It has
@@ -127,14 +128,14 @@ following lines:
127 / \ 128 / \
128 Professors (15%) students (5%) 129 Professors (15%) students (5%)
129 130
130Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd go 131Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes
131into NFS network class. 132into the NFS network class.
132 133
133At the same time Firefox/Lynx will share an appropriate CPU/Memory class 134At the same time Firefox/Lynx will share an appropriate CPU/Memory class
134depending on who launched it (prof/student). 135depending on who launched it (prof/student).
135 136
136With the ability to classify tasks differently for different resources 137With the ability to classify tasks differently for different resources
137(by putting those resource subsystems in different hierarchies) then 138(by putting those resource subsystems in different hierarchies),
138the admin can easily set up a script which receives exec notifications 139the admin can easily set up a script which receives exec notifications
139and depending on who is launching the browser he can 140and depending on who is launching the browser he can
140 141
@@ -145,19 +146,19 @@ a separate cgroup for every browser launched and associate it with
145appropriate network and other resource class. This may lead to 146appropriate network and other resource class. This may lead to
146proliferation of such cgroups. 147proliferation of such cgroups.
147 148
148Also lets say that the administrator would like to give enhanced network 149Also let's say that the administrator would like to give enhanced network
149access temporarily to a student's browser (since it is night and the user 150access temporarily to a student's browser (since it is night and the user
150wants to do online gaming :)) OR give one of the students simulation 151wants to do online gaming :)) OR give one of the student's simulation
151apps enhanced CPU power, 152apps enhanced CPU power.
152 153
153With ability to write pids directly to resource classes, it's just a 154With ability to write PIDs directly to resource classes, it's just a
154matter of : 155matter of:
155 156
156 # echo pid > /sys/fs/cgroup/network/<new_class>/tasks 157 # echo pid > /sys/fs/cgroup/network/<new_class>/tasks
157 (after some time) 158 (after some time)
158 # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks 159 # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks
159 160
160Without this ability, he would have to split the cgroup into 161Without this ability, the administrator would have to split the cgroup into
161multiple separate ones and then associate the new cgroups with the 162multiple separate ones and then associate the new cgroups with the
162new resource classes. 163new resource classes.
163 164
@@ -184,20 +185,20 @@ Control Groups extends the kernel as follows:
184 field of each task_struct using the css_set, anchored at 185 field of each task_struct using the css_set, anchored at
185 css_set->tasks. 186 css_set->tasks.
186 187
187 - A cgroup hierarchy filesystem can be mounted for browsing and 188 - A cgroup hierarchy filesystem can be mounted for browsing and
188 manipulation from user space. 189 manipulation from user space.
189 190
190 - You can list all the tasks (by pid) attached to any cgroup. 191 - You can list all the tasks (by PID) attached to any cgroup.
191 192
192The implementation of cgroups requires a few, simple hooks 193The implementation of cgroups requires a few, simple hooks
193into the rest of the kernel, none in performance critical paths: 194into the rest of the kernel, none in performance-critical paths:
194 195
195 - in init/main.c, to initialize the root cgroups and initial 196 - in init/main.c, to initialize the root cgroups and initial
196 css_set at system boot. 197 css_set at system boot.
197 198
198 - in fork and exit, to attach and detach a task from its css_set. 199 - in fork and exit, to attach and detach a task from its css_set.
199 200
200In addition a new file system, of type "cgroup" may be mounted, to 201In addition, a new file system of type "cgroup" may be mounted, to
201enable browsing and modifying the cgroups presently known to the 202enable browsing and modifying the cgroups presently known to the
202kernel. When mounting a cgroup hierarchy, you may specify a 203kernel. When mounting a cgroup hierarchy, you may specify a
203comma-separated list of subsystems to mount as the filesystem mount 204comma-separated list of subsystems to mount as the filesystem mount
@@ -230,13 +231,13 @@ as the path relative to the root of the cgroup file system.
230Each cgroup is represented by a directory in the cgroup file system 231Each cgroup is represented by a directory in the cgroup file system
231containing the following files describing that cgroup: 232containing the following files describing that cgroup:
232 233
233 - tasks: list of tasks (by pid) attached to that cgroup. This list 234 - tasks: list of tasks (by PID) attached to that cgroup. This list
234 is not guaranteed to be sorted. Writing a thread id into this file 235 is not guaranteed to be sorted. Writing a thread ID into this file
235 moves the thread into this cgroup. 236 moves the thread into this cgroup.
236 - cgroup.procs: list of tgids in the cgroup. This list is not 237 - cgroup.procs: list of thread group IDs in the cgroup. This list is
237 guaranteed to be sorted or free of duplicate tgids, and userspace 238 not guaranteed to be sorted or free of duplicate TGIDs, and userspace
238 should sort/uniquify the list if this property is required. 239 should sort/uniquify the list if this property is required.
239 Writing a thread group id into this file moves all threads in that 240 Writing a thread group ID into this file moves all threads in that
240 group into this cgroup. 241 group into this cgroup.
241 - notify_on_release flag: run the release agent on exit? 242 - notify_on_release flag: run the release agent on exit?
242 - release_agent: the path to use for release notifications (this file 243 - release_agent: the path to use for release notifications (this file
@@ -261,7 +262,7 @@ cgroup file system directories.
261 262
262When a task is moved from one cgroup to another, it gets a new 263When a task is moved from one cgroup to another, it gets a new
263css_set pointer - if there's an already existing css_set with the 264css_set pointer - if there's an already existing css_set with the
264desired collection of cgroups then that group is reused, else a new 265desired collection of cgroups then that group is reused, otherwise a new
265css_set is allocated. The appropriate existing css_set is located by 266css_set is allocated. The appropriate existing css_set is located by
266looking into a hash table. 267looking into a hash table.
267 268
@@ -292,7 +293,7 @@ file system) of the abandoned cgroup. This enables automatic
292removal of abandoned cgroups. The default value of 293removal of abandoned cgroups. The default value of
293notify_on_release in the root cgroup at system boot is disabled 294notify_on_release in the root cgroup at system boot is disabled
294(0). The default value of other cgroups at creation is the current 295(0). The default value of other cgroups at creation is the current
295value of their parents notify_on_release setting. The default value of 296value of their parents' notify_on_release settings. The default value of
296a cgroup hierarchy's release_agent path is empty. 297a cgroup hierarchy's release_agent path is empty.
297 298
2981.5 What does clone_children do ? 2991.5 What does clone_children do ?
@@ -316,7 +317,7 @@ the "cpuset" cgroup subsystem, the steps are something like:
316 4) Create the new cgroup by doing mkdir's and write's (or echo's) in 317 4) Create the new cgroup by doing mkdir's and write's (or echo's) in
317 the /sys/fs/cgroup virtual file system. 318 the /sys/fs/cgroup virtual file system.
318 5) Start a task that will be the "founding father" of the new job. 319 5) Start a task that will be the "founding father" of the new job.
319 6) Attach that task to the new cgroup by writing its pid to the 320 6) Attach that task to the new cgroup by writing its PID to the
320 /sys/fs/cgroup/cpuset/tasks file for that cgroup. 321 /sys/fs/cgroup/cpuset/tasks file for that cgroup.
321 7) fork, exec or clone the job tasks from this founding father task. 322 7) fork, exec or clone the job tasks from this founding father task.
322 323
@@ -344,7 +345,7 @@ and then start a subshell 'sh' in that cgroup:
3442.1 Basic Usage 3452.1 Basic Usage
345--------------- 346---------------
346 347
347Creating, modifying, using the cgroups can be done through the cgroup 348Creating, modifying, using cgroups can be done through the cgroup
348virtual filesystem. 349virtual filesystem.
349 350
350To mount a cgroup hierarchy with all available subsystems, type: 351To mount a cgroup hierarchy with all available subsystems, type:
@@ -441,7 +442,7 @@ You can attach the current shell task by echoing 0:
441# echo 0 > tasks 442# echo 0 > tasks
442 443
443You can use the cgroup.procs file instead of the tasks file to move all 444You can use the cgroup.procs file instead of the tasks file to move all
444threads in a threadgroup at once. Echoing the pid of any task in a 445threads in a threadgroup at once. Echoing the PID of any task in a
445threadgroup to cgroup.procs causes all tasks in that threadgroup to be 446threadgroup to cgroup.procs causes all tasks in that threadgroup to be
446be attached to the cgroup. Writing 0 to cgroup.procs moves all tasks 447be attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
447in the writing task's threadgroup. 448in the writing task's threadgroup.
@@ -479,7 +480,7 @@ in /proc/mounts and /proc/<pid>/cgroups.
479There is mechanism which allows to get notifications about changing 480There is mechanism which allows to get notifications about changing
480status of a cgroup. 481status of a cgroup.
481 482
482To register new notification handler you need: 483To register a new notification handler you need to:
483 - create a file descriptor for event notification using eventfd(2); 484 - create a file descriptor for event notification using eventfd(2);
484 - open a control file to be monitored (e.g. memory.usage_in_bytes); 485 - open a control file to be monitored (e.g. memory.usage_in_bytes);
485 - write "<event_fd> <control_fd> <args>" to cgroup.event_control. 486 - write "<event_fd> <control_fd> <args>" to cgroup.event_control.
@@ -488,7 +489,7 @@ To register new notification handler you need:
488eventfd will be woken up by control file implementation or when the 489eventfd will be woken up by control file implementation or when the
489cgroup is removed. 490cgroup is removed.
490 491
491To unregister notification handler just close eventfd. 492To unregister a notification handler just close eventfd.
492 493
493NOTE: Support of notifications should be implemented for the control 494NOTE: Support of notifications should be implemented for the control
494file. See documentation for the subsystem. 495file. See documentation for the subsystem.
@@ -502,7 +503,7 @@ file. See documentation for the subsystem.
502Each kernel subsystem that wants to hook into the generic cgroup 503Each kernel subsystem that wants to hook into the generic cgroup
503system needs to create a cgroup_subsys object. This contains 504system needs to create a cgroup_subsys object. This contains
504various methods, which are callbacks from the cgroup system, along 505various methods, which are callbacks from the cgroup system, along
505with a subsystem id which will be assigned by the cgroup system. 506with a subsystem ID which will be assigned by the cgroup system.
506 507
507Other fields in the cgroup_subsys object include: 508Other fields in the cgroup_subsys object include:
508 509
@@ -516,7 +517,7 @@ Other fields in the cgroup_subsys object include:
516 at system boot. 517 at system boot.
517 518
518Each cgroup object created by the system has an array of pointers, 519Each cgroup object created by the system has an array of pointers,
519indexed by subsystem id; this pointer is entirely managed by the 520indexed by subsystem ID; this pointer is entirely managed by the
520subsystem; the generic cgroup code will never touch this pointer. 521subsystem; the generic cgroup code will never touch this pointer.
521 522
5223.2 Synchronization 5233.2 Synchronization
@@ -639,7 +640,7 @@ void post_clone(struct cgroup *cgrp)
639 640
640Called during cgroup_create() to do any parameter 641Called during cgroup_create() to do any parameter
641initialization which might be required before a task could attach. For 642initialization which might be required before a task could attach. For
642example in cpusets, no task may attach before 'cpus' and 'mems' are set 643example, in cpusets, no task may attach before 'cpus' and 'mems' are set
643up. 644up.
644 645
645void bind(struct cgroup *root) 646void bind(struct cgroup *root)
@@ -650,7 +651,26 @@ and root cgroup. Currently this will only involve movement between
650the default hierarchy (which never has sub-cgroups) and a hierarchy 651the default hierarchy (which never has sub-cgroups) and a hierarchy
651that is being created/destroyed (and hence has no sub-cgroups). 652that is being created/destroyed (and hence has no sub-cgroups).
652 653
6534. Questions 6544. Extended attribute usage
655===========================
656
657The cgroup filesystem supports certain types of extended attributes in its
658directories and files. The currently supported types are:
659 - Trusted (XATTR_TRUSTED)
660 - Security (XATTR_SECURITY)
661
662Both require the CAP_SYS_ADMIN capability to set.
663
664As in tmpfs, the extended attributes in the cgroup filesystem are stored
665in kernel memory, and it is advised to keep their use to a minimum. This
666is also why user-defined extended attributes are not supported: any user
667could set them, and there is no limit on the value size.
668
669The currently known users of this feature are SELinux, to limit cgroup usage
670in containers, and systemd, for assorted metadata such as the main PID in a
671cgroup (systemd creates a cgroup per service).
672
6735. Questions
654============ 674============
655 675
656Q: what's up with this '/bin/echo' ? 676Q: what's up with this '/bin/echo' ?
@@ -660,5 +680,5 @@ A: bash's builtin 'echo' command does not check calls to write() against
660 680
661Q: When I attach processes, only the first of the line gets really attached ! 681Q: When I attach processes, only the first of the line gets really attached !
662A: We can only return one error code per call to write(). So you should also 682A: We can only return one error code per call to write(). So you should also
663 put only ONE pid. 683 put only ONE PID.
664 684
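
Editor's note: the new documentation section above only describes the semantics.
As a concrete illustration, here is a userspace sketch that is not part of the
patch: the hierarchy path and the attribute name ("trusted.meta") are invented,
the hierarchy is assumed to have been mounted with the "xattr" option that the
kernel/cgroup.c changes below add to parse_cgroupfs_options(), and the caller is
assumed to have CAP_SYS_ADMIN.

/*
 * Illustrative sketch, not part of the patch: tag a cgroup directory
 * with a trusted.* attribute and read it back.  Path and attribute
 * name are made up; requires CAP_SYS_ADMIN and a hierarchy mounted
 * with the new "xattr" option.
 */
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *dir = "/sys/fs/cgroup/systemd/myservice";
	const char *val = "main-pid=1234";
	char buf[64];
	ssize_t n;

	if (setxattr(dir, "trusted.meta", val, strlen(val), 0) < 0)
		perror("setxattr");

	n = getxattr(dir, "trusted.meta", buf, sizeof(buf));
	if (n >= 0)
		printf("trusted.meta = %.*s\n", (int)n, buf);
	else
		perror("getxattr");
	return 0;
}

For unprivileged callers, listxattr(2) on the same directory is expected to omit
the trusted.* names, matching the behaviour of simple_xattr_list() added in
fs/xattr.c below.
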
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3a16d4fdaa05..9336b829cc81 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -68,6 +68,7 @@
68#include <net/netns/generic.h> 68#include <net/netns/generic.h>
69#include <net/rtnetlink.h> 69#include <net/rtnetlink.h>
70#include <net/sock.h> 70#include <net/sock.h>
71#include <net/cls_cgroup.h>
71 72
72#include <asm/uaccess.h> 73#include <asm/uaccess.h>
73 74
diff --git a/fs/xattr.c b/fs/xattr.c
index 4d45b7189e7e..014f11321fd9 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -791,3 +791,183 @@ EXPORT_SYMBOL(generic_getxattr);
791EXPORT_SYMBOL(generic_listxattr); 791EXPORT_SYMBOL(generic_listxattr);
792EXPORT_SYMBOL(generic_setxattr); 792EXPORT_SYMBOL(generic_setxattr);
793EXPORT_SYMBOL(generic_removexattr); 793EXPORT_SYMBOL(generic_removexattr);
794
795/*
796 * Allocate new xattr and copy in the value; but leave the name to callers.
797 */
798struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
799{
800 struct simple_xattr *new_xattr;
801 size_t len;
802
803 /* wrap around? */
804 len = sizeof(*new_xattr) + size;
805 if (len <= sizeof(*new_xattr))
806 return NULL;
807
808 new_xattr = kmalloc(len, GFP_KERNEL);
809 if (!new_xattr)
810 return NULL;
811
812 new_xattr->size = size;
813 memcpy(new_xattr->value, value, size);
814 return new_xattr;
815}
816
817/*
818 * xattr GET operation for in-memory/pseudo filesystems
819 */
820int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
821 void *buffer, size_t size)
822{
823 struct simple_xattr *xattr;
824 int ret = -ENODATA;
825
826 spin_lock(&xattrs->lock);
827 list_for_each_entry(xattr, &xattrs->head, list) {
828 if (strcmp(name, xattr->name))
829 continue;
830
831 ret = xattr->size;
832 if (buffer) {
833 if (size < xattr->size)
834 ret = -ERANGE;
835 else
836 memcpy(buffer, xattr->value, xattr->size);
837 }
838 break;
839 }
840 spin_unlock(&xattrs->lock);
841 return ret;
842}
843
844static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
845 const void *value, size_t size, int flags)
846{
847 struct simple_xattr *xattr;
848 struct simple_xattr *uninitialized_var(new_xattr);
849 int err = 0;
850
851 /* value == NULL means remove */
852 if (value) {
853 new_xattr = simple_xattr_alloc(value, size);
854 if (!new_xattr)
855 return -ENOMEM;
856
857 new_xattr->name = kstrdup(name, GFP_KERNEL);
858 if (!new_xattr->name) {
859 kfree(new_xattr);
860 return -ENOMEM;
861 }
862 }
863
864 spin_lock(&xattrs->lock);
865 list_for_each_entry(xattr, &xattrs->head, list) {
866 if (!strcmp(name, xattr->name)) {
867 if (flags & XATTR_CREATE) {
868 xattr = new_xattr;
869 err = -EEXIST;
870 } else if (new_xattr) {
871 list_replace(&xattr->list, &new_xattr->list);
872 } else {
873 list_del(&xattr->list);
874 }
875 goto out;
876 }
877 }
878 if (flags & XATTR_REPLACE) {
879 xattr = new_xattr;
880 err = -ENODATA;
881 } else {
882 list_add(&new_xattr->list, &xattrs->head);
883 xattr = NULL;
884 }
885out:
886 spin_unlock(&xattrs->lock);
887 if (xattr) {
888 kfree(xattr->name);
889 kfree(xattr);
890 }
891 return err;
892
893}
894
895/**
896 * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
897 * @xattrs: target simple_xattr list
898 * @name: name of the new extended attribute
899 * @value: value of the new xattr. If %NULL, will remove the attribute
900 * @size: size of the new xattr
901 * @flags: %XATTR_{CREATE|REPLACE}
902 *
903 * If %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
904 * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
905 * otherwise, fails with -ENODATA.
906 *
907 * Returns 0 on success, -errno on failure.
908 */
909int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
910 const void *value, size_t size, int flags)
911{
912 if (size == 0)
913 value = ""; /* empty EA, do not remove */
914 return __simple_xattr_set(xattrs, name, value, size, flags);
915}
916
917/*
918 * xattr REMOVE operation for in-memory/pseudo filesystems
919 */
920int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name)
921{
922 return __simple_xattr_set(xattrs, name, NULL, 0, XATTR_REPLACE);
923}
924
925static bool xattr_is_trusted(const char *name)
926{
927 return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
928}
929
930/*
931 * xattr LIST operation for in-memory/pseudo filesystems
932 */
933ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer,
934 size_t size)
935{
936 bool trusted = capable(CAP_SYS_ADMIN);
937 struct simple_xattr *xattr;
938 size_t used = 0;
939
940 spin_lock(&xattrs->lock);
941 list_for_each_entry(xattr, &xattrs->head, list) {
942 size_t len;
943
944 /* skip "trusted." attributes for unprivileged callers */
945 if (!trusted && xattr_is_trusted(xattr->name))
946 continue;
947
948 len = strlen(xattr->name) + 1;
949 used += len;
950 if (buffer) {
951 if (size < used) {
952 used = -ERANGE;
953 break;
954 }
955 memcpy(buffer, xattr->name, len);
956 buffer += len;
957 }
958 }
959 spin_unlock(&xattrs->lock);
960
961 return used;
962}
963
964/*
965 * Adds an extended attribute to the list
966 */
967void simple_xattr_list_add(struct simple_xattrs *xattrs,
968 struct simple_xattr *new_xattr)
969{
970 spin_lock(&xattrs->lock);
971 list_add(&new_xattr->list, &xattrs->head);
972 spin_unlock(&xattrs->lock);
973}
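
Editor's note: the helpers above keep all locking and bookkeeping inside struct
simple_xattrs, so an in-memory filesystem can forward its xattr inode operations
to them almost verbatim. The following fragment is only a sketch under invented
names (my_inode_info, MY_I(), the my_* handlers); the real callers wired up by
this series are tmpfs (mm/shmem.c) and the cgroup filesystem (kernel/cgroup.c),
and a real handler would also check the attribute namespace before calling in.

/*
 * Sketch only: forwarding a pseudo filesystem's xattr operations to the
 * simple_xattr helpers.  "my_inode_info" and "MY_I()" are invented; the
 * user./trusted./security. namespace checks are omitted for brevity.
 */
#include <linux/fs.h>
#include <linux/xattr.h>

struct my_inode_info {
	struct simple_xattrs	xattrs;		/* guarded by xattrs.lock */
	struct inode		vfs_inode;
};

#define MY_I(inode) container_of(inode, struct my_inode_info, vfs_inode)

static int my_setxattr(struct dentry *dentry, const char *name,
		       const void *value, size_t size, int flags)
{
	/* a NULL value makes __simple_xattr_set() drop the entry */
	return simple_xattr_set(&MY_I(dentry->d_inode)->xattrs,
				name, value, size, flags);
}

static ssize_t my_getxattr(struct dentry *dentry, const char *name,
			   void *buffer, size_t size)
{
	/* returns the value size, or -ENODATA if the name is absent */
	return simple_xattr_get(&MY_I(dentry->d_inode)->xattrs,
				name, buffer, size);
}

static ssize_t my_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
	/* trusted.* names are skipped for callers without CAP_SYS_ADMIN */
	return simple_xattr_list(&MY_I(dentry->d_inode)->xattrs,
				 buffer, size);
}
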
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c90eaa803440..df354ae079c1 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -17,6 +17,7 @@
17#include <linux/rwsem.h> 17#include <linux/rwsem.h>
18#include <linux/idr.h> 18#include <linux/idr.h>
19#include <linux/workqueue.h> 19#include <linux/workqueue.h>
20#include <linux/xattr.h>
20 21
21#ifdef CONFIG_CGROUPS 22#ifdef CONFIG_CGROUPS
22 23
@@ -45,17 +46,13 @@ extern const struct file_operations proc_cgroup_operations;
45 46
46/* Define the enumeration of all builtin cgroup subsystems */ 47/* Define the enumeration of all builtin cgroup subsystems */
47#define SUBSYS(_x) _x ## _subsys_id, 48#define SUBSYS(_x) _x ## _subsys_id,
49#define IS_SUBSYS_ENABLED(option) IS_ENABLED(option)
48enum cgroup_subsys_id { 50enum cgroup_subsys_id {
49#include <linux/cgroup_subsys.h> 51#include <linux/cgroup_subsys.h>
50 CGROUP_BUILTIN_SUBSYS_COUNT 52 CGROUP_SUBSYS_COUNT,
51}; 53};
54#undef IS_SUBSYS_ENABLED
52#undef SUBSYS 55#undef SUBSYS
53/*
54 * This define indicates the maximum number of subsystems that can be loaded
55 * at once. We limit to this many since cgroupfs_root has subsys_bits to keep
56 * track of all of them.
57 */
58#define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE*sizeof(unsigned long))
59 56
60/* Per-subsystem/per-cgroup state maintained by the system. */ 57/* Per-subsystem/per-cgroup state maintained by the system. */
61struct cgroup_subsys_state { 58struct cgroup_subsys_state {
@@ -216,6 +213,9 @@ struct cgroup {
216 /* List of events which userspace want to receive */ 213 /* List of events which userspace want to receive */
217 struct list_head event_list; 214 struct list_head event_list;
218 spinlock_t event_list_lock; 215 spinlock_t event_list_lock;
216
217 /* directory xattrs */
218 struct simple_xattrs xattrs;
219}; 219};
220 220
221/* 221/*
@@ -309,6 +309,9 @@ struct cftype {
309 /* CFTYPE_* flags */ 309 /* CFTYPE_* flags */
310 unsigned int flags; 310 unsigned int flags;
311 311
312 /* file xattrs */
313 struct simple_xattrs xattrs;
314
312 int (*open)(struct inode *inode, struct file *file); 315 int (*open)(struct inode *inode, struct file *file);
313 ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, 316 ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
314 struct file *file, 317 struct file *file,
@@ -394,7 +397,7 @@ struct cftype {
394 */ 397 */
395struct cftype_set { 398struct cftype_set {
396 struct list_head node; /* chained at subsys->cftsets */ 399 struct list_head node; /* chained at subsys->cftsets */
397 const struct cftype *cfts; 400 struct cftype *cfts;
398}; 401};
399 402
400struct cgroup_scanner { 403struct cgroup_scanner {
@@ -406,8 +409,8 @@ struct cgroup_scanner {
406 void *data; 409 void *data;
407}; 410};
408 411
409int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts); 412int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
410int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts); 413int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
411 414
412int cgroup_is_removed(const struct cgroup *cgrp); 415int cgroup_is_removed(const struct cgroup *cgrp);
413 416
@@ -521,7 +524,9 @@ struct cgroup_subsys {
521}; 524};
522 525
523#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; 526#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
527#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
524#include <linux/cgroup_subsys.h> 528#include <linux/cgroup_subsys.h>
529#undef IS_SUBSYS_ENABLED
525#undef SUBSYS 530#undef SUBSYS
526 531
527static inline struct cgroup_subsys_state *cgroup_subsys_state( 532static inline struct cgroup_subsys_state *cgroup_subsys_state(
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index dfae957398c3..f204a7a9cf38 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -7,73 +7,73 @@
7 7
8/* */ 8/* */
9 9
10#ifdef CONFIG_CPUSETS 10#if IS_SUBSYS_ENABLED(CONFIG_CPUSETS)
11SUBSYS(cpuset) 11SUBSYS(cpuset)
12#endif 12#endif
13 13
14/* */ 14/* */
15 15
16#ifdef CONFIG_CGROUP_DEBUG 16#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEBUG)
17SUBSYS(debug) 17SUBSYS(debug)
18#endif 18#endif
19 19
20/* */ 20/* */
21 21
22#ifdef CONFIG_CGROUP_SCHED 22#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_SCHED)
23SUBSYS(cpu_cgroup) 23SUBSYS(cpu_cgroup)
24#endif 24#endif
25 25
26/* */ 26/* */
27 27
28#ifdef CONFIG_CGROUP_CPUACCT 28#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_CPUACCT)
29SUBSYS(cpuacct) 29SUBSYS(cpuacct)
30#endif 30#endif
31 31
32/* */ 32/* */
33 33
34#ifdef CONFIG_MEMCG 34#if IS_SUBSYS_ENABLED(CONFIG_MEMCG)
35SUBSYS(mem_cgroup) 35SUBSYS(mem_cgroup)
36#endif 36#endif
37 37
38/* */ 38/* */
39 39
40#ifdef CONFIG_CGROUP_DEVICE 40#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEVICE)
41SUBSYS(devices) 41SUBSYS(devices)
42#endif 42#endif
43 43
44/* */ 44/* */
45 45
46#ifdef CONFIG_CGROUP_FREEZER 46#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_FREEZER)
47SUBSYS(freezer) 47SUBSYS(freezer)
48#endif 48#endif
49 49
50/* */ 50/* */
51 51
52#ifdef CONFIG_NET_CLS_CGROUP 52#if IS_SUBSYS_ENABLED(CONFIG_NET_CLS_CGROUP)
53SUBSYS(net_cls) 53SUBSYS(net_cls)
54#endif 54#endif
55 55
56/* */ 56/* */
57 57
58#ifdef CONFIG_BLK_CGROUP 58#if IS_SUBSYS_ENABLED(CONFIG_BLK_CGROUP)
59SUBSYS(blkio) 59SUBSYS(blkio)
60#endif 60#endif
61 61
62/* */ 62/* */
63 63
64#ifdef CONFIG_CGROUP_PERF 64#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
65SUBSYS(perf) 65SUBSYS(perf)
66#endif 66#endif
67 67
68/* */ 68/* */
69 69
70#ifdef CONFIG_NETPRIO_CGROUP 70#if IS_SUBSYS_ENABLED(CONFIG_NETPRIO_CGROUP)
71SUBSYS(net_prio) 71SUBSYS(net_prio)
72#endif 72#endif
73 73
74/* */ 74/* */
75 75
76#ifdef CONFIG_CGROUP_HUGETLB 76#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_HUGETLB)
77SUBSYS(hugetlb) 77SUBSYS(hugetlb)
78#endif 78#endif
79 79
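
Editor's note: the mechanical #ifdef to #if IS_SUBSYS_ENABLED() conversion above
is the visible half of an X-macro trick. cgroup_subsys.h is #included several
times, and each includer defines SUBSYS() and IS_SUBSYS_ENABLED() to suit: with
IS_ENABLED() the list yields an ID for every subsystem, builtin or modular (the
enum in cgroup.h); with IS_BUILTIN() it yields declarations and array entries
for builtin subsystems only (the extern declarations in cgroup.h and the
subsys[] array in kernel/cgroup.c), leaving modular slots NULL until the module
registers. The stand-alone sketch below uses two invented subsystems and a plain
list macro in place of the repeated #include, so it builds as ordinary userspace C.

/*
 * Stand-alone sketch of the SUBSYS() generation trick (userspace C).
 * The real cgroup_subsys.h is #included repeatedly and wraps each entry
 * in "#if IS_SUBSYS_ENABLED(CONFIG_...)"; here a list macro and two
 * invented subsystems ("foo", "bar") stand in for that.
 */
#include <stdio.h>

#define FOR_EACH_SUBSYS	\
	SUBSYS(foo)	\
	SUBSYS(bar)

/* Pass 1 (cf. enum cgroup_subsys_id): every subsystem gets a fixed ID. */
#define SUBSYS(_x) _x ## _subsys_id,
enum cgroup_subsys_id {
	FOR_EACH_SUBSYS
	CGROUP_SUBSYS_COUNT,
};
#undef SUBSYS

struct cgroup_subsys { const char *name; };
static struct cgroup_subsys foo_subsys = { "foo" };
static struct cgroup_subsys bar_subsys = { "bar" };

/*
 * Pass 2 (cf. the subsys[] array in kernel/cgroup.c): entries are placed
 * at their ID.  In the kernel, slots for modular subsystems stay NULL
 * until cgroup_load_subsys() fills them in at module load.
 */
#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
	FOR_EACH_SUBSYS
};
#undef SUBSYS

int main(void)
{
	int i;

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
		printf("%d: %s\n", i, subsys[i] ? subsys[i]->name : "(modular)");
	return 0;
}
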
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index bef2cf00b3be..30aa0dc60d75 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -5,6 +5,7 @@
5#include <linux/mempolicy.h> 5#include <linux/mempolicy.h>
6#include <linux/pagemap.h> 6#include <linux/pagemap.h>
7#include <linux/percpu_counter.h> 7#include <linux/percpu_counter.h>
8#include <linux/xattr.h>
8 9
9/* inode in-kernel data */ 10/* inode in-kernel data */
10 11
@@ -18,7 +19,7 @@ struct shmem_inode_info {
18 }; 19 };
19 struct shared_policy policy; /* NUMA memory alloc policy */ 20 struct shared_policy policy; /* NUMA memory alloc policy */
20 struct list_head swaplist; /* chain of maybes on swap */ 21 struct list_head swaplist; /* chain of maybes on swap */
21 struct list_head xattr_list; /* list of shmem_xattr */ 22 struct simple_xattrs xattrs; /* list of xattrs */
22 struct inode vfs_inode; 23 struct inode vfs_inode;
23}; 24};
24 25
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index e5d122031542..2ace7a60316d 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -59,7 +59,9 @@
59 59
60#ifdef __KERNEL__ 60#ifdef __KERNEL__
61 61
62#include <linux/slab.h>
62#include <linux/types.h> 63#include <linux/types.h>
64#include <linux/spinlock.h>
63 65
64struct inode; 66struct inode;
65struct dentry; 67struct dentry;
@@ -96,6 +98,52 @@ ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name,
96 char **xattr_value, size_t size, gfp_t flags); 98 char **xattr_value, size_t size, gfp_t flags);
97int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name, 99int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name,
98 const char *value, size_t size, gfp_t flags); 100 const char *value, size_t size, gfp_t flags);
101
102struct simple_xattrs {
103 struct list_head head;
104 spinlock_t lock;
105};
106
107struct simple_xattr {
108 struct list_head list;
109 char *name;
110 size_t size;
111 char value[0];
112};
113
114/*
115 * initialize the simple_xattrs structure
116 */
117static inline void simple_xattrs_init(struct simple_xattrs *xattrs)
118{
119 INIT_LIST_HEAD(&xattrs->head);
120 spin_lock_init(&xattrs->lock);
121}
122
123/*
124 * free all the xattrs
125 */
126static inline void simple_xattrs_free(struct simple_xattrs *xattrs)
127{
128 struct simple_xattr *xattr, *node;
129
130 list_for_each_entry_safe(xattr, node, &xattrs->head, list) {
131 kfree(xattr->name);
132 kfree(xattr);
133 }
134}
135
136struct simple_xattr *simple_xattr_alloc(const void *value, size_t size);
137int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
138 void *buffer, size_t size);
139int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
140 const void *value, size_t size, int flags);
141int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name);
142ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer,
143 size_t size);
144void simple_xattr_list_add(struct simple_xattrs *xattrs,
145 struct simple_xattr *new_xattr);
146
99#endif /* __KERNEL__ */ 147#endif /* __KERNEL__ */
100 148
101#endif /* _LINUX_XATTR_H */ 149#endif /* _LINUX_XATTR_H */
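
Editor's note: for completeness, a sketch of the intended lifecycle of the
structures declared above, again under invented names (my_inode_info, the my_*
helpers): simple_xattrs_init() before first use, simple_xattr_alloc() plus a
caller-supplied name for an entry built outside the normal set path
(simple_xattr_list_add() then links it under the lock), and simple_xattrs_free()
when the owner goes away. This is roughly the pattern the cgroup conversion in
kernel/cgroup.c below follows with init_cgroup_housekeeping() and cgroup_diput().

/*
 * Lifecycle sketch only; "my_inode_info" and the my_* helpers are
 * invented names.
 */
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/xattr.h>

struct my_inode_info {
	struct simple_xattrs	xattrs;
};

static struct my_inode_info *my_info_alloc(void)
{
	struct my_inode_info *info = kzalloc(sizeof(*info), GFP_KERNEL);

	if (info)
		simple_xattrs_init(&info->xattrs);	/* empty list + lock */
	return info;
}

/* Attach an attribute that was built outside the normal set path. */
static int my_info_add_xattr(struct my_inode_info *info, const char *name,
			     const void *value, size_t size)
{
	struct simple_xattr *xattr = simple_xattr_alloc(value, size);

	if (!xattr)
		return -ENOMEM;
	xattr->name = kstrdup(name, GFP_KERNEL);	/* name is the caller's job */
	if (!xattr->name) {
		kfree(xattr);
		return -ENOMEM;
	}
	simple_xattr_list_add(&info->xattrs, xattr);
	return 0;
}

static void my_info_free(struct my_inode_info *info)
{
	simple_xattrs_free(&info->xattrs);	/* frees every name and entry */
	kfree(info);
}
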
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index a4dc5b027bd9..b6a6eeb3905f 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -17,14 +17,16 @@
17#include <linux/hardirq.h> 17#include <linux/hardirq.h>
18#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
19 19
20#ifdef CONFIG_CGROUPS 20#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
21struct cgroup_cls_state 21struct cgroup_cls_state
22{ 22{
23 struct cgroup_subsys_state css; 23 struct cgroup_subsys_state css;
24 u32 classid; 24 u32 classid;
25}; 25};
26 26
27#ifdef CONFIG_NET_CLS_CGROUP 27extern void sock_update_classid(struct sock *sk);
28
29#if IS_BUILTIN(CONFIG_NET_CLS_CGROUP)
28static inline u32 task_cls_classid(struct task_struct *p) 30static inline u32 task_cls_classid(struct task_struct *p)
29{ 31{
30 int classid; 32 int classid;
@@ -39,32 +41,33 @@ static inline u32 task_cls_classid(struct task_struct *p)
39 41
40 return classid; 42 return classid;
41} 43}
42#else 44#elif IS_MODULE(CONFIG_NET_CLS_CGROUP)
43extern int net_cls_subsys_id;
44
45static inline u32 task_cls_classid(struct task_struct *p) 45static inline u32 task_cls_classid(struct task_struct *p)
46{ 46{
47 int id; 47 struct cgroup_subsys_state *css;
48 u32 classid = 0; 48 u32 classid = 0;
49 49
50 if (in_interrupt()) 50 if (in_interrupt())
51 return 0; 51 return 0;
52 52
53 rcu_read_lock(); 53 rcu_read_lock();
54 id = rcu_dereference_index_check(net_cls_subsys_id, 54 css = task_subsys_state(p, net_cls_subsys_id);
55 rcu_read_lock_held()); 55 if (css)
56 if (id >= 0) 56 classid = container_of(css,
57 classid = container_of(task_subsys_state(p, id),
58 struct cgroup_cls_state, css)->classid; 57 struct cgroup_cls_state, css)->classid;
59 rcu_read_unlock(); 58 rcu_read_unlock();
60 59
61 return classid; 60 return classid;
62} 61}
63#endif 62#endif
64#else 63#else /* !CGROUP_NET_CLS_CGROUP */
64static inline void sock_update_classid(struct sock *sk)
65{
66}
67
65static inline u32 task_cls_classid(struct task_struct *p) 68static inline u32 task_cls_classid(struct task_struct *p)
66{ 69{
67 return 0; 70 return 0;
68} 71}
69#endif 72#endif /* CGROUP_NET_CLS_CGROUP */
70#endif /* _NET_CLS_CGROUP_H */ 73#endif /* _NET_CLS_CGROUP_H */
diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
index 2719dec6b5a8..2760f4f4ae9b 100644
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -18,23 +18,18 @@
18#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
19 19
20 20
21#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
21struct netprio_map { 22struct netprio_map {
22 struct rcu_head rcu; 23 struct rcu_head rcu;
23 u32 priomap_len; 24 u32 priomap_len;
24 u32 priomap[]; 25 u32 priomap[];
25}; 26};
26 27
27#ifdef CONFIG_CGROUPS
28
29struct cgroup_netprio_state { 28struct cgroup_netprio_state {
30 struct cgroup_subsys_state css; 29 struct cgroup_subsys_state css;
31 u32 prioidx; 30 u32 prioidx;
32}; 31};
33 32
34#ifndef CONFIG_NETPRIO_CGROUP
35extern int net_prio_subsys_id;
36#endif
37
38extern void sock_update_netprioidx(struct sock *sk, struct task_struct *task); 33extern void sock_update_netprioidx(struct sock *sk, struct task_struct *task);
39 34
40#if IS_BUILTIN(CONFIG_NETPRIO_CGROUP) 35#if IS_BUILTIN(CONFIG_NETPRIO_CGROUP)
@@ -56,33 +51,28 @@ static inline u32 task_netprioidx(struct task_struct *p)
56 51
57static inline u32 task_netprioidx(struct task_struct *p) 52static inline u32 task_netprioidx(struct task_struct *p)
58{ 53{
59 struct cgroup_netprio_state *state; 54 struct cgroup_subsys_state *css;
60 int subsys_id;
61 u32 idx = 0; 55 u32 idx = 0;
62 56
63 rcu_read_lock(); 57 rcu_read_lock();
64 subsys_id = rcu_dereference_index_check(net_prio_subsys_id, 58 css = task_subsys_state(p, net_prio_subsys_id);
65 rcu_read_lock_held()); 59 if (css)
66 if (subsys_id >= 0) { 60 idx = container_of(css,
67 state = container_of(task_subsys_state(p, subsys_id), 61 struct cgroup_netprio_state, css)->prioidx;
68 struct cgroup_netprio_state, css);
69 idx = state->prioidx;
70 }
71 rcu_read_unlock(); 62 rcu_read_unlock();
72 return idx; 63 return idx;
73} 64}
65#endif
74 66
75#else 67#else /* !CONFIG_NETPRIO_CGROUP */
76 68
77static inline u32 task_netprioidx(struct task_struct *p) 69static inline u32 task_netprioidx(struct task_struct *p)
78{ 70{
79 return 0; 71 return 0;
80} 72}
81 73
82#endif /* CONFIG_NETPRIO_CGROUP */
83
84#else
85#define sock_update_netprioidx(sk, task) 74#define sock_update_netprioidx(sk, task)
86#endif 75
76#endif /* CONFIG_NETPRIO_CGROUP */
87 77
88#endif /* _NET_CLS_CGROUP_H */ 78#endif /* _NET_CLS_CGROUP_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index adb7da20b5a1..6e6ec18fb6d0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1486,14 +1486,6 @@ extern void *sock_kmalloc(struct sock *sk, int size,
1486extern void sock_kfree_s(struct sock *sk, void *mem, int size); 1486extern void sock_kfree_s(struct sock *sk, void *mem, int size);
1487extern void sk_send_sigurg(struct sock *sk); 1487extern void sk_send_sigurg(struct sock *sk);
1488 1488
1489#ifdef CONFIG_CGROUPS
1490extern void sock_update_classid(struct sock *sk);
1491#else
1492static inline void sock_update_classid(struct sock *sk)
1493{
1494}
1495#endif
1496
1497/* 1489/*
1498 * Functions to fill in entries in struct proto_ops when a protocol 1490 * Functions to fill in entries in struct proto_ops when a protocol
1499 * does not implement a particular function. 1491 * does not implement a particular function.
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 79818507e444..485cc1487ea2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -88,11 +88,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);
88 88
89/* 89/*
90 * Generate an array of cgroup subsystem pointers. At boot time, this is 90 * Generate an array of cgroup subsystem pointers. At boot time, this is
91 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are 91 * populated with the built in subsystems, and modular subsystems are
92 * registered after that. The mutable section of this array is protected by 92 * registered after that. The mutable section of this array is protected by
93 * cgroup_mutex. 93 * cgroup_mutex.
94 */ 94 */
95#define SUBSYS(_x) &_x ## _subsys, 95#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
96#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
96static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { 97static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
97#include <linux/cgroup_subsys.h> 98#include <linux/cgroup_subsys.h>
98}; 99};
@@ -111,13 +112,13 @@ struct cgroupfs_root {
111 * The bitmask of subsystems intended to be attached to this 112 * The bitmask of subsystems intended to be attached to this
112 * hierarchy 113 * hierarchy
113 */ 114 */
114 unsigned long subsys_bits; 115 unsigned long subsys_mask;
115 116
116 /* Unique id for this hierarchy. */ 117 /* Unique id for this hierarchy. */
117 int hierarchy_id; 118 int hierarchy_id;
118 119
119 /* The bitmask of subsystems currently attached to this hierarchy */ 120 /* The bitmask of subsystems currently attached to this hierarchy */
120 unsigned long actual_subsys_bits; 121 unsigned long actual_subsys_mask;
121 122
122 /* A list running through the attached subsystems */ 123 /* A list running through the attached subsystems */
123 struct list_head subsys_list; 124 struct list_head subsys_list;
@@ -276,7 +277,8 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
276 277
277/* bits in struct cgroupfs_root flags field */ 278/* bits in struct cgroupfs_root flags field */
278enum { 279enum {
279 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 280 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
281 ROOT_XATTR, /* supports extended attributes */
280}; 282};
281 283
282static int cgroup_is_releasable(const struct cgroup *cgrp) 284static int cgroup_is_releasable(const struct cgroup *cgrp)
@@ -556,7 +558,7 @@ static struct css_set *find_existing_css_set(
556 * won't change, so no need for locking. 558 * won't change, so no need for locking.
557 */ 559 */
558 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 560 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
559 if (root->subsys_bits & (1UL << i)) { 561 if (root->subsys_mask & (1UL << i)) {
560 /* Subsystem is in this hierarchy. So we want 562 /* Subsystem is in this hierarchy. So we want
561 * the subsystem state from the new 563 * the subsystem state from the new
562 * cgroup */ 564 * cgroup */
@@ -824,7 +826,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
824static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 826static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
825static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); 827static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
826static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 828static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
827static int cgroup_populate_dir(struct cgroup *cgrp); 829static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
830 unsigned long subsys_mask);
828static const struct inode_operations cgroup_dir_inode_operations; 831static const struct inode_operations cgroup_dir_inode_operations;
829static const struct file_operations proc_cgroupstats_operations; 832static const struct file_operations proc_cgroupstats_operations;
830 833
@@ -912,15 +915,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
912 */ 915 */
913 BUG_ON(!list_empty(&cgrp->pidlists)); 916 BUG_ON(!list_empty(&cgrp->pidlists));
914 917
918 simple_xattrs_free(&cgrp->xattrs);
919
915 kfree_rcu(cgrp, rcu_head); 920 kfree_rcu(cgrp, rcu_head);
916 } else { 921 } else {
917 struct cfent *cfe = __d_cfe(dentry); 922 struct cfent *cfe = __d_cfe(dentry);
918 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 923 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
924 struct cftype *cft = cfe->type;
919 925
920 WARN_ONCE(!list_empty(&cfe->node) && 926 WARN_ONCE(!list_empty(&cfe->node) &&
921 cgrp != &cgrp->root->top_cgroup, 927 cgrp != &cgrp->root->top_cgroup,
922 "cfe still linked for %s\n", cfe->type->name); 928 "cfe still linked for %s\n", cfe->type->name);
923 kfree(cfe); 929 kfree(cfe);
930 simple_xattrs_free(&cft->xattrs);
924 } 931 }
925 iput(inode); 932 iput(inode);
926} 933}
@@ -963,12 +970,29 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
963 return -ENOENT; 970 return -ENOENT;
964} 971}
965 972
966static void cgroup_clear_directory(struct dentry *dir) 973/**
974 * cgroup_clear_directory - selective removal of base and subsystem files
975 * @dir: directory containing the files
976 * @base_files: true if the base files should be removed
977 * @subsys_mask: mask of the subsystem ids whose files should be removed
978 */
979static void cgroup_clear_directory(struct dentry *dir, bool base_files,
980 unsigned long subsys_mask)
967{ 981{
968 struct cgroup *cgrp = __d_cgrp(dir); 982 struct cgroup *cgrp = __d_cgrp(dir);
983 struct cgroup_subsys *ss;
969 984
970 while (!list_empty(&cgrp->files)) 985 for_each_subsys(cgrp->root, ss) {
971 cgroup_rm_file(cgrp, NULL); 986 struct cftype_set *set;
987 if (!test_bit(ss->subsys_id, &subsys_mask))
988 continue;
989 list_for_each_entry(set, &ss->cftsets, node)
990 cgroup_rm_file(cgrp, set->cfts);
991 }
992 if (base_files) {
993 while (!list_empty(&cgrp->files))
994 cgroup_rm_file(cgrp, NULL);
995 }
972} 996}
973 997
974/* 998/*
@@ -977,8 +1001,9 @@ static void cgroup_clear_directory(struct dentry *dir)
977static void cgroup_d_remove_dir(struct dentry *dentry) 1001static void cgroup_d_remove_dir(struct dentry *dentry)
978{ 1002{
979 struct dentry *parent; 1003 struct dentry *parent;
1004 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
980 1005
981 cgroup_clear_directory(dentry); 1006 cgroup_clear_directory(dentry, true, root->subsys_mask);
982 1007
983 parent = dentry->d_parent; 1008 parent = dentry->d_parent;
984 spin_lock(&parent->d_lock); 1009 spin_lock(&parent->d_lock);
@@ -1022,22 +1047,22 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
1022 * returns an error, no reference counts are touched. 1047 * returns an error, no reference counts are touched.
1023 */ 1048 */
1024static int rebind_subsystems(struct cgroupfs_root *root, 1049static int rebind_subsystems(struct cgroupfs_root *root,
1025 unsigned long final_bits) 1050 unsigned long final_subsys_mask)
1026{ 1051{
1027 unsigned long added_bits, removed_bits; 1052 unsigned long added_mask, removed_mask;
1028 struct cgroup *cgrp = &root->top_cgroup; 1053 struct cgroup *cgrp = &root->top_cgroup;
1029 int i; 1054 int i;
1030 1055
1031 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1056 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1032 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 1057 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
1033 1058
1034 removed_bits = root->actual_subsys_bits & ~final_bits; 1059 removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
1035 added_bits = final_bits & ~root->actual_subsys_bits; 1060 added_mask = final_subsys_mask & ~root->actual_subsys_mask;
1036 /* Check that any added subsystems are currently free */ 1061 /* Check that any added subsystems are currently free */
1037 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1062 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1038 unsigned long bit = 1UL << i; 1063 unsigned long bit = 1UL << i;
1039 struct cgroup_subsys *ss = subsys[i]; 1064 struct cgroup_subsys *ss = subsys[i];
1040 if (!(bit & added_bits)) 1065 if (!(bit & added_mask))
1041 continue; 1066 continue;
1042 /* 1067 /*
1043 * Nobody should tell us to do a subsys that doesn't exist: 1068 * Nobody should tell us to do a subsys that doesn't exist:
@@ -1062,7 +1087,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1062 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1087 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1063 struct cgroup_subsys *ss = subsys[i]; 1088 struct cgroup_subsys *ss = subsys[i];
1064 unsigned long bit = 1UL << i; 1089 unsigned long bit = 1UL << i;
1065 if (bit & added_bits) { 1090 if (bit & added_mask) {
1066 /* We're binding this subsystem to this hierarchy */ 1091 /* We're binding this subsystem to this hierarchy */
1067 BUG_ON(ss == NULL); 1092 BUG_ON(ss == NULL);
1068 BUG_ON(cgrp->subsys[i]); 1093 BUG_ON(cgrp->subsys[i]);
@@ -1075,7 +1100,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1075 if (ss->bind) 1100 if (ss->bind)
1076 ss->bind(cgrp); 1101 ss->bind(cgrp);
1077 /* refcount was already taken, and we're keeping it */ 1102 /* refcount was already taken, and we're keeping it */
1078 } else if (bit & removed_bits) { 1103 } else if (bit & removed_mask) {
1079 /* We're removing this subsystem */ 1104 /* We're removing this subsystem */
1080 BUG_ON(ss == NULL); 1105 BUG_ON(ss == NULL);
1081 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 1106 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
@@ -1088,7 +1113,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1088 list_move(&ss->sibling, &rootnode.subsys_list); 1113 list_move(&ss->sibling, &rootnode.subsys_list);
1089 /* subsystem is now free - drop reference on module */ 1114 /* subsystem is now free - drop reference on module */
1090 module_put(ss->module); 1115 module_put(ss->module);
1091 } else if (bit & final_bits) { 1116 } else if (bit & final_subsys_mask) {
1092 /* Subsystem state should already exist */ 1117 /* Subsystem state should already exist */
1093 BUG_ON(ss == NULL); 1118 BUG_ON(ss == NULL);
1094 BUG_ON(!cgrp->subsys[i]); 1119 BUG_ON(!cgrp->subsys[i]);
@@ -1105,7 +1130,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1105 BUG_ON(cgrp->subsys[i]); 1130 BUG_ON(cgrp->subsys[i]);
1106 } 1131 }
1107 } 1132 }
1108 root->subsys_bits = root->actual_subsys_bits = final_bits; 1133 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
1109 synchronize_rcu(); 1134 synchronize_rcu();
1110 1135
1111 return 0; 1136 return 0;
@@ -1121,6 +1146,8 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1121 seq_printf(seq, ",%s", ss->name); 1146 seq_printf(seq, ",%s", ss->name);
1122 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1147 if (test_bit(ROOT_NOPREFIX, &root->flags))
1123 seq_puts(seq, ",noprefix"); 1148 seq_puts(seq, ",noprefix");
1149 if (test_bit(ROOT_XATTR, &root->flags))
1150 seq_puts(seq, ",xattr");
1124 if (strlen(root->release_agent_path)) 1151 if (strlen(root->release_agent_path))
1125 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1152 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1126 if (clone_children(&root->top_cgroup)) 1153 if (clone_children(&root->top_cgroup))
@@ -1132,7 +1159,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1132} 1159}
1133 1160
1134struct cgroup_sb_opts { 1161struct cgroup_sb_opts {
1135 unsigned long subsys_bits; 1162 unsigned long subsys_mask;
1136 unsigned long flags; 1163 unsigned long flags;
1137 char *release_agent; 1164 char *release_agent;
1138 bool clone_children; 1165 bool clone_children;
@@ -1189,6 +1216,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1189 opts->clone_children = true; 1216 opts->clone_children = true;
1190 continue; 1217 continue;
1191 } 1218 }
1219 if (!strcmp(token, "xattr")) {
1220 set_bit(ROOT_XATTR, &opts->flags);
1221 continue;
1222 }
1192 if (!strncmp(token, "release_agent=", 14)) { 1223 if (!strncmp(token, "release_agent=", 14)) {
1193 /* Specifying two release agents is forbidden */ 1224 /* Specifying two release agents is forbidden */
1194 if (opts->release_agent) 1225 if (opts->release_agent)
@@ -1237,7 +1268,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1237 /* Mutually exclusive option 'all' + subsystem name */ 1268 /* Mutually exclusive option 'all' + subsystem name */
1238 if (all_ss) 1269 if (all_ss)
1239 return -EINVAL; 1270 return -EINVAL;
1240 set_bit(i, &opts->subsys_bits); 1271 set_bit(i, &opts->subsys_mask);
1241 one_ss = true; 1272 one_ss = true;
1242 1273
1243 break; 1274 break;
@@ -1258,7 +1289,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1258 continue; 1289 continue;
1259 if (ss->disabled) 1290 if (ss->disabled)
1260 continue; 1291 continue;
1261 set_bit(i, &opts->subsys_bits); 1292 set_bit(i, &opts->subsys_mask);
1262 } 1293 }
1263 } 1294 }
1264 1295
@@ -1270,19 +1301,19 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1270 * the cpuset subsystem. 1301 * the cpuset subsystem.
1271 */ 1302 */
1272 if (test_bit(ROOT_NOPREFIX, &opts->flags) && 1303 if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
1273 (opts->subsys_bits & mask)) 1304 (opts->subsys_mask & mask))
1274 return -EINVAL; 1305 return -EINVAL;
1275 1306
1276 1307
1277 /* Can't specify "none" and some subsystems */ 1308 /* Can't specify "none" and some subsystems */
1278 if (opts->subsys_bits && opts->none) 1309 if (opts->subsys_mask && opts->none)
1279 return -EINVAL; 1310 return -EINVAL;
1280 1311
1281 /* 1312 /*
1282 * We either have to specify by name or by subsystems. (So all 1313 * We either have to specify by name or by subsystems. (So all
1283 * empty hierarchies must have a name). 1314 * empty hierarchies must have a name).
1284 */ 1315 */
1285 if (!opts->subsys_bits && !opts->name) 1316 if (!opts->subsys_mask && !opts->name)
1286 return -EINVAL; 1317 return -EINVAL;
1287 1318
1288 /* 1319 /*
@@ -1291,10 +1322,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1291 * take duplicate reference counts on a subsystem that's already used, 1322 * take duplicate reference counts on a subsystem that's already used,
1292 * but rebind_subsystems handles this case. 1323 * but rebind_subsystems handles this case.
1293 */ 1324 */
1294 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { 1325 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1295 unsigned long bit = 1UL << i; 1326 unsigned long bit = 1UL << i;
1296 1327
1297 if (!(bit & opts->subsys_bits)) 1328 if (!(bit & opts->subsys_mask))
1298 continue; 1329 continue;
1299 if (!try_module_get(subsys[i]->module)) { 1330 if (!try_module_get(subsys[i]->module)) {
1300 module_pin_failed = true; 1331 module_pin_failed = true;
@@ -1307,11 +1338,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1307 * raced with a module_delete call, and to the user this is 1338 * raced with a module_delete call, and to the user this is
1308 * essentially a "subsystem doesn't exist" case. 1339 * essentially a "subsystem doesn't exist" case.
1309 */ 1340 */
1310 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { 1341 for (i--; i >= 0; i--) {
1311 /* drop refcounts only on the ones we took */ 1342 /* drop refcounts only on the ones we took */
1312 unsigned long bit = 1UL << i; 1343 unsigned long bit = 1UL << i;
1313 1344
1314 if (!(bit & opts->subsys_bits)) 1345 if (!(bit & opts->subsys_mask))
1315 continue; 1346 continue;
1316 module_put(subsys[i]->module); 1347 module_put(subsys[i]->module);
1317 } 1348 }
@@ -1321,13 +1352,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1321 return 0; 1352 return 0;
1322} 1353}
1323 1354
1324static void drop_parsed_module_refcounts(unsigned long subsys_bits) 1355static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1325{ 1356{
1326 int i; 1357 int i;
1327 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { 1358 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1328 unsigned long bit = 1UL << i; 1359 unsigned long bit = 1UL << i;
1329 1360
1330 if (!(bit & subsys_bits)) 1361 if (!(bit & subsys_mask))
1331 continue; 1362 continue;
1332 module_put(subsys[i]->module); 1363 module_put(subsys[i]->module);
1333 } 1364 }
@@ -1339,6 +1370,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1339 struct cgroupfs_root *root = sb->s_fs_info; 1370 struct cgroupfs_root *root = sb->s_fs_info;
1340 struct cgroup *cgrp = &root->top_cgroup; 1371 struct cgroup *cgrp = &root->top_cgroup;
1341 struct cgroup_sb_opts opts; 1372 struct cgroup_sb_opts opts;
1373 unsigned long added_mask, removed_mask;
1342 1374
1343 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1375 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1344 mutex_lock(&cgroup_mutex); 1376 mutex_lock(&cgroup_mutex);
@@ -1350,27 +1382,31 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1350 goto out_unlock; 1382 goto out_unlock;
1351 1383
1352 /* See feature-removal-schedule.txt */ 1384 /* See feature-removal-schedule.txt */
1353 if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) 1385 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
1354 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1386 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1355 task_tgid_nr(current), current->comm); 1387 task_tgid_nr(current), current->comm);
1356 1388
1389 added_mask = opts.subsys_mask & ~root->subsys_mask;
1390 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1391
1357 /* Don't allow flags or name to change at remount */ 1392 /* Don't allow flags or name to change at remount */
1358 if (opts.flags != root->flags || 1393 if (opts.flags != root->flags ||
1359 (opts.name && strcmp(opts.name, root->name))) { 1394 (opts.name && strcmp(opts.name, root->name))) {
1360 ret = -EINVAL; 1395 ret = -EINVAL;
1361 drop_parsed_module_refcounts(opts.subsys_bits); 1396 drop_parsed_module_refcounts(opts.subsys_mask);
1362 goto out_unlock; 1397 goto out_unlock;
1363 } 1398 }
1364 1399
1365 ret = rebind_subsystems(root, opts.subsys_bits); 1400 ret = rebind_subsystems(root, opts.subsys_mask);
1366 if (ret) { 1401 if (ret) {
1367 drop_parsed_module_refcounts(opts.subsys_bits); 1402 drop_parsed_module_refcounts(opts.subsys_mask);
1368 goto out_unlock; 1403 goto out_unlock;
1369 } 1404 }
1370 1405
1371 /* clear out any existing files and repopulate subsystem files */ 1406 /* clear out any existing files and repopulate subsystem files */
1372 cgroup_clear_directory(cgrp->dentry); 1407 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1373 cgroup_populate_dir(cgrp); 1408 /* re-populate subsystem files */
1409 cgroup_populate_dir(cgrp, false, added_mask);
1374 1410
1375 if (opts.release_agent) 1411 if (opts.release_agent)
1376 strcpy(root->release_agent_path, opts.release_agent); 1412 strcpy(root->release_agent_path, opts.release_agent);
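
The remount path now derives two deltas, added_mask and removed_mask, by comparing the requested subsystem mask against the root's current one, so only the affected control files are cleared or created. A minimal stand-alone sketch of that mask arithmetic (the example bit values are made up; cgroup_remount() uses root->subsys_mask and the freshly parsed opts.subsys_mask):

    /* Sketch of the added/removed mask arithmetic used above. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long cur = 0x5;               /* subsystems 0 and 2 bound now */
            unsigned long req = 0x6;               /* remount requests 1 and 2     */
            unsigned long added   = req & ~cur;    /* 0x2: populate files for subsys 1 */
            unsigned long removed = cur & ~req;    /* 0x1: clear files for subsys 0    */

            printf("added=%#lx removed=%#lx\n", added, removed);
            return 0;
    }
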
@@ -1401,6 +1437,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1401 mutex_init(&cgrp->pidlist_mutex); 1437 mutex_init(&cgrp->pidlist_mutex);
1402 INIT_LIST_HEAD(&cgrp->event_list); 1438 INIT_LIST_HEAD(&cgrp->event_list);
1403 spin_lock_init(&cgrp->event_list_lock); 1439 spin_lock_init(&cgrp->event_list_lock);
1440 simple_xattrs_init(&cgrp->xattrs);
1404} 1441}
1405 1442
1406static void init_cgroup_root(struct cgroupfs_root *root) 1443static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1455,8 +1492,8 @@ static int cgroup_test_super(struct super_block *sb, void *data)
1455 * If we asked for subsystems (or explicitly for no 1492 * If we asked for subsystems (or explicitly for no
1456 * subsystems) then they must match 1493 * subsystems) then they must match
1457 */ 1494 */
1458 if ((opts->subsys_bits || opts->none) 1495 if ((opts->subsys_mask || opts->none)
1459 && (opts->subsys_bits != root->subsys_bits)) 1496 && (opts->subsys_mask != root->subsys_mask))
1460 return 0; 1497 return 0;
1461 1498
1462 return 1; 1499 return 1;
@@ -1466,7 +1503,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1466{ 1503{
1467 struct cgroupfs_root *root; 1504 struct cgroupfs_root *root;
1468 1505
1469 if (!opts->subsys_bits && !opts->none) 1506 if (!opts->subsys_mask && !opts->none)
1470 return NULL; 1507 return NULL;
1471 1508
1472 root = kzalloc(sizeof(*root), GFP_KERNEL); 1509 root = kzalloc(sizeof(*root), GFP_KERNEL);
@@ -1479,7 +1516,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1479 } 1516 }
1480 init_cgroup_root(root); 1517 init_cgroup_root(root);
1481 1518
1482 root->subsys_bits = opts->subsys_bits; 1519 root->subsys_mask = opts->subsys_mask;
1483 root->flags = opts->flags; 1520 root->flags = opts->flags;
1484 if (opts->release_agent) 1521 if (opts->release_agent)
1485 strcpy(root->release_agent_path, opts->release_agent); 1522 strcpy(root->release_agent_path, opts->release_agent);
@@ -1511,7 +1548,7 @@ static int cgroup_set_super(struct super_block *sb, void *data)
1511 if (!opts->new_root) 1548 if (!opts->new_root)
1512 return -EINVAL; 1549 return -EINVAL;
1513 1550
1514 BUG_ON(!opts->subsys_bits && !opts->none); 1551 BUG_ON(!opts->subsys_mask && !opts->none);
1515 1552
1516 ret = set_anon_super(sb, NULL); 1553 ret = set_anon_super(sb, NULL);
1517 if (ret) 1554 if (ret)
@@ -1629,7 +1666,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1629 if (ret) 1666 if (ret)
1630 goto unlock_drop; 1667 goto unlock_drop;
1631 1668
1632 ret = rebind_subsystems(root, root->subsys_bits); 1669 ret = rebind_subsystems(root, root->subsys_mask);
1633 if (ret == -EBUSY) { 1670 if (ret == -EBUSY) {
1634 free_cg_links(&tmp_cg_links); 1671 free_cg_links(&tmp_cg_links);
1635 goto unlock_drop; 1672 goto unlock_drop;
@@ -1669,7 +1706,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1669 BUG_ON(root->number_of_cgroups != 1); 1706 BUG_ON(root->number_of_cgroups != 1);
1670 1707
1671 cred = override_creds(&init_cred); 1708 cred = override_creds(&init_cred);
1672 cgroup_populate_dir(root_cgrp); 1709 cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
1673 revert_creds(cred); 1710 revert_creds(cred);
1674 mutex_unlock(&cgroup_root_mutex); 1711 mutex_unlock(&cgroup_root_mutex);
1675 mutex_unlock(&cgroup_mutex); 1712 mutex_unlock(&cgroup_mutex);
@@ -1681,7 +1718,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1681 */ 1718 */
1682 cgroup_drop_root(opts.new_root); 1719 cgroup_drop_root(opts.new_root);
1683 /* no subsys rebinding, so refcounts don't change */ 1720 /* no subsys rebinding, so refcounts don't change */
1684 drop_parsed_module_refcounts(opts.subsys_bits); 1721 drop_parsed_module_refcounts(opts.subsys_mask);
1685 } 1722 }
1686 1723
1687 kfree(opts.release_agent); 1724 kfree(opts.release_agent);
@@ -1695,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1695 drop_new_super: 1732 drop_new_super:
1696 deactivate_locked_super(sb); 1733 deactivate_locked_super(sb);
1697 drop_modules: 1734 drop_modules:
1698 drop_parsed_module_refcounts(opts.subsys_bits); 1735 drop_parsed_module_refcounts(opts.subsys_mask);
1699 out_err: 1736 out_err:
1700 kfree(opts.release_agent); 1737 kfree(opts.release_agent);
1701 kfree(opts.name); 1738 kfree(opts.name);
@@ -1745,6 +1782,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
1745 mutex_unlock(&cgroup_root_mutex); 1782 mutex_unlock(&cgroup_root_mutex);
1746 mutex_unlock(&cgroup_mutex); 1783 mutex_unlock(&cgroup_mutex);
1747 1784
1785 simple_xattrs_free(&cgrp->xattrs);
1786
1748 kill_litter_super(sb); 1787 kill_litter_super(sb);
1749 cgroup_drop_root(root); 1788 cgroup_drop_root(root);
1750} 1789}
@@ -2551,6 +2590,64 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2551 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2590 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2552} 2591}
2553 2592
2593static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2594{
2595 if (S_ISDIR(dentry->d_inode->i_mode))
2596 return &__d_cgrp(dentry)->xattrs;
2597 else
2598 return &__d_cft(dentry)->xattrs;
2599}
2600
2601static inline int xattr_enabled(struct dentry *dentry)
2602{
2603 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2604 return test_bit(ROOT_XATTR, &root->flags);
2605}
2606
2607static bool is_valid_xattr(const char *name)
2608{
2609 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2610 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2611 return true;
2612 return false;
2613}
2614
2615static int cgroup_setxattr(struct dentry *dentry, const char *name,
2616 const void *val, size_t size, int flags)
2617{
2618 if (!xattr_enabled(dentry))
2619 return -EOPNOTSUPP;
2620 if (!is_valid_xattr(name))
2621 return -EINVAL;
2622 return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2623}
2624
2625static int cgroup_removexattr(struct dentry *dentry, const char *name)
2626{
2627 if (!xattr_enabled(dentry))
2628 return -EOPNOTSUPP;
2629 if (!is_valid_xattr(name))
2630 return -EINVAL;
2631 return simple_xattr_remove(__d_xattrs(dentry), name);
2632}
2633
2634static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2635 void *buf, size_t size)
2636{
2637 if (!xattr_enabled(dentry))
2638 return -EOPNOTSUPP;
2639 if (!is_valid_xattr(name))
2640 return -EINVAL;
2641 return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2642}
2643
2644static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2645{
2646 if (!xattr_enabled(dentry))
2647 return -EOPNOTSUPP;
2648 return simple_xattr_list(__d_xattrs(dentry), buf, size);
2649}
2650
2554static const struct file_operations cgroup_file_operations = { 2651static const struct file_operations cgroup_file_operations = {
2555 .read = cgroup_file_read, 2652 .read = cgroup_file_read,
2556 .write = cgroup_file_write, 2653 .write = cgroup_file_write,
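
With ROOT_XATTR set (every handler above is gated on xattr_enabled()), cgroup files and directories accept extended attributes in the trusted.* and security.* namespaces, kept in kernel memory via the simple_xattr helpers. A rough user-space illustration of what the new handlers enable, assuming a hierarchy mounted with the xattr option introduced by this series; the mount point, cgroup name, and attribute name below are invented, and trusted.* requires CAP_SYS_ADMIN:

    /* User-space sketch: tagging a cgroup directory with a trusted.* attribute.
     * Assumes something like:
     *   mount -t cgroup -o xattr,cpuset none /sys/fs/cgroup/cpuset
     * Without the xattr mount option the calls fail with EOPNOTSUPP. */
    #include <sys/xattr.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            const char *path = "/sys/fs/cgroup/cpuset/mygroup";   /* hypothetical cgroup */
            char buf[64];
            ssize_t len;

            if (setxattr(path, "trusted.owner", "webapp", strlen("webapp"), 0) < 0)
                    perror("setxattr");

            len = getxattr(path, "trusted.owner", buf, sizeof(buf));
            if (len >= 0)
                    printf("trusted.owner=%.*s\n", (int)len, buf);
            return 0;
    }
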
@@ -2559,11 +2656,22 @@ static const struct file_operations cgroup_file_operations = {
2559 .release = cgroup_file_release, 2656 .release = cgroup_file_release,
2560}; 2657};
2561 2658
2659static const struct inode_operations cgroup_file_inode_operations = {
2660 .setxattr = cgroup_setxattr,
2661 .getxattr = cgroup_getxattr,
2662 .listxattr = cgroup_listxattr,
2663 .removexattr = cgroup_removexattr,
2664};
2665
2562static const struct inode_operations cgroup_dir_inode_operations = { 2666static const struct inode_operations cgroup_dir_inode_operations = {
2563 .lookup = cgroup_lookup, 2667 .lookup = cgroup_lookup,
2564 .mkdir = cgroup_mkdir, 2668 .mkdir = cgroup_mkdir,
2565 .rmdir = cgroup_rmdir, 2669 .rmdir = cgroup_rmdir,
2566 .rename = cgroup_rename, 2670 .rename = cgroup_rename,
2671 .setxattr = cgroup_setxattr,
2672 .getxattr = cgroup_getxattr,
2673 .listxattr = cgroup_listxattr,
2674 .removexattr = cgroup_removexattr,
2567}; 2675};
2568 2676
2569static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 2677static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -2611,6 +2719,7 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2611 } else if (S_ISREG(mode)) { 2719 } else if (S_ISREG(mode)) {
2612 inode->i_size = 0; 2720 inode->i_size = 0;
2613 inode->i_fop = &cgroup_file_operations; 2721 inode->i_fop = &cgroup_file_operations;
2722 inode->i_op = &cgroup_file_inode_operations;
2614 } 2723 }
2615 d_instantiate(dentry, inode); 2724 d_instantiate(dentry, inode);
2616 dget(dentry); /* Extra count - pin the dentry in core */ 2725 dget(dentry); /* Extra count - pin the dentry in core */
@@ -2671,7 +2780,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2671} 2780}
2672 2781
2673static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2782static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2674 const struct cftype *cft) 2783 struct cftype *cft)
2675{ 2784{
2676 struct dentry *dir = cgrp->dentry; 2785 struct dentry *dir = cgrp->dentry;
2677 struct cgroup *parent = __d_cgrp(dir); 2786 struct cgroup *parent = __d_cgrp(dir);
@@ -2681,6 +2790,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2681 umode_t mode; 2790 umode_t mode;
2682 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2791 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2683 2792
2793 simple_xattrs_init(&cft->xattrs);
2794
2684 /* does @cft->flags tell us to skip creation on @cgrp? */ 2795 /* does @cft->flags tell us to skip creation on @cgrp? */
2685 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2796 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2686 return 0; 2797 return 0;
@@ -2721,9 +2832,9 @@ out:
2721} 2832}
2722 2833
2723static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2834static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2724 const struct cftype cfts[], bool is_add) 2835 struct cftype cfts[], bool is_add)
2725{ 2836{
2726 const struct cftype *cft; 2837 struct cftype *cft;
2727 int err, ret = 0; 2838 int err, ret = 0;
2728 2839
2729 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2840 for (cft = cfts; cft->name[0] != '\0'; cft++) {
@@ -2757,7 +2868,7 @@ static void cgroup_cfts_prepare(void)
2757} 2868}
2758 2869
2759static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2870static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2760 const struct cftype *cfts, bool is_add) 2871 struct cftype *cfts, bool is_add)
2761 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) 2872 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
2762{ 2873{
2763 LIST_HEAD(pending); 2874 LIST_HEAD(pending);
@@ -2808,7 +2919,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2808 * function currently returns 0 as long as @cfts registration is successful 2919 * function currently returns 0 as long as @cfts registration is successful
2809 * even if some file creation attempts on existing cgroups fail. 2920 * even if some file creation attempts on existing cgroups fail.
2810 */ 2921 */
2811int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) 2922int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2812{ 2923{
2813 struct cftype_set *set; 2924 struct cftype_set *set;
2814 2925
@@ -2838,7 +2949,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2838 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2949 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2839 * registered with @ss. 2950 * registered with @ss.
2840 */ 2951 */
2841int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) 2952int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2842{ 2953{
2843 struct cftype_set *set; 2954 struct cftype_set *set;
2844 2955
@@ -3843,18 +3954,29 @@ static struct cftype files[] = {
3843 { } /* terminate */ 3954 { } /* terminate */
3844}; 3955};
3845 3956
3846static int cgroup_populate_dir(struct cgroup *cgrp) 3957/**
3958 * cgroup_populate_dir - selectively create files in a directory
3959 * @cgrp: target cgroup
3960 * @base_files: true if the base files should be added
3961 * @subsys_mask: mask of the subsystem ids whose files should be added
3962 */
3963static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
3964 unsigned long subsys_mask)
3847{ 3965{
3848 int err; 3966 int err;
3849 struct cgroup_subsys *ss; 3967 struct cgroup_subsys *ss;
3850 3968
3851 err = cgroup_addrm_files(cgrp, NULL, files, true); 3969 if (base_files) {
3852 if (err < 0) 3970 err = cgroup_addrm_files(cgrp, NULL, files, true);
3853 return err; 3971 if (err < 0)
3972 return err;
3973 }
3854 3974
3855 /* process cftsets of each subsystem */ 3975 /* process cftsets of each subsystem */
3856 for_each_subsys(cgrp->root, ss) { 3976 for_each_subsys(cgrp->root, ss) {
3857 struct cftype_set *set; 3977 struct cftype_set *set;
3978 if (!test_bit(ss->subsys_id, &subsys_mask))
3979 continue;
3858 3980
3859 list_for_each_entry(set, &ss->cftsets, node) 3981 list_for_each_entry(set, &ss->cftsets, node)
3860 cgroup_addrm_files(cgrp, ss, set->cfts, true); 3982 cgroup_addrm_files(cgrp, ss, set->cfts, true);
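
cgroup_populate_dir() now takes a subsys_mask and only walks the cftsets of subsystems whose id bit is set, which is what lets remount repopulate just the newly added controllers. A small stand-alone sketch of the same "visit only the selected ids" pattern; the table and names below are invented stand-ins, not the kernel's subsys[] array:

    /* Sketch of selective iteration over a subsystem-id bitmask, mirroring the
     * test_bit(ss->subsys_id, &subsys_mask) check above. */
    #include <stdio.h>

    #define NSUBSYS 4

    static const char *names[NSUBSYS] = { "cpuset", "memory", "net_cls", "blkio" };

    static void populate(unsigned long subsys_mask)
    {
            int i;

            for (i = 0; i < NSUBSYS; i++) {
                    if (!(subsys_mask & (1UL << i)))
                            continue;               /* not selected: skip its files */
                    printf("creating files for %s\n", names[i]);
            }
    }

    int main(void)
    {
            populate((1UL << 1) | (1UL << 3));      /* only "memory" and "blkio" */
            return 0;
    }
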
@@ -3988,7 +4110,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3988 4110
3989 list_add_tail(&cgrp->allcg_node, &root->allcg_list); 4111 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
3990 4112
3991 err = cgroup_populate_dir(cgrp); 4113 err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
3992 /* If err < 0, we have a half-filled directory - oh well ;) */ 4114 /* If err < 0, we have a half-filled directory - oh well ;) */
3993 4115
3994 mutex_unlock(&cgroup_mutex); 4116 mutex_unlock(&cgroup_mutex);
@@ -4321,8 +4443,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4321 * since cgroup_init_subsys will have already taken care of it. 4443 * since cgroup_init_subsys will have already taken care of it.
4322 */ 4444 */
4323 if (ss->module == NULL) { 4445 if (ss->module == NULL) {
4324 /* a few sanity checks */ 4446 /* a sanity check */
4325 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
4326 BUG_ON(subsys[ss->subsys_id] != ss); 4447 BUG_ON(subsys[ss->subsys_id] != ss);
4327 return 0; 4448 return 0;
4328 } 4449 }
@@ -4330,24 +4451,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4330 /* init base cftset */ 4451 /* init base cftset */
4331 cgroup_init_cftsets(ss); 4452 cgroup_init_cftsets(ss);
4332 4453
4333 /*
4334 * need to register a subsys id before anything else - for example,
4335 * init_cgroup_css needs it.
4336 */
4337 mutex_lock(&cgroup_mutex); 4454 mutex_lock(&cgroup_mutex);
4338 /* find the first empty slot in the array */ 4455 subsys[ss->subsys_id] = ss;
4339 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
4340 if (subsys[i] == NULL)
4341 break;
4342 }
4343 if (i == CGROUP_SUBSYS_COUNT) {
4344 /* maximum number of subsystems already registered! */
4345 mutex_unlock(&cgroup_mutex);
4346 return -EBUSY;
4347 }
4348 /* assign ourselves the subsys_id */
4349 ss->subsys_id = i;
4350 subsys[i] = ss;
4351 4456
4352 /* 4457 /*
4353 * no ss->create seems to need anything important in the ss struct, so 4458 * no ss->create seems to need anything important in the ss struct, so
@@ -4356,7 +4461,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4356 css = ss->create(dummytop); 4461 css = ss->create(dummytop);
4357 if (IS_ERR(css)) { 4462 if (IS_ERR(css)) {
4358 /* failure case - need to deassign the subsys[] slot. */ 4463 /* failure case - need to deassign the subsys[] slot. */
4359 subsys[i] = NULL; 4464 subsys[ss->subsys_id] = NULL;
4360 mutex_unlock(&cgroup_mutex); 4465 mutex_unlock(&cgroup_mutex);
4361 return PTR_ERR(css); 4466 return PTR_ERR(css);
4362 } 4467 }
@@ -4372,7 +4477,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4372 if (ret) { 4477 if (ret) {
4373 dummytop->subsys[ss->subsys_id] = NULL; 4478 dummytop->subsys[ss->subsys_id] = NULL;
4374 ss->destroy(dummytop); 4479 ss->destroy(dummytop);
4375 subsys[i] = NULL; 4480 subsys[ss->subsys_id] = NULL;
4376 mutex_unlock(&cgroup_mutex); 4481 mutex_unlock(&cgroup_mutex);
4377 return ret; 4482 return ret;
4378 } 4483 }
@@ -4439,7 +4544,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4439 4544
4440 mutex_lock(&cgroup_mutex); 4545 mutex_lock(&cgroup_mutex);
4441 /* deassign the subsys_id */ 4546 /* deassign the subsys_id */
4442 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
4443 subsys[ss->subsys_id] = NULL; 4547 subsys[ss->subsys_id] = NULL;
4444 4548
4445 /* remove subsystem from rootnode's list of subsystems */ 4549 /* remove subsystem from rootnode's list of subsystems */
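
The load/unload paths no longer hunt for a free slot beyond CGROUP_BUILTIN_SUBSYS_COUNT: every subsystem, modular or not, now has a compile-time id, so subsys[ss->subsys_id] can be assigned and cleared directly. Those ids come from the SUBSYS() x-macro expansion of cgroup_subsys.h; roughly, as a sketch of the usual pattern rather than a verbatim copy of include/linux/cgroup.h:

    /* Sketch of how fixed subsystem ids are generated with the SUBSYS()
     * x-macro; the authoritative list lives in include/linux/cgroup_subsys.h. */
    #define SUBSYS(_x) _x ## _subsys_id,
    enum cgroup_subsys_id {
    #include <linux/cgroup_subsys.h>
            CGROUP_SUBSYS_COUNT,
    };
    #undef SUBSYS
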
@@ -4502,10 +4606,13 @@ int __init cgroup_init_early(void)
4502 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 4606 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
4503 INIT_HLIST_HEAD(&css_set_table[i]); 4607 INIT_HLIST_HEAD(&css_set_table[i]);
4504 4608
4505 /* at bootup time, we don't worry about modular subsystems */ 4609 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4506 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4507 struct cgroup_subsys *ss = subsys[i]; 4610 struct cgroup_subsys *ss = subsys[i];
4508 4611
4612 /* at bootup time, we don't worry about modular subsystems */
4613 if (!ss || ss->module)
4614 continue;
4615
4509 BUG_ON(!ss->name); 4616 BUG_ON(!ss->name);
4510 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4617 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4511 BUG_ON(!ss->create); 4618 BUG_ON(!ss->create);
@@ -4538,9 +4645,12 @@ int __init cgroup_init(void)
4538 if (err) 4645 if (err)
4539 return err; 4646 return err;
4540 4647
4541 /* at bootup time, we don't worry about modular subsystems */ 4648 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4542 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4543 struct cgroup_subsys *ss = subsys[i]; 4649 struct cgroup_subsys *ss = subsys[i];
4650
4651 /* at bootup time, we don't worry about modular subsystems */
4652 if (!ss || ss->module)
4653 continue;
4544 if (!ss->early_init) 4654 if (!ss->early_init)
4545 cgroup_init_subsys(ss); 4655 cgroup_init_subsys(ss);
4546 if (ss->use_id) 4656 if (ss->use_id)
@@ -4735,13 +4845,16 @@ void cgroup_fork_callbacks(struct task_struct *child)
4735{ 4845{
4736 if (need_forkexit_callback) { 4846 if (need_forkexit_callback) {
4737 int i; 4847 int i;
4738 /* 4848 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4739 * forkexit callbacks are only supported for builtin
4740 * subsystems, and the builtin section of the subsys array is
4741 * immutable, so we don't need to lock the subsys array here.
4742 */
4743 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4744 struct cgroup_subsys *ss = subsys[i]; 4849 struct cgroup_subsys *ss = subsys[i];
4850
4851 /*
4852 * forkexit callbacks are only supported for
4853 * builtin subsystems.
4854 */
4855 if (!ss || ss->module)
4856 continue;
4857
4745 if (ss->fork) 4858 if (ss->fork)
4746 ss->fork(child); 4859 ss->fork(child);
4747 } 4860 }
@@ -4846,12 +4959,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4846 tsk->cgroups = &init_css_set; 4959 tsk->cgroups = &init_css_set;
4847 4960
4848 if (run_callbacks && need_forkexit_callback) { 4961 if (run_callbacks && need_forkexit_callback) {
4849 /* 4962 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4850 * modular subsystems can't use callbacks, so no need to lock
4851 * the subsys array
4852 */
4853 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4854 struct cgroup_subsys *ss = subsys[i]; 4963 struct cgroup_subsys *ss = subsys[i];
4964
4965 /* modular subsystems can't use callbacks */
4966 if (!ss || ss->module)
4967 continue;
4968
4855 if (ss->exit) { 4969 if (ss->exit) {
4856 struct cgroup *old_cgrp = 4970 struct cgroup *old_cgrp =
4857 rcu_dereference_raw(cg->subsys[i])->cgroup; 4971 rcu_dereference_raw(cg->subsys[i])->cgroup;
@@ -5037,13 +5151,17 @@ static int __init cgroup_disable(char *str)
5037 while ((token = strsep(&str, ",")) != NULL) { 5151 while ((token = strsep(&str, ",")) != NULL) {
5038 if (!*token) 5152 if (!*token)
5039 continue; 5153 continue;
5040 /* 5154 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
5041 * cgroup_disable, being at boot time, can't know about module
5042 * subsystems, so we don't worry about them.
5043 */
5044 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
5045 struct cgroup_subsys *ss = subsys[i]; 5155 struct cgroup_subsys *ss = subsys[i];
5046 5156
5157 /*
5158 * cgroup_disable, being at boot time, can't
5159 * know about module subsystems, so we don't
5160 * worry about them.
5161 */
5162 if (!ss || ss->module)
5163 continue;
5164
5047 if (!strcmp(token, ss->name)) { 5165 if (!strcmp(token, ss->name)) {
5048 ss->disabled = 1; 5166 ss->disabled = 1;
5049 printk(KERN_INFO "Disabling %s control group" 5167 printk(KERN_INFO "Disabling %s control group"
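
All of the boot-time and fork/exit loops above switch from "iterate the builtin prefix of subsys[]" to "iterate the whole array and skip empty or modular slots", which is what makes the fixed, interleaved id space workable. The recurring guard, pulled out of the hunks above for reference (not a self-contained program):

    /* The guard shared by cgroup_init_early(), cgroup_init(),
     * cgroup_fork_callbacks(), cgroup_exit() and cgroup_disable(): a slot may
     * be NULL (modular subsystem not loaded yet) or belong to a module, and
     * either way the boot-time paths must leave it alone. */
    int i;

    for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
            struct cgroup_subsys *ss = subsys[i];

            if (!ss || ss->module)
                    continue;
            /* ... builtin-only work ... */
    }
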
diff --git a/mm/shmem.c b/mm/shmem.c
index d4e184e2a38e..d3752110c8c7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -77,13 +77,6 @@ static struct vfsmount *shm_mnt;
77/* Symlink up to this size is kmalloc'ed instead of using a swappable page */ 77/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
78#define SHORT_SYMLINK_LEN 128 78#define SHORT_SYMLINK_LEN 128
79 79
80struct shmem_xattr {
81 struct list_head list; /* anchored by shmem_inode_info->xattr_list */
82 char *name; /* xattr name */
83 size_t size;
84 char value[0];
85};
86
87/* 80/*
88 * shmem_fallocate and shmem_writepage communicate via inode->i_private 81 * shmem_fallocate and shmem_writepage communicate via inode->i_private
89 * (with i_mutex making sure that it has only one user at a time): 82 * (with i_mutex making sure that it has only one user at a time):
@@ -636,7 +629,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
636static void shmem_evict_inode(struct inode *inode) 629static void shmem_evict_inode(struct inode *inode)
637{ 630{
638 struct shmem_inode_info *info = SHMEM_I(inode); 631 struct shmem_inode_info *info = SHMEM_I(inode);
639 struct shmem_xattr *xattr, *nxattr;
640 632
641 if (inode->i_mapping->a_ops == &shmem_aops) { 633 if (inode->i_mapping->a_ops == &shmem_aops) {
642 shmem_unacct_size(info->flags, inode->i_size); 634 shmem_unacct_size(info->flags, inode->i_size);
@@ -650,10 +642,7 @@ static void shmem_evict_inode(struct inode *inode)
650 } else 642 } else
651 kfree(info->symlink); 643 kfree(info->symlink);
652 644
653 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { 645 simple_xattrs_free(&info->xattrs);
654 kfree(xattr->name);
655 kfree(xattr);
656 }
657 BUG_ON(inode->i_blocks); 646 BUG_ON(inode->i_blocks);
658 shmem_free_inode(inode->i_sb); 647 shmem_free_inode(inode->i_sb);
659 clear_inode(inode); 648 clear_inode(inode);
@@ -1377,7 +1366,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1377 spin_lock_init(&info->lock); 1366 spin_lock_init(&info->lock);
1378 info->flags = flags & VM_NORESERVE; 1367 info->flags = flags & VM_NORESERVE;
1379 INIT_LIST_HEAD(&info->swaplist); 1368 INIT_LIST_HEAD(&info->swaplist);
1380 INIT_LIST_HEAD(&info->xattr_list); 1369 simple_xattrs_init(&info->xattrs);
1381 cache_no_acl(inode); 1370 cache_no_acl(inode);
1382 1371
1383 switch (mode & S_IFMT) { 1372 switch (mode & S_IFMT) {
@@ -2060,28 +2049,6 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
2060 */ 2049 */
2061 2050
2062/* 2051/*
2063 * Allocate new xattr and copy in the value; but leave the name to callers.
2064 */
2065static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size)
2066{
2067 struct shmem_xattr *new_xattr;
2068 size_t len;
2069
2070 /* wrap around? */
2071 len = sizeof(*new_xattr) + size;
2072 if (len <= sizeof(*new_xattr))
2073 return NULL;
2074
2075 new_xattr = kmalloc(len, GFP_KERNEL);
2076 if (!new_xattr)
2077 return NULL;
2078
2079 new_xattr->size = size;
2080 memcpy(new_xattr->value, value, size);
2081 return new_xattr;
2082}
2083
2084/*
2085 * Callback for security_inode_init_security() for acquiring xattrs. 2052 * Callback for security_inode_init_security() for acquiring xattrs.
2086 */ 2053 */
2087static int shmem_initxattrs(struct inode *inode, 2054static int shmem_initxattrs(struct inode *inode,
@@ -2090,11 +2057,11 @@ static int shmem_initxattrs(struct inode *inode,
2090{ 2057{
2091 struct shmem_inode_info *info = SHMEM_I(inode); 2058 struct shmem_inode_info *info = SHMEM_I(inode);
2092 const struct xattr *xattr; 2059 const struct xattr *xattr;
2093 struct shmem_xattr *new_xattr; 2060 struct simple_xattr *new_xattr;
2094 size_t len; 2061 size_t len;
2095 2062
2096 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 2063 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
2097 new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len); 2064 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
2098 if (!new_xattr) 2065 if (!new_xattr)
2099 return -ENOMEM; 2066 return -ENOMEM;
2100 2067
@@ -2111,91 +2078,12 @@ static int shmem_initxattrs(struct inode *inode,
2111 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 2078 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
2112 xattr->name, len); 2079 xattr->name, len);
2113 2080
2114 spin_lock(&info->lock); 2081 simple_xattr_list_add(&info->xattrs, new_xattr);
2115 list_add(&new_xattr->list, &info->xattr_list);
2116 spin_unlock(&info->lock);
2117 } 2082 }
2118 2083
2119 return 0; 2084 return 0;
2120} 2085}
2121 2086
2122static int shmem_xattr_get(struct dentry *dentry, const char *name,
2123 void *buffer, size_t size)
2124{
2125 struct shmem_inode_info *info;
2126 struct shmem_xattr *xattr;
2127 int ret = -ENODATA;
2128
2129 info = SHMEM_I(dentry->d_inode);
2130
2131 spin_lock(&info->lock);
2132 list_for_each_entry(xattr, &info->xattr_list, list) {
2133 if (strcmp(name, xattr->name))
2134 continue;
2135
2136 ret = xattr->size;
2137 if (buffer) {
2138 if (size < xattr->size)
2139 ret = -ERANGE;
2140 else
2141 memcpy(buffer, xattr->value, xattr->size);
2142 }
2143 break;
2144 }
2145 spin_unlock(&info->lock);
2146 return ret;
2147}
2148
2149static int shmem_xattr_set(struct inode *inode, const char *name,
2150 const void *value, size_t size, int flags)
2151{
2152 struct shmem_inode_info *info = SHMEM_I(inode);
2153 struct shmem_xattr *xattr;
2154 struct shmem_xattr *new_xattr = NULL;
2155 int err = 0;
2156
2157 /* value == NULL means remove */
2158 if (value) {
2159 new_xattr = shmem_xattr_alloc(value, size);
2160 if (!new_xattr)
2161 return -ENOMEM;
2162
2163 new_xattr->name = kstrdup(name, GFP_KERNEL);
2164 if (!new_xattr->name) {
2165 kfree(new_xattr);
2166 return -ENOMEM;
2167 }
2168 }
2169
2170 spin_lock(&info->lock);
2171 list_for_each_entry(xattr, &info->xattr_list, list) {
2172 if (!strcmp(name, xattr->name)) {
2173 if (flags & XATTR_CREATE) {
2174 xattr = new_xattr;
2175 err = -EEXIST;
2176 } else if (new_xattr) {
2177 list_replace(&xattr->list, &new_xattr->list);
2178 } else {
2179 list_del(&xattr->list);
2180 }
2181 goto out;
2182 }
2183 }
2184 if (flags & XATTR_REPLACE) {
2185 xattr = new_xattr;
2186 err = -ENODATA;
2187 } else {
2188 list_add(&new_xattr->list, &info->xattr_list);
2189 xattr = NULL;
2190 }
2191out:
2192 spin_unlock(&info->lock);
2193 if (xattr)
2194 kfree(xattr->name);
2195 kfree(xattr);
2196 return err;
2197}
2198
2199static const struct xattr_handler *shmem_xattr_handlers[] = { 2087static const struct xattr_handler *shmem_xattr_handlers[] = {
2200#ifdef CONFIG_TMPFS_POSIX_ACL 2088#ifdef CONFIG_TMPFS_POSIX_ACL
2201 &generic_acl_access_handler, 2089 &generic_acl_access_handler,
@@ -2226,6 +2114,7 @@ static int shmem_xattr_validate(const char *name)
2226static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, 2114static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2227 void *buffer, size_t size) 2115 void *buffer, size_t size)
2228{ 2116{
2117 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2229 int err; 2118 int err;
2230 2119
2231 /* 2120 /*
@@ -2240,12 +2129,13 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2240 if (err) 2129 if (err)
2241 return err; 2130 return err;
2242 2131
2243 return shmem_xattr_get(dentry, name, buffer, size); 2132 return simple_xattr_get(&info->xattrs, name, buffer, size);
2244} 2133}
2245 2134
2246static int shmem_setxattr(struct dentry *dentry, const char *name, 2135static int shmem_setxattr(struct dentry *dentry, const char *name,
2247 const void *value, size_t size, int flags) 2136 const void *value, size_t size, int flags)
2248{ 2137{
2138 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2249 int err; 2139 int err;
2250 2140
2251 /* 2141 /*
@@ -2260,15 +2150,12 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
2260 if (err) 2150 if (err)
2261 return err; 2151 return err;
2262 2152
2263 if (size == 0) 2153 return simple_xattr_set(&info->xattrs, name, value, size, flags);
2264 value = ""; /* empty EA, do not remove */
2265
2266 return shmem_xattr_set(dentry->d_inode, name, value, size, flags);
2267
2268} 2154}
2269 2155
2270static int shmem_removexattr(struct dentry *dentry, const char *name) 2156static int shmem_removexattr(struct dentry *dentry, const char *name)
2271{ 2157{
2158 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2272 int err; 2159 int err;
2273 2160
2274 /* 2161 /*
@@ -2283,45 +2170,13 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
2283 if (err) 2170 if (err)
2284 return err; 2171 return err;
2285 2172
2286 return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); 2173 return simple_xattr_remove(&info->xattrs, name);
2287}
2288
2289static bool xattr_is_trusted(const char *name)
2290{
2291 return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
2292} 2174}
2293 2175
2294static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 2176static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2295{ 2177{
2296 bool trusted = capable(CAP_SYS_ADMIN); 2178 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2297 struct shmem_xattr *xattr; 2179 return simple_xattr_list(&info->xattrs, buffer, size);
2298 struct shmem_inode_info *info;
2299 size_t used = 0;
2300
2301 info = SHMEM_I(dentry->d_inode);
2302
2303 spin_lock(&info->lock);
2304 list_for_each_entry(xattr, &info->xattr_list, list) {
2305 size_t len;
2306
2307 /* skip "trusted." attributes for unprivileged callers */
2308 if (!trusted && xattr_is_trusted(xattr->name))
2309 continue;
2310
2311 len = strlen(xattr->name) + 1;
2312 used += len;
2313 if (buffer) {
2314 if (size < used) {
2315 used = -ERANGE;
2316 break;
2317 }
2318 memcpy(buffer, xattr->name, len);
2319 buffer += len;
2320 }
2321 }
2322 spin_unlock(&info->lock);
2323
2324 return used;
2325} 2180}
2326#endif /* CONFIG_TMPFS_XATTR */ 2181#endif /* CONFIG_TMPFS_XATTR */
2327 2182
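
The shmem conversion shows the full simple_xattr lifecycle that the new generic helpers (added to fs/xattr.c by this series) are meant to cover: a simple_xattrs container is initialised at inode creation, the open-coded list walks collapse to one-line calls, and everything is freed on eviction. In outline, a sketch of how a filesystem is expected to consume the API, with the surrounding structures, locking context, and error handling elided:

    /* Sketch of the simple_xattr call sequence mm/shmem.c now uses. */
    simple_xattrs_init(&info->xattrs);                          /* at inode creation  */

    simple_xattr_set(&info->xattrs, name, value, size, flags);  /* ->setxattr         */
    simple_xattr_get(&info->xattrs, name, buffer, size);        /* ->getxattr         */
    simple_xattr_list(&info->xattrs, buffer, size);             /* ->listxattr        */
    simple_xattr_remove(&info->xattrs, name);                   /* ->removexattr      */

    simple_xattrs_free(&info->xattrs);                          /* at inode eviction  */
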
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index c75e3f9d060f..6bc460c38e4f 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -326,9 +326,7 @@ struct cgroup_subsys net_prio_subsys = {
326 .create = cgrp_create, 326 .create = cgrp_create,
327 .destroy = cgrp_destroy, 327 .destroy = cgrp_destroy,
328 .attach = net_prio_attach, 328 .attach = net_prio_attach,
329#ifdef CONFIG_NETPRIO_CGROUP
330 .subsys_id = net_prio_subsys_id, 329 .subsys_id = net_prio_subsys_id,
331#endif
332 .base_cftypes = ss_files, 330 .base_cftypes = ss_files,
333 .module = THIS_MODULE 331 .module = THIS_MODULE
334}; 332};
@@ -366,10 +364,6 @@ static int __init init_cgroup_netprio(void)
366 ret = cgroup_load_subsys(&net_prio_subsys); 364 ret = cgroup_load_subsys(&net_prio_subsys);
367 if (ret) 365 if (ret)
368 goto out; 366 goto out;
369#ifndef CONFIG_NETPRIO_CGROUP
370 smp_wmb();
371 net_prio_subsys_id = net_prio_subsys.subsys_id;
372#endif
373 367
374 register_netdevice_notifier(&netprio_device_notifier); 368 register_netdevice_notifier(&netprio_device_notifier);
375 369
@@ -386,11 +380,6 @@ static void __exit exit_cgroup_netprio(void)
386 380
387 cgroup_unload_subsys(&net_prio_subsys); 381 cgroup_unload_subsys(&net_prio_subsys);
388 382
389#ifndef CONFIG_NETPRIO_CGROUP
390 net_prio_subsys_id = -1;
391 synchronize_rcu();
392#endif
393
394 rtnl_lock(); 383 rtnl_lock();
395 for_each_netdev(&init_net, dev) { 384 for_each_netdev(&init_net, dev) {
396 old = rtnl_dereference(dev->priomap); 385 old = rtnl_dereference(dev->priomap);
diff --git a/net/core/sock.c b/net/core/sock.c
index a6000fbad294..341fa1c3bd69 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -326,17 +326,6 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
326} 326}
327EXPORT_SYMBOL(__sk_backlog_rcv); 327EXPORT_SYMBOL(__sk_backlog_rcv);
328 328
329#if defined(CONFIG_CGROUPS)
330#if !defined(CONFIG_NET_CLS_CGROUP)
331int net_cls_subsys_id = -1;
332EXPORT_SYMBOL_GPL(net_cls_subsys_id);
333#endif
334#if !defined(CONFIG_NETPRIO_CGROUP)
335int net_prio_subsys_id = -1;
336EXPORT_SYMBOL_GPL(net_prio_subsys_id);
337#endif
338#endif
339
340static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) 329static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
341{ 330{
342 struct timeval tv; 331 struct timeval tv;
@@ -1224,6 +1213,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
1224} 1213}
1225 1214
1226#ifdef CONFIG_CGROUPS 1215#ifdef CONFIG_CGROUPS
1216#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1227void sock_update_classid(struct sock *sk) 1217void sock_update_classid(struct sock *sk)
1228{ 1218{
1229 u32 classid; 1219 u32 classid;
@@ -1235,7 +1225,9 @@ void sock_update_classid(struct sock *sk)
1235 sk->sk_classid = classid; 1225 sk->sk_classid = classid;
1236} 1226}
1237EXPORT_SYMBOL(sock_update_classid); 1227EXPORT_SYMBOL(sock_update_classid);
1228#endif
1238 1229
1230#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1239void sock_update_netprioidx(struct sock *sk, struct task_struct *task) 1231void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
1240{ 1232{
1241 if (in_interrupt()) 1233 if (in_interrupt())
@@ -1245,6 +1237,7 @@ void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
1245} 1237}
1246EXPORT_SYMBOL_GPL(sock_update_netprioidx); 1238EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1247#endif 1239#endif
1240#endif
1248 1241
1249/** 1242/**
1250 * sk_alloc - All socket objects are allocated here 1243 * sk_alloc - All socket objects are allocated here
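
sock.c drops its fallback subsys-id variables and instead compiles sock_update_classid() and sock_update_netprioidx() only when the corresponding controller is configured, using IS_ENABLED(), which is true for both =y and =m. The shape of that guard, roughly (the function body here is only a placeholder comment):

    /* Sketch of the IS_ENABLED() pattern used above: the function is built
     * whether the controller is built in (=y) or modular (=m), and disappears
     * entirely when the option is off. */
    #if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
    void sock_update_classid(struct sock *sk)
    {
            /* ... look up the task's classid and store it in sk->sk_classid ... */
    }
    #endif
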
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 7743ea8d1d38..67cf90d962f4 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -77,9 +77,7 @@ struct cgroup_subsys net_cls_subsys = {
77 .name = "net_cls", 77 .name = "net_cls",
78 .create = cgrp_create, 78 .create = cgrp_create,
79 .destroy = cgrp_destroy, 79 .destroy = cgrp_destroy,
80#ifdef CONFIG_NET_CLS_CGROUP
81 .subsys_id = net_cls_subsys_id, 80 .subsys_id = net_cls_subsys_id,
82#endif
83 .base_cftypes = ss_files, 81 .base_cftypes = ss_files,
84 .module = THIS_MODULE, 82 .module = THIS_MODULE,
85}; 83};
@@ -283,12 +281,6 @@ static int __init init_cgroup_cls(void)
283 if (ret) 281 if (ret)
284 goto out; 282 goto out;
285 283
286#ifndef CONFIG_NET_CLS_CGROUP
287 /* We can't use rcu_assign_pointer because this is an int. */
288 smp_wmb();
289 net_cls_subsys_id = net_cls_subsys.subsys_id;
290#endif
291
292 ret = register_tcf_proto_ops(&cls_cgroup_ops); 284 ret = register_tcf_proto_ops(&cls_cgroup_ops);
293 if (ret) 285 if (ret)
294 cgroup_unload_subsys(&net_cls_subsys); 286 cgroup_unload_subsys(&net_cls_subsys);
@@ -301,11 +293,6 @@ static void __exit exit_cgroup_cls(void)
301{ 293{
302 unregister_tcf_proto_ops(&cls_cgroup_ops); 294 unregister_tcf_proto_ops(&cls_cgroup_ops);
303 295
304#ifndef CONFIG_NET_CLS_CGROUP
305 net_cls_subsys_id = -1;
306 synchronize_rcu();
307#endif
308
309 cgroup_unload_subsys(&net_cls_subsys); 296 cgroup_unload_subsys(&net_cls_subsys);
310} 297}
311 298
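
The same cleanup applies to both network controllers: because net_cls_subsys_id and net_prio_subsys_id are now enum constants rather than integers published at module load, the smp_wmb()/synchronize_rcu() dance around a -1 sentinel is gone and consumers can index the task's cgroup state with the constant directly. A rough sketch of a lookup on the consumer side, using the cgroup helpers of this kernel generation; the local variable names are illustrative:

    /* Sketch: with a compile-time net_cls_subsys_id, a reader goes straight to
     * the task's css slot; previously it first had to check a published id
     * against -1 under RCU. */
    struct cgroup_subsys_state *css;
    u32 classid = 0;

    rcu_read_lock();
    css = task_subsys_state(current, net_cls_subsys_id);
    if (css)
            classid = container_of(css, struct cgroup_cls_state, css)->classid;
    rcu_read_unlock();
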