Diffstat:
 Documentation/cgroup-v1/rdma.txt      | 109
 Documentation/cgroup-v2.txt           |  46
 drivers/infiniband/core/Makefile      |   1
 drivers/infiniband/core/cgroup.c      |  62
 drivers/infiniband/core/core_priv.h   |  30
 drivers/infiniband/core/device.c      |  10
 drivers/infiniband/core/uverbs_cmd.c  | 102
 drivers/infiniband/core/uverbs_main.c |  20
 include/linux/cgroup_rdma.h           |  53
 include/linux/cgroup_subsys.h         |   4
 include/rdma/ib_verbs.h               |  14
 init/Kconfig                          |  10
 kernel/cgroup/Makefile                |   1
 kernel/cgroup/rdma.c                  | 619
 14 files changed, 1071 insertions(+), 10 deletions(-)
diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt
new file mode 100644
index 000000000000..af618171e0eb
--- /dev/null
+++ b/Documentation/cgroup-v1/rdma.txt
@@ -0,0 +1,109 @@
+RDMA Controller
+---------------
+
+Contents
+--------
+
+1. Overview
+  1-1. What is RDMA controller?
+  1-2. Why is RDMA controller needed?
+  1-3. How is RDMA controller implemented?
+2. Usage Examples
+
+1. Overview
+
+1-1. What is RDMA controller?
+-----------------------------
+
+The RDMA controller allows users to limit the RDMA/IB-specific resources
+that a given set of processes can use. These processes are grouped using
+the RDMA controller.
+
+The RDMA controller defines two resources which can be limited for
+processes of a cgroup.
+
+1-2. Why is RDMA controller needed?
+-----------------------------------
+
+Currently, user space applications can easily take away all the rdma verb
+specific resources such as AH, CQ, QP, MR etc., so that other applications
+in other cgroups or kernel space ULPs may not even get a chance to allocate
+any rdma resources. This can lead to service unavailability.
+
+Therefore the RDMA controller is needed, through which the resource
+consumption of processes can be limited. Through this controller different
+rdma resources can be accounted.
+
+1-3. How is RDMA controller implemented?
+----------------------------------------
+
+RDMA cgroup allows limit configuration of resources. RDMA cgroup maintains
+resource accounting per cgroup, per device using resource pool structures.
+Each such resource pool is limited to 64 resources by the rdma cgroup,
+which can be extended later if required.
+
+This resource pool object is linked to the cgroup css. Typically there
+are 0 to 4 resource pool instances per cgroup, per device in most use
+cases, but nothing prevents there being more. At present hundreds of RDMA
+devices per single cgroup may not be handled optimally, however there is
+no known use case or requirement for such a configuration either.
+
+Since RDMA resources can be allocated by any process and can be freed by
+any of the child processes which share the address space, rdma resources
+are always owned by the creator cgroup css. This allows processes to
+migrate from one cgroup to another without the major complexity of
+transferring resource ownership; such ownership is not really present
+anyway, due to the shared nature of rdma resources. Linking resources to
+the css also ensures that cgroups can be deleted after processes have
+migrated. This allows process migration even with active resources, though
+that is not a primary use case.
+
+Whenever RDMA resource charging occurs, the owner rdma cgroup is returned
+to the caller. The same rdma cgroup should be passed while uncharging the
+resource. This also allows a process migrated with active RDMA resources
+to charge new resources to the new owner cgroup, and to uncharge a
+resource against the cgroup it was originally charged to before migration,
+though that is not a primary use case.
+
+A resource pool object is created in the following situations.
+(a) A user sets a limit and no previous resource pool exists for the
+device of interest for the cgroup.
+(b) No resource limits were configured, but the IB/RDMA stack tries to
+charge the resource. The pool is created so that resources charged while
+applications run without limits are correctly uncharged later, after
+limits are enforced; otherwise the usage count would drop to a negative
+value.
+
+A resource pool is destroyed when all of its resource limits are set to
+max and its last resource is deallocated.
+
+Users should set all the limits to the max value if they intend to
+remove/unconfigure the resource pool for a particular device.
+
+The IB stack honors limits enforced by the rdma controller. When an
+application queries the maximum resource limits of an IB device, it
+returns the minimum of what is configured by the user for the given cgroup
+and what is supported by the IB device.
+
+The following resources can be accounted by the rdma controller.
+  hca_handle	Maximum number of HCA Handles
+  hca_object	Maximum number of HCA Objects
+
+2. Usage Examples
+-----------------
+
+(a) Configure resource limit:
+echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
+echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
+
+(b) Query resource limit:
+cat /sys/fs/cgroup/rdma/2/rdma.max
+#Output:
+mlx4_0 hca_handle=2 hca_object=2000
+ocrdma1 hca_handle=3 hca_object=max
+
+(c) Query current usage:
+cat /sys/fs/cgroup/rdma/2/rdma.current
+#Output:
+mlx4_0 hca_handle=1 hca_object=20
+ocrdma1 hca_handle=1 hca_object=23
+
+(d) Delete resource limit:
+echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
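
As a worked illustration of the interface documented above, here is a
minimal userspace sketch (not part of this patch) that applies and reads
back a limit. The cgroup path, child-group name and device name are taken
from the examples above and are assumptions about the local setup:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        const char *path = "/sys/fs/cgroup/rdma/1/rdma.max";
        char line[256];
        FILE *f;

        /* Configure the limit, equivalent to the echo in example (a). */
        f = fopen(path, "w");
        if (!f) {
                perror("fopen");
                return EXIT_FAILURE;
        }
        fprintf(f, "mlx4_0 hca_handle=2 hca_object=2000\n");
        fclose(f);

        /* Read the limits back, equivalent to the cat in example (b). */
        f = fopen(path, "r");
        if (!f) {
                perror("fopen");
                return EXIT_FAILURE;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return EXIT_SUCCESS;
}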
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 1d101423ca92..3b8449f8ac7e 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -49,8 +49,10 @@ CONTENTS
     5-3-2. Writeback
   5-4. PID
     5-4-1. PID Interface Files
-  5-5. Misc
-    5-5-1. perf_event
+  5-5. RDMA
+    5-5-1. RDMA Interface Files
+  5-6. Misc
+    5-6-1. perf_event
 6. Namespace
   6-1. Basics
   6-2. The Root and Views
@@ -1160,9 +1162,45 @@ through fork() or clone(). These will return -EAGAIN if the creation
 of a new process would cause a cgroup policy to be violated.
 
 
-5-5. Misc
+5-5. RDMA
 
-5-5-1. perf_event
+The "rdma" controller regulates the distribution and accounting of
+RDMA resources.
+
+5-5-1. RDMA Interface Files
+
+  rdma.max
+	A read-write nested-keyed file that exists for all cgroups
+	except the root. It describes the currently configured
+	resource limits for RDMA/IB devices.
+
+	Lines are keyed by device name and are not ordered.
+	Each line contains space-separated resource names and the
+	limits configured for them, which can be distributed.
+
+	The following nested keys are defined.
+
+	  hca_handle	Maximum number of HCA Handles
+	  hca_object	Maximum number of HCA Objects
+
+	An example for mlx4 and ocrdma devices follows.
+
+	  mlx4_0	hca_handle=2 hca_object=2000
+	  ocrdma1	hca_handle=3 hca_object=max
+
+  rdma.current
+	A read-only file that describes current resource usage.
+	It exists for all cgroups except the root.
+
+	An example for mlx4 and ocrdma devices follows.
+
+	  mlx4_0	hca_handle=1 hca_object=20
+	  ocrdma1	hca_handle=1 hca_object=23
+
+
+5-6. Misc
+
+5-6-1. perf_event
 
 perf_event controller, if not mounted on a legacy hierarchy, is
 automatically enabled on the v2 hierarchy so that perf events can
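
The nested-keyed line format above is straightforward to consume
programmatically. A userspace sketch (illustrative only, not part of this
patch) that splits one such line into the device name and its key=value
pairs:

#include <stdio.h>
#include <string.h>

int main(void)
{
        /* One line as emitted by rdma.max or rdma.current. */
        char line[] = "mlx4_0 hca_handle=2 hca_object=max";
        char *save, *tok = strtok_r(line, " ", &save);

        printf("device: %s\n", tok);    /* first token is the device name */
        while ((tok = strtok_r(NULL, " ", &save))) {
                char *val = strchr(tok, '=');

                if (!val)
                        continue;
                *val++ = '\0';          /* split "key=value" in place */
                printf("  %s -> %s\n", tok, val); /* "max" means no limit */
        }
        return 0;
}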
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index edaae9f9853c..e426ac877d19 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -13,6 +13,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 				multicast.o mad.o smi.o agent.o mad_rmpp.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
+ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
 
 ib_cm-y := cm.o
 
diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c
new file mode 100644
index 000000000000..126ac5f99db7
--- /dev/null
+++ b/drivers/infiniband/core/cgroup.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include "core_priv.h"
+
+/**
+ * ib_device_register_rdmacg - register with rdma cgroup.
+ * @device: device to register to participate in resource
+ *          accounting by rdma cgroup.
+ *
+ * Register with the rdma cgroup. Should be called before
+ * exposing the rdma device to user space applications to avoid
+ * a resource accounting leak.
+ * Returns 0 on success, otherwise a failure code.
+ */
+int ib_device_register_rdmacg(struct ib_device *device)
+{
+        device->cg_device.name = device->name;
+        return rdmacg_register_device(&device->cg_device);
+}
+
+/**
+ * ib_device_unregister_rdmacg - unregister with rdma cgroup.
+ * @device: device to unregister.
+ *
+ * Unregister with the rdma cgroup. Should be called after
+ * all the resources are deallocated, and after a stage when any
+ * other resource allocation by user applications can no longer be
+ * done for this device, to avoid any leak in accounting.
+ */
+void ib_device_unregister_rdmacg(struct ib_device *device)
+{
+        rdmacg_unregister_device(&device->cg_device);
+}
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+                         struct ib_device *device,
+                         enum rdmacg_resource_type resource_index)
+{
+        return rdmacg_try_charge(&cg_obj->cg, &device->cg_device,
+                                 resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_try_charge);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+                        struct ib_device *device,
+                        enum rdmacg_resource_type resource_index)
+{
+        rdmacg_uncharge(cg_obj->cg, &device->cg_device,
+                        resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_uncharge);
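
These helpers imply a fixed calling pattern that the uverbs changes below
follow: charge before creating a verb object, uncharge on any failure path
and again when the object is destroyed. A condensed sketch of that pattern
(the caller and example_hw_alloc() are hypothetical, shown only for
illustration):

static int example_create_object(struct ib_uverbs_file *file,
                                 struct ib_device *ib_dev,
                                 struct ib_uobject *uobj)
{
        int ret;

        /* Reserve the resource against the caller's cgroup first. */
        ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
                                   RDMACG_RESOURCE_HCA_OBJECT);
        if (ret)
                return ret;     /* over the configured limit: -EAGAIN */

        ret = example_hw_alloc(ib_dev, uobj);   /* hypothetical allocation */
        if (ret)
                /* Allocation failed, so return the charged quota. */
                ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev,
                                   RDMACG_RESOURCE_HCA_OBJECT);
        return ret;
}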
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index d29372624f3a..389f6192bddc 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -35,6 +35,7 @@
 
 #include <linux/list.h>
 #include <linux/spinlock.h>
+#include <linux/cgroup_rdma.h>
 
 #include <rdma/ib_verbs.h>
 
@@ -121,6 +122,35 @@ int ib_cache_setup_one(struct ib_device *device);
 void ib_cache_cleanup_one(struct ib_device *device);
 void ib_cache_release_one(struct ib_device *device);
 
+#ifdef CONFIG_CGROUP_RDMA
+int ib_device_register_rdmacg(struct ib_device *device);
+void ib_device_unregister_rdmacg(struct ib_device *device);
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+                         struct ib_device *device,
+                         enum rdmacg_resource_type resource_index);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+                        struct ib_device *device,
+                        enum rdmacg_resource_type resource_index);
+#else
+static inline int ib_device_register_rdmacg(struct ib_device *device)
+{ return 0; }
+
+static inline void ib_device_unregister_rdmacg(struct ib_device *device)
+{ }
+
+static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+                                       struct ib_device *device,
+                                       enum rdmacg_resource_type resource_index)
+{ return 0; }
+
+static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+                                      struct ib_device *device,
+                                      enum rdmacg_resource_type resource_index)
+{ }
+#endif
+
 static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
                                          struct net_device *upper)
 {
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 571974cd3919..70065386acbc 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -360,10 +360,18 @@ int ib_register_device(struct ib_device *device,
                 goto out;
         }
 
+        ret = ib_device_register_rdmacg(device);
+        if (ret) {
+                pr_warn("Couldn't register device with rdma cgroup\n");
+                ib_cache_cleanup_one(device);
+                goto out;
+        }
+
         memset(&device->attrs, 0, sizeof(device->attrs));
         ret = device->query_device(device, &device->attrs, &uhw);
         if (ret) {
                 pr_warn("Couldn't query the device attributes\n");
+                ib_device_unregister_rdmacg(device);
                 ib_cache_cleanup_one(device);
                 goto out;
         }
@@ -372,6 +380,7 @@ int ib_register_device(struct ib_device *device,
         if (ret) {
                 pr_warn("Couldn't register device %s with driver model\n",
                         device->name);
+                ib_device_unregister_rdmacg(device);
                 ib_cache_cleanup_one(device);
                 goto out;
         }
@@ -421,6 +430,7 @@ void ib_unregister_device(struct ib_device *device)
 
         mutex_unlock(&device_mutex);
 
+        ib_device_unregister_rdmacg(device);
         ib_device_unregister_sysfs(device);
         ib_cache_cleanup_one(device);
 
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 700782203483..33bc88a38574 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -316,6 +316,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
         struct ib_udata udata;
         struct ib_ucontext *ucontext;
         struct file *filp;
+        struct ib_rdmacg_object cg_obj;
         int ret;
 
         if (out_len < sizeof resp)
@@ -335,13 +336,18 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
                    (unsigned long) cmd.response + sizeof resp,
                    in_len - sizeof cmd, out_len - sizeof resp);
 
+        ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
+        if (ret)
+                goto err;
+
         ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
         if (IS_ERR(ucontext)) {
                 ret = PTR_ERR(ucontext);
-                goto err;
+                goto err_alloc;
         }
 
         ucontext->device = ib_dev;
+        ucontext->cg_obj = cg_obj;
         INIT_LIST_HEAD(&ucontext->pd_list);
         INIT_LIST_HEAD(&ucontext->mr_list);
         INIT_LIST_HEAD(&ucontext->mw_list);
@@ -407,6 +413,9 @@ err_free:
         put_pid(ucontext->tgid);
         ib_dev->dealloc_ucontext(ucontext);
 
+err_alloc:
+        ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
+
 err:
         mutex_unlock(&file->mutex);
         return ret;
@@ -561,6 +570,13 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
                 return -ENOMEM;
 
         init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
+        ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
+        if (ret) {
+                kfree(uobj);
+                return ret;
+        }
+
         down_write(&uobj->mutex);
 
         pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
@@ -605,6 +621,7 @@ err_idr:
         ib_dealloc_pd(pd);
 
 err:
+        ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
         put_uobj_write(uobj);
         return ret;
 }
@@ -637,6 +654,8 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
         if (ret)
                 goto err_put;
 
+        ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
         uobj->live = 0;
         put_uobj_write(uobj);
 
@@ -1006,6 +1025,10 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
                         goto err_put;
                 }
         }
+        ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
+        if (ret)
+                goto err_charge;
 
         mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
                                      cmd.access_flags, &udata);
@@ -1054,6 +1077,9 @@ err_unreg:
         ib_dereg_mr(mr);
 
 err_put:
+        ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
         put_pd_read(pd);
 
 err_free:
@@ -1178,6 +1204,8 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
         if (ret)
                 return ret;
 
+        ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
         idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
 
         mutex_lock(&file->mutex);
@@ -1226,6 +1254,11 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
                    in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
                    out_len - sizeof(resp));
 
+        ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
+        if (ret)
+                goto err_charge;
+
         mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata);
         if (IS_ERR(mw)) {
                 ret = PTR_ERR(mw);
@@ -1271,6 +1304,9 @@ err_unalloc:
         uverbs_dealloc_mw(mw);
 
 err_put:
+        ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
         put_pd_read(pd);
 
 err_free:
@@ -1306,6 +1342,8 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
         if (ret)
                 return ret;
 
+        ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
         idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
 
         mutex_lock(&file->mutex);
@@ -1405,6 +1443,11 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
         if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
                 attr.flags = cmd->flags;
 
+        ret = ib_rdmacg_try_charge(&obj->uobject.cg_obj, ib_dev,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
+        if (ret)
+                goto err_charge;
+
         cq = ib_dev->create_cq(ib_dev, &attr,
                                file->ucontext, uhw);
         if (IS_ERR(cq)) {
@@ -1452,6 +1495,10 @@ err_free:
         ib_destroy_cq(cq);
 
 err_file:
+        ib_rdmacg_uncharge(&obj->uobject.cg_obj, ib_dev,
+                           RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
         if (ev_file)
                 ib_uverbs_release_ucq(file, ev_file, obj);
 
@@ -1732,6 +1779,8 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
         if (ret)
                 return ret;
 
+        ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
         idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
 
         mutex_lock(&file->mutex);
@@ -1904,6 +1953,11 @@ static int create_qp(struct ib_uverbs_file *file,
                 goto err_put;
         }
 
+        ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, device,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
+        if (ret)
+                goto err_put;
+
         if (cmd->qp_type == IB_QPT_XRC_TGT)
                 qp = ib_create_qp(pd, &attr);
         else
@@ -1911,7 +1965,7 @@ static int create_qp(struct ib_uverbs_file *file,
 
         if (IS_ERR(qp)) {
                 ret = PTR_ERR(qp);
-                goto err_put;
+                goto err_create;
         }
 
         if (cmd->qp_type != IB_QPT_XRC_TGT) {
@@ -1992,6 +2046,10 @@ err_cb:
 err_destroy:
         ib_destroy_qp(qp);
 
+err_create:
+        ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, device,
+                           RDMACG_RESOURCE_HCA_OBJECT);
+
 err_put:
         if (xrcd)
                 put_xrcd_read(xrcd_uobj);
@@ -2518,6 +2576,8 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
         if (ret)
                 return ret;
 
+        ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
         if (obj->uxrcd)
                 atomic_dec(&obj->uxrcd->refcnt);
 
@@ -2969,11 +3029,16 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
         memset(&attr.dmac, 0, sizeof(attr.dmac));
         memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
 
+        ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
+        if (ret)
+                goto err_charge;
+
         ah = pd->device->create_ah(pd, &attr, &udata);
 
         if (IS_ERR(ah)) {
                 ret = PTR_ERR(ah);
-                goto err_put;
+                goto err_create;
         }
 
         ah->device = pd->device;
@@ -3012,7 +3077,10 @@ err_copy:
 err_destroy:
         ib_destroy_ah(ah);
 
-err_put:
+err_create:
+        ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
         put_pd_read(pd);
 
 err:
@@ -3046,6 +3114,8 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
         if (ret)
                 return ret;
 
+        ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
         idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
 
         mutex_lock(&file->mutex);
@@ -3822,10 +3892,16 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
                 err = -EINVAL;
                 goto err_free;
         }
+
+        err = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
+        if (err)
+                goto err_free;
+
         flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
         if (IS_ERR(flow_id)) {
                 err = PTR_ERR(flow_id);
-                goto err_free;
+                goto err_create;
         }
         flow_id->uobject = uobj;
         uobj->object = flow_id;
@@ -3858,6 +3934,8 @@ err_copy:
         idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
 destroy_flow:
         ib_destroy_flow(flow_id);
+err_create:
+        ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
 err_free:
         kfree(flow_attr);
 err_put:
@@ -3897,8 +3975,11 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
         flow_id = uobj->object;
 
         ret = ib_destroy_flow(flow_id);
-        if (!ret)
+        if (!ret) {
+                ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
                 uobj->live = 0;
+        }
 
         put_uobj_write(uobj);
 
@@ -3966,6 +4047,11 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
         obj->uevent.events_reported = 0;
         INIT_LIST_HEAD(&obj->uevent.event_list);
 
+        ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, ib_dev,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
+        if (ret)
+                goto err_put_cq;
+
         srq = pd->device->create_srq(pd, &attr, udata);
         if (IS_ERR(srq)) {
                 ret = PTR_ERR(srq);
@@ -4030,6 +4116,8 @@ err_destroy:
         ib_destroy_srq(srq);
 
 err_put:
+        ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, ib_dev,
+                           RDMACG_RESOURCE_HCA_OBJECT);
         put_pd_read(pd);
 
 err_put_cq:
@@ -4216,6 +4304,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
         if (ret)
                 return ret;
 
+        ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
         if (srq_type == IB_SRQT_XRC) {
                 us = container_of(obj, struct ib_usrq_object, uevent);
                 atomic_dec(&us->uxrcd->refcnt);
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index b3f95d453fba..cdbd26d6574b 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -51,6 +51,7 @@
 #include <rdma/ib.h>
 
 #include "uverbs.h"
+#include "core_priv.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand userspace verbs access");
@@ -237,6 +238,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                 idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
                 ib_destroy_ah(ah);
+                ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
                 kfree(uobj);
         }
 
@@ -246,6 +249,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                 idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
                 uverbs_dealloc_mw(mw);
+                ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
                 kfree(uobj);
         }
 
@@ -254,6 +259,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                 idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
                 ib_destroy_flow(flow_id);
+                ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
                 kfree(uobj);
         }
 
@@ -266,6 +273,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
                 if (qp == qp->real_qp)
                         ib_uverbs_detach_umcast(qp, uqp);
                 ib_destroy_qp(qp);
+                ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
                 ib_uverbs_release_uevent(file, &uqp->uevent);
                 kfree(uqp);
         }
@@ -298,6 +307,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                 idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
                 ib_destroy_srq(srq);
+                ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
                 ib_uverbs_release_uevent(file, uevent);
                 kfree(uevent);
         }
@@ -310,6 +321,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                 idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
                 ib_destroy_cq(cq);
+                ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
                 ib_uverbs_release_ucq(file, ev_file, ucq);
                 kfree(ucq);
         }
@@ -319,6 +332,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                 idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
                 ib_dereg_mr(mr);
+                ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
                 kfree(uobj);
         }
 
@@ -339,11 +354,16 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                 idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
                 ib_dealloc_pd(pd);
+                ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                   RDMACG_RESOURCE_HCA_OBJECT);
                 kfree(uobj);
         }
 
         put_pid(context->tgid);
 
+        ib_rdmacg_uncharge(&context->cg_obj, context->device,
+                           RDMACG_RESOURCE_HCA_HANDLE);
+
         return context->device->dealloc_ucontext(context);
 }
 
diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
new file mode 100644
index 000000000000..e94290b29e99
--- /dev/null
+++ b/include/linux/cgroup_rdma.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#ifndef _CGROUP_RDMA_H
+#define _CGROUP_RDMA_H
+
+#include <linux/cgroup.h>
+
+enum rdmacg_resource_type {
+        RDMACG_RESOURCE_HCA_HANDLE,
+        RDMACG_RESOURCE_HCA_OBJECT,
+        RDMACG_RESOURCE_MAX,
+};
+
+#ifdef CONFIG_CGROUP_RDMA
+
+struct rdma_cgroup {
+        struct cgroup_subsys_state	css;
+
+        /*
+         * head to keep track of all resource pools
+         * that belong to this cgroup.
+         */
+        struct list_head		rpools;
+};
+
+struct rdmacg_device {
+        struct list_head	dev_node;
+        struct list_head	rpools;
+        char			*name;
+};
+
+/*
+ * APIs for the RDMA/IB stack to publish when a device wants to
+ * participate in resource accounting
+ */
+int rdmacg_register_device(struct rdmacg_device *device);
+void rdmacg_unregister_device(struct rdmacg_device *device);
+
+/* APIs for the RDMA/IB stack to charge/uncharge pool specific resources */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+                      struct rdmacg_device *device,
+                      enum rdmacg_resource_type index);
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+                     struct rdmacg_device *device,
+                     enum rdmacg_resource_type index);
+#endif	/* CONFIG_CGROUP_RDMA */
+#endif	/* _CGROUP_RDMA_H */
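
For clarity, a sketch of how a device provider is expected to use this
registration API. The example_dev structure and functions are hypothetical,
shown only for illustration; drivers/infiniband/core/cgroup.c above does
the equivalent for ib_core devices:

struct example_dev {
        struct rdmacg_device cg_device;
        char name[32];
};

static int example_dev_init(struct example_dev *dev)
{
        dev->cg_device.name = dev->name;
        /* Must happen before user space can allocate resources. */
        return rdmacg_register_device(&dev->cg_device);
}

static void example_dev_fini(struct example_dev *dev)
{
        /* Only after all resources are freed and no new ones can appear. */
        rdmacg_unregister_device(&dev->cg_device);
}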
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336acee9..d0e597c44585 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -56,6 +56,10 @@ SUBSYS(hugetlb)
 SUBSYS(pids)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_RDMA)
+SUBSYS(rdma)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 958a24d8fae7..63896a477896 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -60,6 +60,7 @@
 #include <linux/atomic.h>
 #include <linux/mmu_notifier.h>
 #include <linux/uaccess.h>
+#include <linux/cgroup_rdma.h>
 
 extern struct workqueue_struct *ib_wq;
 extern struct workqueue_struct *ib_comp_wq;
@@ -1331,6 +1332,12 @@ struct ib_fmr_attr {
 
 struct ib_umem;
 
+struct ib_rdmacg_object {
+#ifdef CONFIG_CGROUP_RDMA
+        struct rdma_cgroup	*cg;		/* owner rdma cgroup */
+#endif
+};
+
 struct ib_ucontext {
         struct ib_device	*device;
         struct list_head	pd_list;
@@ -1363,6 +1370,8 @@ struct ib_ucontext {
         struct list_head	no_private_counters;
         int			odp_mrs_count;
 #endif
+
+        struct ib_rdmacg_object	cg_obj;
 };
 
 struct ib_uobject {
@@ -1370,6 +1379,7 @@ struct ib_uobject {
         struct ib_ucontext	*context;	/* associated user context */
         void			*object;	/* containing object */
         struct list_head	list;		/* link to context's list */
+        struct ib_rdmacg_object	cg_obj;		/* rdmacg object */
         int			id;		/* index into kernel idr */
         struct kref		ref;
         struct rw_semaphore	mutex;		/* protects .live */
@@ -2118,6 +2128,10 @@ struct ib_device {
         struct attribute_group	*hw_stats_ag;
         struct rdma_hw_stats	*hw_stats;
 
+#ifdef CONFIG_CGROUP_RDMA
+        struct rdmacg_device	cg_device;
+#endif
+
         /**
          * The following mandatory functions are used only at device
          * registration. Keep functions such as these at the end of this
diff --git a/init/Kconfig b/init/Kconfig
index 223b734abccd..ef80d46a32b6 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1090,6 +1090,16 @@ config CGROUP_PIDS
           since the PIDs limit only affects a process's ability to fork, not to
           attach to a cgroup.
 
+config CGROUP_RDMA
+        bool "RDMA controller"
+        help
+          Provides enforcement of RDMA resources defined by the IB stack.
+          It is fairly easy for consumers to exhaust RDMA resources, which
+          can result in resource unavailability for other consumers.
+          The RDMA controller is designed to stop this from happening.
+          Attaching processes with active RDMA resources to the cgroup
+          hierarchy is allowed even if doing so crosses the hierarchy's
+          limit.
+
 config CGROUP_FREEZER
         bool "Freezer controller"
         help
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 6d42a3211164..387348a40c64 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -2,4 +2,5 @@ obj-y := cgroup.o namespace.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
+obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
new file mode 100644
index 000000000000..defad3c5e7dc
--- /dev/null
+++ b/kernel/cgroup/rdma.c
@@ -0,0 +1,619 @@
+/*
+ * RDMA resource limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop processes from consuming
+ * additional RDMA resources after a certain limit is reached.
+ *
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/cgroup.h>
+#include <linux/parser.h>
+#include <linux/cgroup_rdma.h>
+
+#define RDMACG_MAX_STR "max"
+
+/*
+ * Protects the lists of resource pools maintained on a per cgroup basis
+ * and the rdma device list.
+ */
+static DEFINE_MUTEX(rdmacg_mutex);
+static LIST_HEAD(rdmacg_devices);
+
+enum rdmacg_file_type {
+        RDMACG_RESOURCE_TYPE_MAX,
+        RDMACG_RESOURCE_TYPE_STAT,
+};
+
+/*
+ * Resource table definition as seen by the user. Entries need to be
+ * added here when more resources are added/defined at the IB verb/core
+ * layer.
+ */
+static char const *rdmacg_resource_names[] = {
+        [RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
+        [RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
+};
+
+/* resource tracker for each resource of rdma cgroup */
+struct rdmacg_resource {
+        int max;
+        int usage;
+};
+
+/*
+ * Resource pool object which represents per cgroup, per device
+ * resources. There are multiple instances of this object per cgroup,
+ * therefore it cannot be embedded within the rdma_cgroup structure. It
+ * is maintained as a list.
+ */
+struct rdmacg_resource_pool {
+        struct rdmacg_device	*device;
+        struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];
+
+        struct list_head	cg_node;
+        struct list_head	dev_node;
+
+        /* count of active charges against this pool */
+        u64			usage_sum;
+        /* total number of limits which are set to max */
+        int			num_max_cnt;
+};
+
+static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
+{
+        return container_of(css, struct rdma_cgroup, css);
+}
+
+static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
+{
+        return css_rdmacg(cg->css.parent);
+}
+
+static inline struct rdma_cgroup *get_current_rdmacg(void)
+{
+        return css_rdmacg(task_get_css(current, rdma_cgrp_id));
+}
+
+static void set_resource_limit(struct rdmacg_resource_pool *rpool,
+                               int index, int new_max)
+{
+        if (new_max == S32_MAX) {
+                if (rpool->resources[index].max != S32_MAX)
+                        rpool->num_max_cnt++;
+        } else {
+                if (rpool->resources[index].max == S32_MAX)
+                        rpool->num_max_cnt--;
+        }
+        rpool->resources[index].max = new_max;
+}
+
+static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
+{
+        int i;
+
+        for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
+                set_resource_limit(rpool, i, S32_MAX);
+}
+
+static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
+{
+        lockdep_assert_held(&rdmacg_mutex);
+
+        list_del(&rpool->cg_node);
+        list_del(&rpool->dev_node);
+        kfree(rpool);
+}
+
+static struct rdmacg_resource_pool *
+find_cg_rpool_locked(struct rdma_cgroup *cg,
+                     struct rdmacg_device *device)
+{
+        struct rdmacg_resource_pool *pool;
+
+        lockdep_assert_held(&rdmacg_mutex);
+
+        list_for_each_entry(pool, &cg->rpools, cg_node)
+                if (pool->device == device)
+                        return pool;
+
+        return NULL;
+}
+
+static struct rdmacg_resource_pool *
+get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
+{
+        struct rdmacg_resource_pool *rpool;
+
+        rpool = find_cg_rpool_locked(cg, device);
+        if (rpool)
+                return rpool;
+
+        rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
+        if (!rpool)
+                return ERR_PTR(-ENOMEM);
+
+        rpool->device = device;
+        set_all_resource_max_limit(rpool);
+
+        INIT_LIST_HEAD(&rpool->cg_node);
+        INIT_LIST_HEAD(&rpool->dev_node);
+        list_add_tail(&rpool->cg_node, &cg->rpools);
+        list_add_tail(&rpool->dev_node, &device->rpools);
+        return rpool;
+}
+
+/**
+ * uncharge_cg_locked - uncharge resource for rdma cgroup
+ * @cg: pointer to cg to uncharge
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cg (resource pool)
+ *
+ * It also frees the resource pool which was created as part of the
+ * charging operation when there are no resources attached to the
+ * resource pool.
+ */
+static void
+uncharge_cg_locked(struct rdma_cgroup *cg,
+                   struct rdmacg_device *device,
+                   enum rdmacg_resource_type index)
+{
+        struct rdmacg_resource_pool *rpool;
+
+        rpool = find_cg_rpool_locked(cg, device);
+
+        /*
+         * rpool cannot be null at this stage. Let the kernel operate in
+         * case there is a bug in the IB stack or the rdma controller,
+         * instead of crashing the system.
+         */
+        if (unlikely(!rpool)) {
+                pr_warn("Invalid device %p or rdma cgroup %p\n", device, cg);
+                return;
+        }
+
+        rpool->resources[index].usage--;
+
+        /*
+         * A negative count (or overflow) is invalid,
+         * it indicates a bug in the rdma controller.
+         */
+        WARN_ON_ONCE(rpool->resources[index].usage < 0);
+        rpool->usage_sum--;
+        if (rpool->usage_sum == 0 &&
+            rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+                /*
+                 * No user of the rpool and all entries are set to max, so
+                 * safe to delete this rpool.
+                 */
+                free_cg_rpool_locked(rpool);
+        }
+}
+
+/**
+ * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
+ * @cg: pointer to the cgroup from which to start uncharging
+ * @device: pointer to rdmacg device
+ * @stop_cg: cgroup at which to stop uncharging while traversing the
+ *           hierarchy
+ * @index: index of the resource to uncharge in cg in the given resource
+ *         pool
+ */
+static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
+                                      struct rdmacg_device *device,
+                                      struct rdma_cgroup *stop_cg,
+                                      enum rdmacg_resource_type index)
+{
+        struct rdma_cgroup *p;
+
+        mutex_lock(&rdmacg_mutex);
+
+        for (p = cg; p != stop_cg; p = parent_rdmacg(p))
+                uncharge_cg_locked(p, device, index);
+
+        mutex_unlock(&rdmacg_mutex);
+
+        css_put(&cg->css);
+}
+
+/**
+ * rdmacg_uncharge - hierarchically uncharge rdma resource count
+ * @cg: pointer to the cgroup that owns the resource
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in the cgroup in the given
+ *         resource pool
+ */
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+                     struct rdmacg_device *device,
+                     enum rdmacg_resource_type index)
+{
+        if (index >= RDMACG_RESOURCE_MAX)
+                return;
+
+        rdmacg_uncharge_hierarchy(cg, device, NULL, index);
+}
+EXPORT_SYMBOL(rdmacg_uncharge);
+
+/**
+ * rdmacg_try_charge - hierarchically try to charge the rdma resource
+ * @rdmacg: pointer to rdma cgroup which will own this resource
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to charge in cgroup (resource pool)
+ *
+ * This function charges resources in a hierarchical way. It will fail if
+ * the charge would cause the new value to exceed the hierarchical limit.
+ * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or
+ * -EINVAL. On success, *@rdmacg is set to the cgroup that owns the
+ * charge, which must be passed back when uncharging.
+ *
+ * The charger needs to account resources on two criteria:
+ * (a) per cgroup and (b) per device resource usage.
+ * Per cgroup resource usage ensures that the tasks of the cgroup don't
+ * cross the configured limits. Per device accounting provides granular
+ * configuration in multi-device usage. For the first resource, a
+ * resource pool is allocated in the hierarchy for each parent the walk
+ * comes across; thereafter the pool is already available, so subsequent
+ * charges and uncharges are much faster.
+ */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+                      struct rdmacg_device *device,
+                      enum rdmacg_resource_type index)
+{
+        struct rdma_cgroup *cg, *p;
+        struct rdmacg_resource_pool *rpool;
+        s64 new;
+        int ret = 0;
+
+        if (index >= RDMACG_RESOURCE_MAX)
+                return -EINVAL;
+
+        /*
+         * hold on to the css, as the cgroup can be removed but resource
+         * accounting happens on the css.
+         */
+        cg = get_current_rdmacg();
+
+        mutex_lock(&rdmacg_mutex);
+        for (p = cg; p; p = parent_rdmacg(p)) {
+                rpool = get_cg_rpool_locked(p, device);
+                if (IS_ERR(rpool)) {
+                        ret = PTR_ERR(rpool);
+                        goto err;
+                } else {
+                        new = rpool->resources[index].usage + 1;
+                        if (new > rpool->resources[index].max) {
+                                ret = -EAGAIN;
+                                goto err;
+                        } else {
+                                rpool->resources[index].usage = new;
+                                rpool->usage_sum++;
+                        }
+                }
+        }
+        mutex_unlock(&rdmacg_mutex);
+
+        *rdmacg = cg;
+        return 0;
+
+err:
+        mutex_unlock(&rdmacg_mutex);
+        rdmacg_uncharge_hierarchy(cg, device, p, index);
+        return ret;
+}
+EXPORT_SYMBOL(rdmacg_try_charge);
+
+/**
+ * rdmacg_register_device - register rdmacg device to rdma controller.
+ * @device: pointer to rdmacg device whose resources need to be accounted.
+ *
+ * If the IB stack wishes a device to participate in rdma cgroup resource
+ * tracking, it must invoke this API to register with the rdma cgroup
+ * before any user space application can start using the RDMA resources.
+ * Returns 0 on success.
+ */
+int rdmacg_register_device(struct rdmacg_device *device)
+{
+        INIT_LIST_HEAD(&device->dev_node);
+        INIT_LIST_HEAD(&device->rpools);
+
+        mutex_lock(&rdmacg_mutex);
+        list_add_tail(&device->dev_node, &rdmacg_devices);
+        mutex_unlock(&rdmacg_mutex);
+        return 0;
+}
+EXPORT_SYMBOL(rdmacg_register_device);
+
+/**
+ * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
+ * @device: pointer to rdmacg device which was previously registered with
+ *          the rdma controller using rdmacg_register_device().
+ *
+ * The IB stack must invoke this after all the resources of the IB device
+ * are destroyed and after ensuring that no more resources will be created
+ * when this API is invoked.
+ */
+void rdmacg_unregister_device(struct rdmacg_device *device)
+{
+        struct rdmacg_resource_pool *rpool, *tmp;
+
+        /*
+         * Synchronize with any active resource settings and usage
+         * queries happening via the cgroup file system.
+         */
+        mutex_lock(&rdmacg_mutex);
+        list_del_init(&device->dev_node);
+
+        /*
+         * Now that this device is off the cgroup list, it's safe to free
+         * all the rpool resources.
+         */
+        list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
+                free_cg_rpool_locked(rpool);
+
+        mutex_unlock(&rdmacg_mutex);
+}
+EXPORT_SYMBOL(rdmacg_unregister_device);
+
+static int parse_resource(char *c, int *intval)
+{
+        substring_t argstr;
+        const char **table = &rdmacg_resource_names[0];
+        char *name, *value = c;
+        size_t len;
+        int ret, i = 0;
+
+        name = strsep(&value, "=");
+        if (!name || !value)
+                return -EINVAL;
+
+        len = strlen(value);
+
+        for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+                if (strcmp(table[i], name))
+                        continue;
+
+                argstr.from = value;
+                argstr.to = value + len;
+
+                ret = match_int(&argstr, intval);
+                if (ret >= 0) {
+                        if (*intval < 0)
+                                break;
+                        return i;
+                }
+                if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
+                        *intval = S32_MAX;
+                        return i;
+                }
+                break;
+        }
+        return -EINVAL;
+}
+
+static int rdmacg_parse_limits(char *options,
+                               int *new_limits, unsigned long *enables)
+{
+        char *c;
+        int err = -EINVAL;
+
+        /* parse resource options */
+        while ((c = strsep(&options, " ")) != NULL) {
+                int index, intval;
+
+                index = parse_resource(c, &intval);
+                if (index < 0)
+                        goto err;
+
+                new_limits[index] = intval;
+                *enables |= BIT(index);
+        }
+        return 0;
+
+err:
+        return err;
+}
+
+static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
+{
+        struct rdmacg_device *device;
+
+        lockdep_assert_held(&rdmacg_mutex);
+
+        list_for_each_entry(device, &rdmacg_devices, dev_node)
+                if (!strcmp(name, device->name))
+                        return device;
+
+        return NULL;
+}
+
+static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
+                                       char *buf, size_t nbytes, loff_t off)
+{
+        struct rdma_cgroup *cg = css_rdmacg(of_css(of));
+        const char *dev_name;
+        struct rdmacg_resource_pool *rpool;
+        struct rdmacg_device *device;
+        char *options = strstrip(buf);
+        int *new_limits;
+        unsigned long enables = 0;
+        int i = 0, ret = 0;
+
+        /* extract the device name first */
+        dev_name = strsep(&options, " ");
+        if (!dev_name) {
+                ret = -EINVAL;
+                goto err;
+        }
+
+        new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
+        if (!new_limits) {
+                ret = -ENOMEM;
+                goto err;
+        }
+
+        ret = rdmacg_parse_limits(options, new_limits, &enables);
+        if (ret)
+                goto parse_err;
+
+        /* acquire lock to synchronize with hot plug devices */
+        mutex_lock(&rdmacg_mutex);
+
+        device = rdmacg_get_device_locked(dev_name);
+        if (!device) {
+                ret = -ENODEV;
+                goto dev_err;
+        }
+
+        rpool = get_cg_rpool_locked(cg, device);
+        if (IS_ERR(rpool)) {
+                ret = PTR_ERR(rpool);
+                goto dev_err;
+        }
+
+        /* now set the new limits on the rpool */
+        for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
+                set_resource_limit(rpool, i, new_limits[i]);
+
+        if (rpool->usage_sum == 0 &&
+            rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+                /*
+                 * No user of the rpool and all entries are set to max, so
+                 * safe to delete this rpool.
+                 */
+                free_cg_rpool_locked(rpool);
+        }
+
+dev_err:
+        mutex_unlock(&rdmacg_mutex);
+
+parse_err:
+        kfree(new_limits);
+
+err:
+        return ret ?: nbytes;
+}
+
+static void print_rpool_values(struct seq_file *sf,
+                               struct rdmacg_resource_pool *rpool)
+{
+        enum rdmacg_file_type sf_type;
+        int i;
+        u32 value;
+
+        sf_type = seq_cft(sf)->private;
+
+        for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+                seq_puts(sf, rdmacg_resource_names[i]);
+                seq_putc(sf, '=');
+                if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
+                        if (rpool)
+                                value = rpool->resources[i].max;
+                        else
+                                value = S32_MAX;
+                } else {
+                        if (rpool)
+                                value = rpool->resources[i].usage;
+                        else
+                                value = 0;
+                }
+
+                if (value == S32_MAX)
+                        seq_puts(sf, RDMACG_MAX_STR);
+                else
+                        seq_printf(sf, "%d", value);
+                seq_putc(sf, ' ');
+        }
+}
+
+static int rdmacg_resource_read(struct seq_file *sf, void *v)
+{
+        struct rdmacg_device *device;
+        struct rdmacg_resource_pool *rpool;
+        struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
+
+        mutex_lock(&rdmacg_mutex);
+
+        list_for_each_entry(device, &rdmacg_devices, dev_node) {
+                seq_printf(sf, "%s ", device->name);
+
+                rpool = find_cg_rpool_locked(cg, device);
+                print_rpool_values(sf, rpool);
+
+                seq_putc(sf, '\n');
+        }
+
+        mutex_unlock(&rdmacg_mutex);
+        return 0;
+}
+
+static struct cftype rdmacg_files[] = {
+        {
+                .name = "max",
+                .write = rdmacg_resource_set_max,
+                .seq_show = rdmacg_resource_read,
+                .private = RDMACG_RESOURCE_TYPE_MAX,
+                .flags = CFTYPE_NOT_ON_ROOT,
+        },
+        {
+                .name = "current",
+                .seq_show = rdmacg_resource_read,
+                .private = RDMACG_RESOURCE_TYPE_STAT,
+                .flags = CFTYPE_NOT_ON_ROOT,
+        },
+        { }	/* terminate */
+};
+
+static struct cgroup_subsys_state *
+rdmacg_css_alloc(struct cgroup_subsys_state *parent)
+{
+        struct rdma_cgroup *cg;
+
+        cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+        if (!cg)
+                return ERR_PTR(-ENOMEM);
+
+        INIT_LIST_HEAD(&cg->rpools);
+        return &cg->css;
+}
+
+static void rdmacg_css_free(struct cgroup_subsys_state *css)
+{
+        struct rdma_cgroup *cg = css_rdmacg(css);
+
+        kfree(cg);
+}
+
+/**
+ * rdmacg_css_offline - cgroup css_offline callback
+ * @css: css of interest
+ *
+ * This function is called when @css is about to go away and is
+ * responsible for shooting down all the rdmacg state associated with
+ * @css. As part of that it sets all the resource pool limits to the max
+ * value, so that when resources are uncharged, the associated resource
+ * pools can be freed as well.
+ */
+static void rdmacg_css_offline(struct cgroup_subsys_state *css)
+{
+        struct rdma_cgroup *cg = css_rdmacg(css);
+        struct rdmacg_resource_pool *rpool;
+
+        mutex_lock(&rdmacg_mutex);
+
+        list_for_each_entry(rpool, &cg->rpools, cg_node)
+                set_all_resource_max_limit(rpool);
+
+        mutex_unlock(&rdmacg_mutex);
+}
+
+struct cgroup_subsys rdma_cgrp_subsys = {
+        .css_alloc	= rdmacg_css_alloc,
+        .css_free	= rdmacg_css_free,
+        .css_offline	= rdmacg_css_offline,
+        .legacy_cftypes	= rdmacg_files,
+        .dfl_cftypes	= rdmacg_files,
+};
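
To make the hierarchical walk in rdmacg_try_charge() and
rdmacg_uncharge_hierarchy() concrete, here is a self-contained userspace
model (illustrative only: single device, single resource, no locking). It
shows a charge either succeeding in every ancestor or being rolled back up
to the point of failure, mirroring the err path above:

#include <stdio.h>

struct model_cg {
        struct model_cg *parent;
        int usage, max;
};

static int model_try_charge(struct model_cg *cg)
{
        struct model_cg *p, *q;

        /* Charge every level from cg up to the root. */
        for (p = cg; p; p = p->parent) {
                if (p->usage + 1 > p->max)
                        goto rollback;
                p->usage++;
        }
        return 0;

rollback:
        /* Uncharge every ancestor already charged, stopping at p. */
        for (q = cg; q != p; q = q->parent)
                q->usage--;
        return -1;
}

int main(void)
{
        struct model_cg root = { NULL, 0, 1 };   /* root limit: 1 */
        struct model_cg child = { &root, 0, 10 };

        printf("first charge: %d\n", model_try_charge(&child));  /* 0 */
        printf("second charge: %d\n", model_try_charge(&child)); /* -1, root at max */
        printf("child usage: %d (second charge rolled back)\n", child.usage);
        return 0;
}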