diff options
| -rw-r--r-- | Documentation/cgroup-v1/rdma.txt | 109 | ||||
| -rw-r--r-- | Documentation/cgroup-v2.txt | 46 | ||||
| -rw-r--r-- | drivers/infiniband/core/Makefile | 1 | ||||
| -rw-r--r-- | drivers/infiniband/core/cgroup.c | 62 | ||||
| -rw-r--r-- | drivers/infiniband/core/core_priv.h | 30 | ||||
| -rw-r--r-- | drivers/infiniband/core/device.c | 10 | ||||
| -rw-r--r-- | drivers/infiniband/core/uverbs_cmd.c | 102 | ||||
| -rw-r--r-- | drivers/infiniband/core/uverbs_main.c | 20 | ||||
| -rw-r--r-- | include/linux/cgroup_rdma.h | 53 | ||||
| -rw-r--r-- | include/linux/cgroup_subsys.h | 4 | ||||
| -rw-r--r-- | include/rdma/ib_verbs.h | 14 | ||||
| -rw-r--r-- | init/Kconfig | 10 | ||||
| -rw-r--r-- | kernel/cgroup/Makefile | 1 | ||||
| -rw-r--r-- | kernel/cgroup/rdma.c | 619 |
14 files changed, 1071 insertions, 10 deletions
diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt new file mode 100644 index 000000000000..af618171e0eb --- /dev/null +++ b/Documentation/cgroup-v1/rdma.txt | |||
| @@ -0,0 +1,109 @@ | |||
| 1 | RDMA Controller | ||
| 2 | ---------------- | ||
| 3 | |||
| 4 | Contents | ||
| 5 | -------- | ||
| 6 | |||
| 7 | 1. Overview | ||
| 8 | 1-1. What is RDMA controller? | ||
| 9 | 1-2. Why RDMA controller needed? | ||
| 10 | 1-3. How is RDMA controller implemented? | ||
| 11 | 2. Usage Examples | ||
| 12 | |||
| 13 | 1. Overview | ||
| 14 | |||
| 15 | 1-1. What is RDMA controller? | ||
| 16 | ----------------------------- | ||
| 17 | |||
| 18 | RDMA controller allows user to limit RDMA/IB specific resources that a given | ||
| 19 | set of processes can use. These processes are grouped using RDMA controller. | ||
| 20 | |||
| 21 | RDMA controller defines two resources which can be limited for processes of a | ||
| 22 | cgroup. | ||
| 23 | |||
| 24 | 1-2. Why RDMA controller needed? | ||
| 25 | -------------------------------- | ||
| 26 | |||
| 27 | Currently user space applications can easily take away all the rdma verb | ||
| 28 | specific resources such as AH, CQ, QP, MR etc. Due to which other applications | ||
| 29 | in other cgroup or kernel space ULPs may not even get chance to allocate any | ||
| 30 | rdma resources. This can lead to service unavailability. | ||
| 31 | |||
| 32 | Therefore RDMA controller is needed through which resource consumption | ||
| 33 | of processes can be limited. Through this controller different rdma | ||
| 34 | resources can be accounted. | ||
| 35 | |||
| 36 | 1-3. How is RDMA controller implemented? | ||
| 37 | ---------------------------------------- | ||
| 38 | |||
| 39 | RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains | ||
| 40 | resource accounting per cgroup, per device using resource pool structure. | ||
| 41 | Each such resource pool is limited up to 64 resources in given resource pool | ||
| 42 | by rdma cgroup, which can be extended later if required. | ||
| 43 | |||
| 44 | This resource pool object is linked to the cgroup css. Typically there | ||
| 45 | are 0 to 4 resource pool instances per cgroup, per device in most use cases. | ||
| 46 | But nothing prevents having more. At present hundreds of RDMA devices per | ||
| 47 | single cgroup may not be handled optimally, however there is no | ||
| 48 | known use case or requirement for such configuration either. | ||
| 49 | |||
| 50 | Since RDMA resources can be allocated from any process and can be freed by any | ||
| 51 | of the child processes which shares the address space, rdma resources are | ||
| 52 | always owned by the creator cgroup css. This allows process migration from one | ||
| 53 | to other cgroup without major complexity of transferring resource ownership; | ||
| 54 | because such ownership is not really present due to shared nature of | ||
| 55 | rdma resources. Linking resources around css also ensures that cgroups can be | ||
| 56 | deleted after processes migrated. This allows process migration as well with | ||
| 57 | active resources, even though that is not a primary use case. | ||
| 58 | |||
| 59 | Whenever RDMA resource charging occurs, owner rdma cgroup is returned to | ||
| 60 | the caller. Same rdma cgroup should be passed while uncharging the resource. | ||
| 61 | This also allows a process migrated with active RDMA resources to charge | ||
| 62 | new resources to the new owner cgroup. It also allows uncharging resources | ||
| 63 | of a process from the previously charged cgroup after it has migrated to a | ||
| 64 | new cgroup, even though that is not a primary use case. | ||
| 65 | |||
| 66 | Resource pool object is created in following situations. | ||
| 67 | (a) User sets the limit and no previous resource pool exist for the device | ||
| 68 | of interest for the cgroup. | ||
| 69 | (b) No resource limits were configured, but the IB/RDMA stack tries to | ||
| 70 | charge the resource. This ensures resources are correctly uncharged when | ||
| 71 | applications run without limits and limits are enforced later during | ||
| 72 | uncharging; otherwise the usage count would drop below zero. | ||
| 73 | |||
| 74 | Resource pool is destroyed if all the resource limits are set to max and | ||
| 75 | it is the last resource getting deallocated. | ||
| 76 | |||
| 77 | User should set all the limits to max value if it intends to remove/unconfigure | ||
| 78 | the resource pool for a particular device. | ||
| 79 | |||
| 80 | IB stack honors limits enforced by the rdma controller. When an application | ||
| 81 | queries the maximum resource limits of an IB device, it returns the minimum of | ||
| 82 | what is configured by user for a given cgroup and what is supported by | ||
| 83 | IB device. | ||
| 84 | |||
| 85 | Following resources can be accounted by rdma controller. | ||
| 86 | hca_handle Maximum number of HCA Handles | ||
| 87 | hca_object Maximum number of HCA Objects | ||
| 88 | |||
| 89 | 2. Usage Examples | ||
| 90 | ----------------- | ||
| 91 | |||
| 92 | (a) Configure resource limit: | ||
| 93 | echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max | ||
| 94 | echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max | ||
| 95 | |||
| 96 | (b) Query resource limit: | ||
| 97 | cat /sys/fs/cgroup/rdma/2/rdma.max | ||
| 98 | #Output: | ||
| 99 | mlx4_0 hca_handle=2 hca_object=2000 | ||
| 100 | ocrdma1 hca_handle=3 hca_object=max | ||
| 101 | |||
| 102 | (c) Query current usage: | ||
| 103 | cat /sys/fs/cgroup/rdma/2/rdma.current | ||
| 104 | #Output: | ||
| 105 | mlx4_0 hca_handle=1 hca_object=20 | ||
| 106 | ocrdma1 hca_handle=1 hca_object=23 | ||
| 107 | |||
| 108 | (d) Delete resource limit: | ||
| 109 | echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max | ||
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 1d101423ca92..3b8449f8ac7e 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt | |||
| @@ -49,8 +49,10 @@ CONTENTS | |||
| 49 | 5-3-2. Writeback | 49 | 5-3-2. Writeback |
| 50 | 5-4. PID | 50 | 5-4. PID |
| 51 | 5-4-1. PID Interface Files | 51 | 5-4-1. PID Interface Files |
| 52 | 5-5. Misc | 52 | 5-5. RDMA |
| 53 | 5-5-1. perf_event | 53 | 5-5-1. RDMA Interface Files |
| 54 | 5-6. Misc | ||
| 55 | 5-6-1. perf_event | ||
| 54 | 6. Namespace | 56 | 6. Namespace |
| 55 | 6-1. Basics | 57 | 6-1. Basics |
| 56 | 6-2. The Root and Views | 58 | 6-2. The Root and Views |
| @@ -1160,9 +1162,45 @@ through fork() or clone(). These will return -EAGAIN if the creation | |||
| 1160 | of a new process would cause a cgroup policy to be violated. | 1162 | of a new process would cause a cgroup policy to be violated. |
| 1161 | 1163 | ||
| 1162 | 1164 | ||
| 1163 | 5-5. Misc | 1165 | 5-5. RDMA |
| 1164 | 1166 | ||
| 1165 | 5-5-1. perf_event | 1167 | The "rdma" controller regulates the distribution and accounting of |
| 1168 | RDMA resources. | ||
| 1169 | |||
| 1170 | 5-5-1. RDMA Interface Files | ||
| 1171 | |||
| 1172 | rdma.max | ||
| 1173 | A readwrite nested-keyed file that exists for all the cgroups | ||
| 1174 | except root that describes current configured resource limit | ||
| 1175 | for a RDMA/IB device. | ||
| 1176 | |||
| 1177 | Lines are keyed by device name and are not ordered. | ||
| 1178 | Each line contains space separated resource name and its configured | ||
| 1179 | limit that can be distributed. | ||
| 1180 | |||
| 1181 | The following nested keys are defined. | ||
| 1182 | |||
| 1183 | hca_handle Maximum number of HCA Handles | ||
| 1184 | hca_object Maximum number of HCA Objects | ||
| 1185 | |||
| 1186 | An example for mlx4 and ocrdma device follows. | ||
| 1187 | |||
| 1188 | mlx4_0 hca_handle=2 hca_object=2000 | ||
| 1189 | ocrdma1 hca_handle=3 hca_object=max | ||
| 1190 | |||
| 1191 | rdma.current | ||
| 1192 | A read-only file that describes current resource usage. | ||
| 1193 | It exists for all the cgroups except root. | ||
| 1194 | |||
| 1195 | An example for mlx4 and ocrdma device follows. | ||
| 1196 | |||
| 1197 | mlx4_0 hca_handle=1 hca_object=20 | ||
| 1198 | ocrdma1 hca_handle=1 hca_object=23 | ||
| 1199 | |||
| 1200 | |||
| 1201 | 5-6. Misc | ||
| 1202 | |||
| 1203 | 5-6-1. perf_event | ||
| 1166 | 1204 | ||
| 1167 | perf_event controller, if not mounted on a legacy hierarchy, is | 1205 | perf_event controller, if not mounted on a legacy hierarchy, is |
| 1168 | automatically enabled on the v2 hierarchy so that perf events can | 1206 | automatically enabled on the v2 hierarchy so that perf events can |
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index edaae9f9853c..e426ac877d19 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile | |||
| @@ -13,6 +13,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ | |||
| 13 | multicast.o mad.o smi.o agent.o mad_rmpp.o | 13 | multicast.o mad.o smi.o agent.o mad_rmpp.o |
| 14 | ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o | 14 | ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o |
| 15 | ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o | 15 | ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o |
| 16 | ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o | ||
| 16 | 17 | ||
| 17 | ib_cm-y := cm.o | 18 | ib_cm-y := cm.o |
| 18 | 19 | ||
diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c new file mode 100644 index 000000000000..126ac5f99db7 --- /dev/null +++ b/drivers/infiniband/core/cgroup.c | |||
| @@ -0,0 +1,62 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify it | ||
| 5 | * under the terms and conditions of the GNU General Public License, | ||
| 6 | * version 2, as published by the Free Software Foundation. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
| 9 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 10 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 11 | * more details. | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include "core_priv.h" | ||
| 15 | |||
| 16 | /** | ||
| 17 | * ib_device_register_rdmacg - register with rdma cgroup. | ||
| 18 | * @device: device to register to participate in resource | ||
| 19 | * accounting by rdma cgroup. | ||
| 20 | * | ||
| 21 | * Register with the rdma cgroup. Should be called before | ||
| 22 | * exposing rdma device to user space applications to avoid | ||
| 23 | * resource accounting leak. | ||
| 24 | * Returns 0 on success or otherwise failure code. | ||
| 25 | */ | ||
| 26 | int ib_device_register_rdmacg(struct ib_device *device) | ||
| 27 | { | ||
| 28 | device->cg_device.name = device->name; | ||
| 29 | return rdmacg_register_device(&device->cg_device); | ||
| 30 | } | ||
| 31 | |||
| 32 | /** | ||
| 33 | * ib_device_unregister_rdmacg - unregister with rdma cgroup. | ||
| 34 | * @device: device to unregister. | ||
| 35 | * | ||
| 36 | * Unregister with the rdma cgroup. Should be called after | ||
| 37 | * all the resources are deallocated, and after a stage when any | ||
| 38 | * other resource allocation by user application cannot be done | ||
| 39 | * for this device to avoid any leak in accounting. | ||
| 40 | */ | ||
| 41 | void ib_device_unregister_rdmacg(struct ib_device *device) | ||
| 42 | { | ||
| 43 | rdmacg_unregister_device(&device->cg_device); | ||
| 44 | } | ||
| 45 | |||
| 46 | int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, | ||
| 47 | struct ib_device *device, | ||
| 48 | enum rdmacg_resource_type resource_index) | ||
| 49 | { | ||
| 50 | return rdmacg_try_charge(&cg_obj->cg, &device->cg_device, | ||
| 51 | resource_index); | ||
| 52 | } | ||
| 53 | EXPORT_SYMBOL(ib_rdmacg_try_charge); | ||
| 54 | |||
| 55 | void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, | ||
| 56 | struct ib_device *device, | ||
| 57 | enum rdmacg_resource_type resource_index) | ||
| 58 | { | ||
| 59 | rdmacg_uncharge(cg_obj->cg, &device->cg_device, | ||
| 60 | resource_index); | ||
| 61 | } | ||
| 62 | EXPORT_SYMBOL(ib_rdmacg_uncharge); | ||
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index d29372624f3a..389f6192bddc 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h | |||
| @@ -35,6 +35,7 @@ | |||
| 35 | 35 | ||
| 36 | #include <linux/list.h> | 36 | #include <linux/list.h> |
| 37 | #include <linux/spinlock.h> | 37 | #include <linux/spinlock.h> |
| 38 | #include <linux/cgroup_rdma.h> | ||
| 38 | 39 | ||
| 39 | #include <rdma/ib_verbs.h> | 40 | #include <rdma/ib_verbs.h> |
| 40 | 41 | ||
| @@ -121,6 +122,35 @@ int ib_cache_setup_one(struct ib_device *device); | |||
| 121 | void ib_cache_cleanup_one(struct ib_device *device); | 122 | void ib_cache_cleanup_one(struct ib_device *device); |
| 122 | void ib_cache_release_one(struct ib_device *device); | 123 | void ib_cache_release_one(struct ib_device *device); |
| 123 | 124 | ||
| 125 | #ifdef CONFIG_CGROUP_RDMA | ||
| 126 | int ib_device_register_rdmacg(struct ib_device *device); | ||
| 127 | void ib_device_unregister_rdmacg(struct ib_device *device); | ||
| 128 | |||
| 129 | int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, | ||
| 130 | struct ib_device *device, | ||
| 131 | enum rdmacg_resource_type resource_index); | ||
| 132 | |||
| 133 | void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, | ||
| 134 | struct ib_device *device, | ||
| 135 | enum rdmacg_resource_type resource_index); | ||
| 136 | #else | ||
| 137 | static inline int ib_device_register_rdmacg(struct ib_device *device) | ||
| 138 | { return 0; } | ||
| 139 | |||
| 140 | static inline void ib_device_unregister_rdmacg(struct ib_device *device) | ||
| 141 | { } | ||
| 142 | |||
| 143 | static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, | ||
| 144 | struct ib_device *device, | ||
| 145 | enum rdmacg_resource_type resource_index) | ||
| 146 | { return 0; } | ||
| 147 | |||
| 148 | static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, | ||
| 149 | struct ib_device *device, | ||
| 150 | enum rdmacg_resource_type resource_index) | ||
| 151 | { } | ||
| 152 | #endif | ||
| 153 | |||
| 124 | static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, | 154 | static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, |
| 125 | struct net_device *upper) | 155 | struct net_device *upper) |
| 126 | { | 156 | { |
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 571974cd3919..70065386acbc 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c | |||
| @@ -360,10 +360,18 @@ int ib_register_device(struct ib_device *device, | |||
| 360 | goto out; | 360 | goto out; |
| 361 | } | 361 | } |
| 362 | 362 | ||
| 363 | ret = ib_device_register_rdmacg(device); | ||
| 364 | if (ret) { | ||
| 365 | pr_warn("Couldn't register device with rdma cgroup\n"); | ||
| 366 | ib_cache_cleanup_one(device); | ||
| 367 | goto out; | ||
| 368 | } | ||
| 369 | |||
| 363 | memset(&device->attrs, 0, sizeof(device->attrs)); | 370 | memset(&device->attrs, 0, sizeof(device->attrs)); |
| 364 | ret = device->query_device(device, &device->attrs, &uhw); | 371 | ret = device->query_device(device, &device->attrs, &uhw); |
| 365 | if (ret) { | 372 | if (ret) { |
| 366 | pr_warn("Couldn't query the device attributes\n"); | 373 | pr_warn("Couldn't query the device attributes\n"); |
| 374 | ib_device_unregister_rdmacg(device); | ||
| 367 | ib_cache_cleanup_one(device); | 375 | ib_cache_cleanup_one(device); |
| 368 | goto out; | 376 | goto out; |
| 369 | } | 377 | } |
| @@ -372,6 +380,7 @@ int ib_register_device(struct ib_device *device, | |||
| 372 | if (ret) { | 380 | if (ret) { |
| 373 | pr_warn("Couldn't register device %s with driver model\n", | 381 | pr_warn("Couldn't register device %s with driver model\n", |
| 374 | device->name); | 382 | device->name); |
| 383 | ib_device_unregister_rdmacg(device); | ||
| 375 | ib_cache_cleanup_one(device); | 384 | ib_cache_cleanup_one(device); |
| 376 | goto out; | 385 | goto out; |
| 377 | } | 386 | } |
| @@ -421,6 +430,7 @@ void ib_unregister_device(struct ib_device *device) | |||
| 421 | 430 | ||
| 422 | mutex_unlock(&device_mutex); | 431 | mutex_unlock(&device_mutex); |
| 423 | 432 | ||
| 433 | ib_device_unregister_rdmacg(device); | ||
| 424 | ib_device_unregister_sysfs(device); | 434 | ib_device_unregister_sysfs(device); |
| 425 | ib_cache_cleanup_one(device); | 435 | ib_cache_cleanup_one(device); |
| 426 | 436 | ||
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 700782203483..33bc88a38574 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c | |||
| @@ -316,6 +316,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, | |||
| 316 | struct ib_udata udata; | 316 | struct ib_udata udata; |
| 317 | struct ib_ucontext *ucontext; | 317 | struct ib_ucontext *ucontext; |
| 318 | struct file *filp; | 318 | struct file *filp; |
| 319 | struct ib_rdmacg_object cg_obj; | ||
| 319 | int ret; | 320 | int ret; |
| 320 | 321 | ||
| 321 | if (out_len < sizeof resp) | 322 | if (out_len < sizeof resp) |
| @@ -335,13 +336,18 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, | |||
| 335 | (unsigned long) cmd.response + sizeof resp, | 336 | (unsigned long) cmd.response + sizeof resp, |
| 336 | in_len - sizeof cmd, out_len - sizeof resp); | 337 | in_len - sizeof cmd, out_len - sizeof resp); |
| 337 | 338 | ||
| 339 | ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); | ||
| 340 | if (ret) | ||
| 341 | goto err; | ||
| 342 | |||
| 338 | ucontext = ib_dev->alloc_ucontext(ib_dev, &udata); | 343 | ucontext = ib_dev->alloc_ucontext(ib_dev, &udata); |
| 339 | if (IS_ERR(ucontext)) { | 344 | if (IS_ERR(ucontext)) { |
| 340 | ret = PTR_ERR(ucontext); | 345 | ret = PTR_ERR(ucontext); |
| 341 | goto err; | 346 | goto err_alloc; |
| 342 | } | 347 | } |
| 343 | 348 | ||
| 344 | ucontext->device = ib_dev; | 349 | ucontext->device = ib_dev; |
| 350 | ucontext->cg_obj = cg_obj; | ||
| 345 | INIT_LIST_HEAD(&ucontext->pd_list); | 351 | INIT_LIST_HEAD(&ucontext->pd_list); |
| 346 | INIT_LIST_HEAD(&ucontext->mr_list); | 352 | INIT_LIST_HEAD(&ucontext->mr_list); |
| 347 | INIT_LIST_HEAD(&ucontext->mw_list); | 353 | INIT_LIST_HEAD(&ucontext->mw_list); |
| @@ -407,6 +413,9 @@ err_free: | |||
| 407 | put_pid(ucontext->tgid); | 413 | put_pid(ucontext->tgid); |
| 408 | ib_dev->dealloc_ucontext(ucontext); | 414 | ib_dev->dealloc_ucontext(ucontext); |
| 409 | 415 | ||
| 416 | err_alloc: | ||
| 417 | ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); | ||
| 418 | |||
| 410 | err: | 419 | err: |
| 411 | mutex_unlock(&file->mutex); | 420 | mutex_unlock(&file->mutex); |
| 412 | return ret; | 421 | return ret; |
| @@ -561,6 +570,13 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, | |||
| 561 | return -ENOMEM; | 570 | return -ENOMEM; |
| 562 | 571 | ||
| 563 | init_uobj(uobj, 0, file->ucontext, &pd_lock_class); | 572 | init_uobj(uobj, 0, file->ucontext, &pd_lock_class); |
| 573 | ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev, | ||
| 574 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 575 | if (ret) { | ||
| 576 | kfree(uobj); | ||
| 577 | return ret; | ||
| 578 | } | ||
| 579 | |||
| 564 | down_write(&uobj->mutex); | 580 | down_write(&uobj->mutex); |
| 565 | 581 | ||
| 566 | pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata); | 582 | pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata); |
| @@ -605,6 +621,7 @@ err_idr: | |||
| 605 | ib_dealloc_pd(pd); | 621 | ib_dealloc_pd(pd); |
| 606 | 622 | ||
| 607 | err: | 623 | err: |
| 624 | ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); | ||
| 608 | put_uobj_write(uobj); | 625 | put_uobj_write(uobj); |
| 609 | return ret; | 626 | return ret; |
| 610 | } | 627 | } |
| @@ -637,6 +654,8 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, | |||
| 637 | if (ret) | 654 | if (ret) |
| 638 | goto err_put; | 655 | goto err_put; |
| 639 | 656 | ||
| 657 | ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); | ||
| 658 | |||
| 640 | uobj->live = 0; | 659 | uobj->live = 0; |
| 641 | put_uobj_write(uobj); | 660 | put_uobj_write(uobj); |
| 642 | 661 | ||
| @@ -1006,6 +1025,10 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, | |||
| 1006 | goto err_put; | 1025 | goto err_put; |
| 1007 | } | 1026 | } |
| 1008 | } | 1027 | } |
| 1028 | ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev, | ||
| 1029 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 1030 | if (ret) | ||
| 1031 | goto err_charge; | ||
| 1009 | 1032 | ||
| 1010 | mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, | 1033 | mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, |
| 1011 | cmd.access_flags, &udata); | 1034 | cmd.access_flags, &udata); |
| @@ -1054,6 +1077,9 @@ err_unreg: | |||
| 1054 | ib_dereg_mr(mr); | 1077 | ib_dereg_mr(mr); |
| 1055 | 1078 | ||
| 1056 | err_put: | 1079 | err_put: |
| 1080 | ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); | ||
| 1081 | |||
| 1082 | err_charge: | ||
| 1057 | put_pd_read(pd); | 1083 | put_pd_read(pd); |
| 1058 | 1084 | ||
| 1059 | err_free: | 1085 | err_free: |
| @@ -1178,6 +1204,8 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, | |||
| 1178 | if (ret) | 1204 | if (ret) |
| 1179 | return ret; | 1205 | return ret; |
| 1180 | 1206 | ||
| 1207 | ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); | ||
| 1208 | |||
| 1181 | idr_remove_uobj(&ib_uverbs_mr_idr, uobj); | 1209 | idr_remove_uobj(&ib_uverbs_mr_idr, uobj); |
| 1182 | 1210 | ||
| 1183 | mutex_lock(&file->mutex); | 1211 | mutex_lock(&file->mutex); |
| @@ -1226,6 +1254,11 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, | |||
| 1226 | in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), | 1254 | in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), |
| 1227 | out_len - sizeof(resp)); | 1255 | out_len - sizeof(resp)); |
| 1228 | 1256 | ||
| 1257 | ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev, | ||
| 1258 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 1259 | if (ret) | ||
| 1260 | goto err_charge; | ||
| 1261 | |||
| 1229 | mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata); | 1262 | mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata); |
| 1230 | if (IS_ERR(mw)) { | 1263 | if (IS_ERR(mw)) { |
| 1231 | ret = PTR_ERR(mw); | 1264 | ret = PTR_ERR(mw); |
| @@ -1271,6 +1304,9 @@ err_unalloc: | |||
| 1271 | uverbs_dealloc_mw(mw); | 1304 | uverbs_dealloc_mw(mw); |
| 1272 | 1305 | ||
| 1273 | err_put: | 1306 | err_put: |
| 1307 | ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); | ||
| 1308 | |||
| 1309 | err_charge: | ||
| 1274 | put_pd_read(pd); | 1310 | put_pd_read(pd); |
| 1275 | 1311 | ||
| 1276 | err_free: | 1312 | err_free: |
| @@ -1306,6 +1342,8 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, | |||
| 1306 | if (ret) | 1342 | if (ret) |
| 1307 | return ret; | 1343 | return ret; |
| 1308 | 1344 | ||
| 1345 | ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); | ||
| 1346 | |||
| 1309 | idr_remove_uobj(&ib_uverbs_mw_idr, uobj); | 1347 | idr_remove_uobj(&ib_uverbs_mw_idr, uobj); |
| 1310 | 1348 | ||
| 1311 | mutex_lock(&file->mutex); | 1349 | mutex_lock(&file->mutex); |
| @@ -1405,6 +1443,11 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, | |||
| 1405 | if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags)) | 1443 | if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags)) |
| 1406 | attr.flags = cmd->flags; | 1444 | attr.flags = cmd->flags; |
| 1407 | 1445 | ||
| 1446 | ret = ib_rdmacg_try_charge(&obj->uobject.cg_obj, ib_dev, | ||
| 1447 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 1448 | if (ret) | ||
| 1449 | goto err_charge; | ||
| 1450 | |||
| 1408 | cq = ib_dev->create_cq(ib_dev, &attr, | 1451 | cq = ib_dev->create_cq(ib_dev, &attr, |
| 1409 | file->ucontext, uhw); | 1452 | file->ucontext, uhw); |
| 1410 | if (IS_ERR(cq)) { | 1453 | if (IS_ERR(cq)) { |
| @@ -1452,6 +1495,10 @@ err_free: | |||
| 1452 | ib_destroy_cq(cq); | 1495 | ib_destroy_cq(cq); |
| 1453 | 1496 | ||
| 1454 | err_file: | 1497 | err_file: |
| 1498 | ib_rdmacg_uncharge(&obj->uobject.cg_obj, ib_dev, | ||
| 1499 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 1500 | |||
| 1501 | err_charge: | ||
| 1455 | if (ev_file) | 1502 | if (ev_file) |
| 1456 | ib_uverbs_release_ucq(file, ev_file, obj); | 1503 | ib_uverbs_release_ucq(file, ev_file, obj); |
| 1457 | 1504 | ||
| @@ -1732,6 +1779,8 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, | |||
| 1732 | if (ret) | 1779 | if (ret) |
| 1733 | return ret; | 1780 | return ret; |
| 1734 | 1781 | ||
| 1782 | ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); | ||
| 1783 | |||
| 1735 | idr_remove_uobj(&ib_uverbs_cq_idr, uobj); | 1784 | idr_remove_uobj(&ib_uverbs_cq_idr, uobj); |
| 1736 | 1785 | ||
| 1737 | mutex_lock(&file->mutex); | 1786 | mutex_lock(&file->mutex); |
| @@ -1904,6 +1953,11 @@ static int create_qp(struct ib_uverbs_file *file, | |||
| 1904 | goto err_put; | 1953 | goto err_put; |
| 1905 | } | 1954 | } |
| 1906 | 1955 | ||
| 1956 | ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, device, | ||
| 1957 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 1958 | if (ret) | ||
| 1959 | goto err_put; | ||
| 1960 | |||
| 1907 | if (cmd->qp_type == IB_QPT_XRC_TGT) | 1961 | if (cmd->qp_type == IB_QPT_XRC_TGT) |
| 1908 | qp = ib_create_qp(pd, &attr); | 1962 | qp = ib_create_qp(pd, &attr); |
| 1909 | else | 1963 | else |
| @@ -1911,7 +1965,7 @@ static int create_qp(struct ib_uverbs_file *file, | |||
| 1911 | 1965 | ||
| 1912 | if (IS_ERR(qp)) { | 1966 | if (IS_ERR(qp)) { |
| 1913 | ret = PTR_ERR(qp); | 1967 | ret = PTR_ERR(qp); |
| 1914 | goto err_put; | 1968 | goto err_create; |
| 1915 | } | 1969 | } |
| 1916 | 1970 | ||
| 1917 | if (cmd->qp_type != IB_QPT_XRC_TGT) { | 1971 | if (cmd->qp_type != IB_QPT_XRC_TGT) { |
| @@ -1992,6 +2046,10 @@ err_cb: | |||
| 1992 | err_destroy: | 2046 | err_destroy: |
| 1993 | ib_destroy_qp(qp); | 2047 | ib_destroy_qp(qp); |
| 1994 | 2048 | ||
| 2049 | err_create: | ||
| 2050 | ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, device, | ||
| 2051 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 2052 | |||
| 1995 | err_put: | 2053 | err_put: |
| 1996 | if (xrcd) | 2054 | if (xrcd) |
| 1997 | put_xrcd_read(xrcd_uobj); | 2055 | put_xrcd_read(xrcd_uobj); |
| @@ -2518,6 +2576,8 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, | |||
| 2518 | if (ret) | 2576 | if (ret) |
| 2519 | return ret; | 2577 | return ret; |
| 2520 | 2578 | ||
| 2579 | ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); | ||
| 2580 | |||
| 2521 | if (obj->uxrcd) | 2581 | if (obj->uxrcd) |
| 2522 | atomic_dec(&obj->uxrcd->refcnt); | 2582 | atomic_dec(&obj->uxrcd->refcnt); |
| 2523 | 2583 | ||
| @@ -2969,11 +3029,16 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, | |||
| 2969 | memset(&attr.dmac, 0, sizeof(attr.dmac)); | 3029 | memset(&attr.dmac, 0, sizeof(attr.dmac)); |
| 2970 | memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16); | 3030 | memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16); |
| 2971 | 3031 | ||
| 3032 | ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev, | ||
| 3033 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 3034 | if (ret) | ||
| 3035 | goto err_charge; | ||
| 3036 | |||
| 2972 | ah = pd->device->create_ah(pd, &attr, &udata); | 3037 | ah = pd->device->create_ah(pd, &attr, &udata); |
| 2973 | 3038 | ||
| 2974 | if (IS_ERR(ah)) { | 3039 | if (IS_ERR(ah)) { |
| 2975 | ret = PTR_ERR(ah); | 3040 | ret = PTR_ERR(ah); |
| 2976 | goto err_put; | 3041 | goto err_create; |
| 2977 | } | 3042 | } |
| 2978 | 3043 | ||
| 2979 | ah->device = pd->device; | 3044 | ah->device = pd->device; |
| @@ -3012,7 +3077,10 @@ err_copy: | |||
| 3012 | err_destroy: | 3077 | err_destroy: |
| 3013 | ib_destroy_ah(ah); | 3078 | ib_destroy_ah(ah); |
| 3014 | 3079 | ||
| 3015 | err_put: | 3080 | err_create: |
| 3081 | ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); | ||
| 3082 | |||
| 3083 | err_charge: | ||
| 3016 | put_pd_read(pd); | 3084 | put_pd_read(pd); |
| 3017 | 3085 | ||
| 3018 | err: | 3086 | err: |
| @@ -3046,6 +3114,8 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, | |||
| 3046 | if (ret) | 3114 | if (ret) |
| 3047 | return ret; | 3115 | return ret; |
| 3048 | 3116 | ||
| 3117 | ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); | ||
| 3118 | |||
| 3049 | idr_remove_uobj(&ib_uverbs_ah_idr, uobj); | 3119 | idr_remove_uobj(&ib_uverbs_ah_idr, uobj); |
| 3050 | 3120 | ||
| 3051 | mutex_lock(&file->mutex); | 3121 | mutex_lock(&file->mutex); |
| @@ -3822,10 +3892,16 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, | |||
| 3822 | err = -EINVAL; | 3892 | err = -EINVAL; |
| 3823 | goto err_free; | 3893 | goto err_free; |
| 3824 | } | 3894 | } |
| 3895 | |||
| 3896 | err = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev, | ||
| 3897 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 3898 | if (err) | ||
| 3899 | goto err_free; | ||
| 3900 | |||
| 3825 | flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); | 3901 | flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); |
| 3826 | if (IS_ERR(flow_id)) { | 3902 | if (IS_ERR(flow_id)) { |
| 3827 | err = PTR_ERR(flow_id); | 3903 | err = PTR_ERR(flow_id); |
| 3828 | goto err_free; | 3904 | goto err_create; |
| 3829 | } | 3905 | } |
| 3830 | flow_id->uobject = uobj; | 3906 | flow_id->uobject = uobj; |
| 3831 | uobj->object = flow_id; | 3907 | uobj->object = flow_id; |
| @@ -3858,6 +3934,8 @@ err_copy: | |||
| 3858 | idr_remove_uobj(&ib_uverbs_rule_idr, uobj); | 3934 | idr_remove_uobj(&ib_uverbs_rule_idr, uobj); |
| 3859 | destroy_flow: | 3935 | destroy_flow: |
| 3860 | ib_destroy_flow(flow_id); | 3936 | ib_destroy_flow(flow_id); |
| 3937 | err_create: | ||
| 3938 | ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); | ||
| 3861 | err_free: | 3939 | err_free: |
| 3862 | kfree(flow_attr); | 3940 | kfree(flow_attr); |
| 3863 | err_put: | 3941 | err_put: |
| @@ -3897,8 +3975,11 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, | |||
| 3897 | flow_id = uobj->object; | 3975 | flow_id = uobj->object; |
| 3898 | 3976 | ||
| 3899 | ret = ib_destroy_flow(flow_id); | 3977 | ret = ib_destroy_flow(flow_id); |
| 3900 | if (!ret) | 3978 | if (!ret) { |
| 3979 | ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, | ||
| 3980 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 3901 | uobj->live = 0; | 3981 | uobj->live = 0; |
| 3982 | } | ||
| 3902 | 3983 | ||
| 3903 | put_uobj_write(uobj); | 3984 | put_uobj_write(uobj); |
| 3904 | 3985 | ||
| @@ -3966,6 +4047,11 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, | |||
| 3966 | obj->uevent.events_reported = 0; | 4047 | obj->uevent.events_reported = 0; |
| 3967 | INIT_LIST_HEAD(&obj->uevent.event_list); | 4048 | INIT_LIST_HEAD(&obj->uevent.event_list); |
| 3968 | 4049 | ||
| 4050 | ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, ib_dev, | ||
| 4051 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 4052 | if (ret) | ||
| 4053 | goto err_put_cq; | ||
| 4054 | |||
| 3969 | srq = pd->device->create_srq(pd, &attr, udata); | 4055 | srq = pd->device->create_srq(pd, &attr, udata); |
| 3970 | if (IS_ERR(srq)) { | 4056 | if (IS_ERR(srq)) { |
| 3971 | ret = PTR_ERR(srq); | 4057 | ret = PTR_ERR(srq); |
| @@ -4030,6 +4116,8 @@ err_destroy: | |||
| 4030 | ib_destroy_srq(srq); | 4116 | ib_destroy_srq(srq); |
| 4031 | 4117 | ||
| 4032 | err_put: | 4118 | err_put: |
| 4119 | ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, ib_dev, | ||
| 4120 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 4033 | put_pd_read(pd); | 4121 | put_pd_read(pd); |
| 4034 | 4122 | ||
| 4035 | err_put_cq: | 4123 | err_put_cq: |
| @@ -4216,6 +4304,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, | |||
| 4216 | if (ret) | 4304 | if (ret) |
| 4217 | return ret; | 4305 | return ret; |
| 4218 | 4306 | ||
| 4307 | ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT); | ||
| 4308 | |||
| 4219 | if (srq_type == IB_SRQT_XRC) { | 4309 | if (srq_type == IB_SRQT_XRC) { |
| 4220 | us = container_of(obj, struct ib_usrq_object, uevent); | 4310 | us = container_of(obj, struct ib_usrq_object, uevent); |
| 4221 | atomic_dec(&us->uxrcd->refcnt); | 4311 | atomic_dec(&us->uxrcd->refcnt); |
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index b3f95d453fba..cdbd26d6574b 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c | |||
| @@ -51,6 +51,7 @@ | |||
| 51 | #include <rdma/ib.h> | 51 | #include <rdma/ib.h> |
| 52 | 52 | ||
| 53 | #include "uverbs.h" | 53 | #include "uverbs.h" |
| 54 | #include "core_priv.h" | ||
| 54 | 55 | ||
| 55 | MODULE_AUTHOR("Roland Dreier"); | 56 | MODULE_AUTHOR("Roland Dreier"); |
| 56 | MODULE_DESCRIPTION("InfiniBand userspace verbs access"); | 57 | MODULE_DESCRIPTION("InfiniBand userspace verbs access"); |
| @@ -237,6 +238,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, | |||
| 237 | 238 | ||
| 238 | idr_remove_uobj(&ib_uverbs_ah_idr, uobj); | 239 | idr_remove_uobj(&ib_uverbs_ah_idr, uobj); |
| 239 | ib_destroy_ah(ah); | 240 | ib_destroy_ah(ah); |
| 241 | ib_rdmacg_uncharge(&uobj->cg_obj, context->device, | ||
| 242 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 240 | kfree(uobj); | 243 | kfree(uobj); |
| 241 | } | 244 | } |
| 242 | 245 | ||
| @@ -246,6 +249,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, | |||
| 246 | 249 | ||
| 247 | idr_remove_uobj(&ib_uverbs_mw_idr, uobj); | 250 | idr_remove_uobj(&ib_uverbs_mw_idr, uobj); |
| 248 | uverbs_dealloc_mw(mw); | 251 | uverbs_dealloc_mw(mw); |
| 252 | ib_rdmacg_uncharge(&uobj->cg_obj, context->device, | ||
| 253 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 249 | kfree(uobj); | 254 | kfree(uobj); |
| 250 | } | 255 | } |
| 251 | 256 | ||
| @@ -254,6 +259,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, | |||
| 254 | 259 | ||
| 255 | idr_remove_uobj(&ib_uverbs_rule_idr, uobj); | 260 | idr_remove_uobj(&ib_uverbs_rule_idr, uobj); |
| 256 | ib_destroy_flow(flow_id); | 261 | ib_destroy_flow(flow_id); |
| 262 | ib_rdmacg_uncharge(&uobj->cg_obj, context->device, | ||
| 263 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 257 | kfree(uobj); | 264 | kfree(uobj); |
| 258 | } | 265 | } |
| 259 | 266 | ||
| @@ -266,6 +273,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, | |||
| 266 | if (qp == qp->real_qp) | 273 | if (qp == qp->real_qp) |
| 267 | ib_uverbs_detach_umcast(qp, uqp); | 274 | ib_uverbs_detach_umcast(qp, uqp); |
| 268 | ib_destroy_qp(qp); | 275 | ib_destroy_qp(qp); |
| 276 | ib_rdmacg_uncharge(&uobj->cg_obj, context->device, | ||
| 277 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 269 | ib_uverbs_release_uevent(file, &uqp->uevent); | 278 | ib_uverbs_release_uevent(file, &uqp->uevent); |
| 270 | kfree(uqp); | 279 | kfree(uqp); |
| 271 | } | 280 | } |
| @@ -298,6 +307,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, | |||
| 298 | 307 | ||
| 299 | idr_remove_uobj(&ib_uverbs_srq_idr, uobj); | 308 | idr_remove_uobj(&ib_uverbs_srq_idr, uobj); |
| 300 | ib_destroy_srq(srq); | 309 | ib_destroy_srq(srq); |
| 310 | ib_rdmacg_uncharge(&uobj->cg_obj, context->device, | ||
| 311 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 301 | ib_uverbs_release_uevent(file, uevent); | 312 | ib_uverbs_release_uevent(file, uevent); |
| 302 | kfree(uevent); | 313 | kfree(uevent); |
| 303 | } | 314 | } |
| @@ -310,6 +321,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, | |||
| 310 | 321 | ||
| 311 | idr_remove_uobj(&ib_uverbs_cq_idr, uobj); | 322 | idr_remove_uobj(&ib_uverbs_cq_idr, uobj); |
| 312 | ib_destroy_cq(cq); | 323 | ib_destroy_cq(cq); |
| 324 | ib_rdmacg_uncharge(&uobj->cg_obj, context->device, | ||
| 325 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 313 | ib_uverbs_release_ucq(file, ev_file, ucq); | 326 | ib_uverbs_release_ucq(file, ev_file, ucq); |
| 314 | kfree(ucq); | 327 | kfree(ucq); |
| 315 | } | 328 | } |
| @@ -319,6 +332,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, | |||
| 319 | 332 | ||
| 320 | idr_remove_uobj(&ib_uverbs_mr_idr, uobj); | 333 | idr_remove_uobj(&ib_uverbs_mr_idr, uobj); |
| 321 | ib_dereg_mr(mr); | 334 | ib_dereg_mr(mr); |
| 335 | ib_rdmacg_uncharge(&uobj->cg_obj, context->device, | ||
| 336 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 322 | kfree(uobj); | 337 | kfree(uobj); |
| 323 | } | 338 | } |
| 324 | 339 | ||
| @@ -339,11 +354,16 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, | |||
| 339 | 354 | ||
| 340 | idr_remove_uobj(&ib_uverbs_pd_idr, uobj); | 355 | idr_remove_uobj(&ib_uverbs_pd_idr, uobj); |
| 341 | ib_dealloc_pd(pd); | 356 | ib_dealloc_pd(pd); |
| 357 | ib_rdmacg_uncharge(&uobj->cg_obj, context->device, | ||
| 358 | RDMACG_RESOURCE_HCA_OBJECT); | ||
| 342 | kfree(uobj); | 359 | kfree(uobj); |
| 343 | } | 360 | } |
| 344 | 361 | ||
| 345 | put_pid(context->tgid); | 362 | put_pid(context->tgid); |
| 346 | 363 | ||
| 364 | ib_rdmacg_uncharge(&context->cg_obj, context->device, | ||
| 365 | RDMACG_RESOURCE_HCA_HANDLE); | ||
| 366 | |||
| 347 | return context->device->dealloc_ucontext(context); | 367 | return context->device->dealloc_ucontext(context); |
| 348 | } | 368 | } |
| 349 | 369 | ||
diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h new file mode 100644 index 000000000000..e94290b29e99 --- /dev/null +++ b/include/linux/cgroup_rdma.h | |||
| @@ -0,0 +1,53 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> | ||
| 3 | * | ||
| 4 | * This file is subject to the terms and conditions of version 2 of the GNU | ||
| 5 | * General Public License. See the file COPYING in the main directory of the | ||
| 6 | * Linux distribution for more details. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #ifndef _CGROUP_RDMA_H | ||
| 10 | #define _CGROUP_RDMA_H | ||
| 11 | |||
| 12 | #include <linux/cgroup.h> | ||
| 13 | |||
| 14 | enum rdmacg_resource_type { | ||
| 15 | RDMACG_RESOURCE_HCA_HANDLE, | ||
| 16 | RDMACG_RESOURCE_HCA_OBJECT, | ||
| 17 | RDMACG_RESOURCE_MAX, | ||
| 18 | }; | ||
| 19 | |||
| 20 | #ifdef CONFIG_CGROUP_RDMA | ||
| 21 | |||
| 22 | struct rdma_cgroup { | ||
| 23 | struct cgroup_subsys_state css; | ||
| 24 | |||
| 25 | /* | ||
| 26 | * head to keep track of all resource pools | ||
| 27 | * that belongs to this cgroup. | ||
| 28 | */ | ||
| 29 | struct list_head rpools; | ||
| 30 | }; | ||
| 31 | |||
| 32 | struct rdmacg_device { | ||
| 33 | struct list_head dev_node; | ||
| 34 | struct list_head rpools; | ||
| 35 | char *name; | ||
| 36 | }; | ||
| 37 | |||
| 38 | /* | ||
| 39 | * APIs for RDMA/IB stack to publish when a device wants to | ||
| 40 | * participate in resource accounting | ||
| 41 | */ | ||
| 42 | int rdmacg_register_device(struct rdmacg_device *device); | ||
| 43 | void rdmacg_unregister_device(struct rdmacg_device *device); | ||
| 44 | |||
| 45 | /* APIs for RDMA/IB stack to charge/uncharge pool specific resources */ | ||
| 46 | int rdmacg_try_charge(struct rdma_cgroup **rdmacg, | ||
| 47 | struct rdmacg_device *device, | ||
| 48 | enum rdmacg_resource_type index); | ||
| 49 | void rdmacg_uncharge(struct rdma_cgroup *cg, | ||
| 50 | struct rdmacg_device *device, | ||
| 51 | enum rdmacg_resource_type index); | ||
| 52 | #endif /* CONFIG_CGROUP_RDMA */ | ||
| 53 | #endif /* _CGROUP_RDMA_H */ | ||
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 0df0336acee9..d0e597c44585 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h | |||
| @@ -56,6 +56,10 @@ SUBSYS(hugetlb) | |||
| 56 | SUBSYS(pids) | 56 | SUBSYS(pids) |
| 57 | #endif | 57 | #endif |
| 58 | 58 | ||
| 59 | #if IS_ENABLED(CONFIG_CGROUP_RDMA) | ||
| 60 | SUBSYS(rdma) | ||
| 61 | #endif | ||
| 62 | |||
| 59 | /* | 63 | /* |
| 60 | * The following subsystems are not supported on the default hierarchy. | 64 | * The following subsystems are not supported on the default hierarchy. |
| 61 | */ | 65 | */ |
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 958a24d8fae7..63896a477896 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h | |||
| @@ -60,6 +60,7 @@ | |||
| 60 | #include <linux/atomic.h> | 60 | #include <linux/atomic.h> |
| 61 | #include <linux/mmu_notifier.h> | 61 | #include <linux/mmu_notifier.h> |
| 62 | #include <linux/uaccess.h> | 62 | #include <linux/uaccess.h> |
| 63 | #include <linux/cgroup_rdma.h> | ||
| 63 | 64 | ||
| 64 | extern struct workqueue_struct *ib_wq; | 65 | extern struct workqueue_struct *ib_wq; |
| 65 | extern struct workqueue_struct *ib_comp_wq; | 66 | extern struct workqueue_struct *ib_comp_wq; |
| @@ -1331,6 +1332,12 @@ struct ib_fmr_attr { | |||
| 1331 | 1332 | ||
| 1332 | struct ib_umem; | 1333 | struct ib_umem; |
| 1333 | 1334 | ||
| 1335 | struct ib_rdmacg_object { | ||
| 1336 | #ifdef CONFIG_CGROUP_RDMA | ||
| 1337 | struct rdma_cgroup *cg; /* owner rdma cgroup */ | ||
| 1338 | #endif | ||
| 1339 | }; | ||
| 1340 | |||
| 1334 | struct ib_ucontext { | 1341 | struct ib_ucontext { |
| 1335 | struct ib_device *device; | 1342 | struct ib_device *device; |
| 1336 | struct list_head pd_list; | 1343 | struct list_head pd_list; |
| @@ -1363,6 +1370,8 @@ struct ib_ucontext { | |||
| 1363 | struct list_head no_private_counters; | 1370 | struct list_head no_private_counters; |
| 1364 | int odp_mrs_count; | 1371 | int odp_mrs_count; |
| 1365 | #endif | 1372 | #endif |
| 1373 | |||
| 1374 | struct ib_rdmacg_object cg_obj; | ||
| 1366 | }; | 1375 | }; |
| 1367 | 1376 | ||
| 1368 | struct ib_uobject { | 1377 | struct ib_uobject { |
| @@ -1370,6 +1379,7 @@ struct ib_uobject { | |||
| 1370 | struct ib_ucontext *context; /* associated user context */ | 1379 | struct ib_ucontext *context; /* associated user context */ |
| 1371 | void *object; /* containing object */ | 1380 | void *object; /* containing object */ |
| 1372 | struct list_head list; /* link to context's list */ | 1381 | struct list_head list; /* link to context's list */ |
| 1382 | struct ib_rdmacg_object cg_obj; /* rdmacg object */ | ||
| 1373 | int id; /* index into kernel idr */ | 1383 | int id; /* index into kernel idr */ |
| 1374 | struct kref ref; | 1384 | struct kref ref; |
| 1375 | struct rw_semaphore mutex; /* protects .live */ | 1385 | struct rw_semaphore mutex; /* protects .live */ |
| @@ -2118,6 +2128,10 @@ struct ib_device { | |||
| 2118 | struct attribute_group *hw_stats_ag; | 2128 | struct attribute_group *hw_stats_ag; |
| 2119 | struct rdma_hw_stats *hw_stats; | 2129 | struct rdma_hw_stats *hw_stats; |
| 2120 | 2130 | ||
| 2131 | #ifdef CONFIG_CGROUP_RDMA | ||
| 2132 | struct rdmacg_device cg_device; | ||
| 2133 | #endif | ||
| 2134 | |||
| 2121 | /** | 2135 | /** |
| 2122 | * The following mandatory functions are used only at device | 2136 | * The following mandatory functions are used only at device |
| 2123 | * registration. Keep functions such as these at the end of this | 2137 | * registration. Keep functions such as these at the end of this |
diff --git a/init/Kconfig b/init/Kconfig index 223b734abccd..ef80d46a32b6 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
| @@ -1090,6 +1090,16 @@ config CGROUP_PIDS | |||
| 1090 | since the PIDs limit only affects a process's ability to fork, not to | 1090 | since the PIDs limit only affects a process's ability to fork, not to |
| 1091 | attach to a cgroup. | 1091 | attach to a cgroup. |
| 1092 | 1092 | ||
| 1093 | config CGROUP_RDMA | ||
| 1094 | bool "RDMA controller" | ||
| 1095 | help | ||
| 1096 | Provides enforcement of RDMA resources defined by IB stack. | ||
| 1097 | It is fairly easy for consumers to exhaust RDMA resources, which | ||
| 1098 | can result into resource unavailability to other consumers. | ||
| 1099 | RDMA controller is designed to stop this from happening. | ||
| 1100 | Attaching processes with active RDMA resources to the cgroup | ||
| 1101 | hierarchy is allowed even if can cross the hierarchy's limit. | ||
| 1102 | |||
| 1093 | config CGROUP_FREEZER | 1103 | config CGROUP_FREEZER |
| 1094 | bool "Freezer controller" | 1104 | bool "Freezer controller" |
| 1095 | help | 1105 | help |
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 6d42a3211164..387348a40c64 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile | |||
| @@ -2,4 +2,5 @@ obj-y := cgroup.o namespace.o cgroup-v1.o | |||
| 2 | 2 | ||
| 3 | obj-$(CONFIG_CGROUP_FREEZER) += freezer.o | 3 | obj-$(CONFIG_CGROUP_FREEZER) += freezer.o |
| 4 | obj-$(CONFIG_CGROUP_PIDS) += pids.o | 4 | obj-$(CONFIG_CGROUP_PIDS) += pids.o |
| 5 | obj-$(CONFIG_CGROUP_RDMA) += rdma.o | ||
| 5 | obj-$(CONFIG_CPUSETS) += cpuset.o | 6 | obj-$(CONFIG_CPUSETS) += cpuset.o |
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c new file mode 100644 index 000000000000..defad3c5e7dc --- /dev/null +++ b/kernel/cgroup/rdma.c | |||
| @@ -0,0 +1,619 @@ | |||
| 1 | /* | ||
| 2 | * RDMA resource limiting controller for cgroups. | ||
| 3 | * | ||
| 4 | * Used to allow a cgroup hierarchy to stop processes from consuming | ||
| 5 | * additional RDMA resources after a certain limit is reached. | ||
| 6 | * | ||
| 7 | * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> | ||
| 8 | * | ||
| 9 | * This file is subject to the terms and conditions of version 2 of the GNU | ||
| 10 | * General Public License. See the file COPYING in the main directory of the | ||
| 11 | * Linux distribution for more details. | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <linux/bitops.h> | ||
| 15 | #include <linux/slab.h> | ||
| 16 | #include <linux/seq_file.h> | ||
| 17 | #include <linux/cgroup.h> | ||
| 18 | #include <linux/parser.h> | ||
| 19 | #include <linux/cgroup_rdma.h> | ||
| 20 | |||
| 21 | #define RDMACG_MAX_STR "max" | ||
| 22 | |||
| 23 | /* | ||
| 24 | * Protects list of resource pools maintained on per cgroup basis | ||
| 25 | * and rdma device list. | ||
| 26 | */ | ||
| 27 | static DEFINE_MUTEX(rdmacg_mutex); | ||
| 28 | static LIST_HEAD(rdmacg_devices); | ||
| 29 | |||
| 30 | enum rdmacg_file_type { | ||
| 31 | RDMACG_RESOURCE_TYPE_MAX, | ||
| 32 | RDMACG_RESOURCE_TYPE_STAT, | ||
| 33 | }; | ||
| 34 | |||
| 35 | /* | ||
| 36 | * resource table definition as to be seen by the user. | ||
| 37 | * Need to add entries to it when more resources are | ||
| 38 | * added/defined at IB verb/core layer. | ||
| 39 | */ | ||
| 40 | static char const *rdmacg_resource_names[] = { | ||
| 41 | [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle", | ||
| 42 | [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object", | ||
| 43 | }; | ||
| 44 | |||
| 45 | /* resource tracker for each resource of rdma cgroup */ | ||
| 46 | struct rdmacg_resource { | ||
| 47 | int max; | ||
| 48 | int usage; | ||
| 49 | }; | ||
| 50 | |||
| 51 | /* | ||
| 52 | * resource pool object which represents per cgroup, per device | ||
| 53 | * resources. There are multiple instances of this object per cgroup, | ||
| 54 | * therefore it cannot be embedded within rdma_cgroup structure. It | ||
| 55 | * is maintained as list. | ||
| 56 | */ | ||
| 57 | struct rdmacg_resource_pool { | ||
| 58 | struct rdmacg_device *device; | ||
| 59 | struct rdmacg_resource resources[RDMACG_RESOURCE_MAX]; | ||
| 60 | |||
| 61 | struct list_head cg_node; | ||
| 62 | struct list_head dev_node; | ||
| 63 | |||
| 64 | /* count active user tasks of this pool */ | ||
| 65 | u64 usage_sum; | ||
| 66 | /* total number counts which are set to max */ | ||
| 67 | int num_max_cnt; | ||
| 68 | }; | ||
| 69 | |||
| 70 | static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) | ||
| 71 | { | ||
| 72 | return container_of(css, struct rdma_cgroup, css); | ||
| 73 | } | ||
| 74 | |||
| 75 | static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg) | ||
| 76 | { | ||
| 77 | return css_rdmacg(cg->css.parent); | ||
| 78 | } | ||
| 79 | |||
| 80 | static inline struct rdma_cgroup *get_current_rdmacg(void) | ||
| 81 | { | ||
| 82 | return css_rdmacg(task_get_css(current, rdma_cgrp_id)); | ||
| 83 | } | ||
| 84 | |||
| 85 | static void set_resource_limit(struct rdmacg_resource_pool *rpool, | ||
| 86 | int index, int new_max) | ||
| 87 | { | ||
| 88 | if (new_max == S32_MAX) { | ||
| 89 | if (rpool->resources[index].max != S32_MAX) | ||
| 90 | rpool->num_max_cnt++; | ||
| 91 | } else { | ||
| 92 | if (rpool->resources[index].max == S32_MAX) | ||
| 93 | rpool->num_max_cnt--; | ||
| 94 | } | ||
| 95 | rpool->resources[index].max = new_max; | ||
| 96 | } | ||
| 97 | |||
| 98 | static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool) | ||
| 99 | { | ||
| 100 | int i; | ||
| 101 | |||
| 102 | for (i = 0; i < RDMACG_RESOURCE_MAX; i++) | ||
| 103 | set_resource_limit(rpool, i, S32_MAX); | ||
| 104 | } | ||
| 105 | |||
| 106 | static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) | ||
| 107 | { | ||
| 108 | lockdep_assert_held(&rdmacg_mutex); | ||
| 109 | |||
| 110 | list_del(&rpool->cg_node); | ||
| 111 | list_del(&rpool->dev_node); | ||
| 112 | kfree(rpool); | ||
| 113 | } | ||
| 114 | |||
| 115 | static struct rdmacg_resource_pool * | ||
| 116 | find_cg_rpool_locked(struct rdma_cgroup *cg, | ||
| 117 | struct rdmacg_device *device) | ||
| 118 | |||
| 119 | { | ||
| 120 | struct rdmacg_resource_pool *pool; | ||
| 121 | |||
| 122 | lockdep_assert_held(&rdmacg_mutex); | ||
| 123 | |||
| 124 | list_for_each_entry(pool, &cg->rpools, cg_node) | ||
| 125 | if (pool->device == device) | ||
| 126 | return pool; | ||
| 127 | |||
| 128 | return NULL; | ||
| 129 | } | ||
| 130 | |||
| 131 | static struct rdmacg_resource_pool * | ||
| 132 | get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) | ||
| 133 | { | ||
| 134 | struct rdmacg_resource_pool *rpool; | ||
| 135 | |||
| 136 | rpool = find_cg_rpool_locked(cg, device); | ||
| 137 | if (rpool) | ||
| 138 | return rpool; | ||
| 139 | |||
| 140 | rpool = kzalloc(sizeof(*rpool), GFP_KERNEL); | ||
| 141 | if (!rpool) | ||
| 142 | return ERR_PTR(-ENOMEM); | ||
| 143 | |||
| 144 | rpool->device = device; | ||
| 145 | set_all_resource_max_limit(rpool); | ||
| 146 | |||
| 147 | INIT_LIST_HEAD(&rpool->cg_node); | ||
| 148 | INIT_LIST_HEAD(&rpool->dev_node); | ||
| 149 | list_add_tail(&rpool->cg_node, &cg->rpools); | ||
| 150 | list_add_tail(&rpool->dev_node, &device->rpools); | ||
| 151 | return rpool; | ||
| 152 | } | ||
| 153 | |||
| 154 | /** | ||
| 155 | * uncharge_cg_locked - uncharge resource for rdma cgroup | ||
| 156 | * @cg: pointer to cg to uncharge and all parents in hierarchy | ||
| 157 | * @device: pointer to rdmacg device | ||
| 158 | * @index: index of the resource to uncharge in cg (resource pool) | ||
| 159 | * | ||
| 160 | * It also frees the resource pool which was created as part of | ||
| 161 | * charging operation when there are no resources attached to | ||
| 162 | * resource pool. | ||
| 163 | */ | ||
| 164 | static void | ||
| 165 | uncharge_cg_locked(struct rdma_cgroup *cg, | ||
| 166 | struct rdmacg_device *device, | ||
| 167 | enum rdmacg_resource_type index) | ||
| 168 | { | ||
| 169 | struct rdmacg_resource_pool *rpool; | ||
| 170 | |||
| 171 | rpool = find_cg_rpool_locked(cg, device); | ||
| 172 | |||
| 173 | /* | ||
| 174 | * rpool cannot be null at this stage. Let kernel operate in case | ||
| 175 | * if there a bug in IB stack or rdma controller, instead of crashing | ||
| 176 | * the system. | ||
| 177 | */ | ||
| 178 | if (unlikely(!rpool)) { | ||
| 179 | pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device); | ||
| 180 | return; | ||
| 181 | } | ||
| 182 | |||
| 183 | rpool->resources[index].usage--; | ||
| 184 | |||
| 185 | /* | ||
| 186 | * A negative count (or overflow) is invalid, | ||
| 187 | * it indicates a bug in the rdma controller. | ||
| 188 | */ | ||
| 189 | WARN_ON_ONCE(rpool->resources[index].usage < 0); | ||
| 190 | rpool->usage_sum--; | ||
| 191 | if (rpool->usage_sum == 0 && | ||
| 192 | rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { | ||
| 193 | /* | ||
| 194 | * No user of the rpool and all entries are set to max, so | ||
| 195 | * safe to delete this rpool. | ||
| 196 | */ | ||
| 197 | free_cg_rpool_locked(rpool); | ||
| 198 | } | ||
| 199 | } | ||
| 200 | |||
| 201 | /** | ||
| 202 | * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count | ||
| 203 | * @device: pointer to rdmacg device | ||
| 204 | * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup | ||
| 205 | * stop uncharging | ||
| 206 | * @index: index of the resource to uncharge in cg in given resource pool | ||
| 207 | */ | ||
| 208 | static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, | ||
| 209 | struct rdmacg_device *device, | ||
| 210 | struct rdma_cgroup *stop_cg, | ||
| 211 | enum rdmacg_resource_type index) | ||
| 212 | { | ||
| 213 | struct rdma_cgroup *p; | ||
| 214 | |||
| 215 | mutex_lock(&rdmacg_mutex); | ||
| 216 | |||
| 217 | for (p = cg; p != stop_cg; p = parent_rdmacg(p)) | ||
| 218 | uncharge_cg_locked(p, device, index); | ||
| 219 | |||
| 220 | mutex_unlock(&rdmacg_mutex); | ||
| 221 | |||
| 222 | css_put(&cg->css); | ||
| 223 | } | ||
| 224 | |||
| 225 | /** | ||
| 226 | * rdmacg_uncharge - hierarchically uncharge rdma resource count | ||
| 227 | * @device: pointer to rdmacg device | ||
| 228 | * @index: index of the resource to uncharge in cgroup in given resource pool | ||
| 229 | */ | ||
| 230 | void rdmacg_uncharge(struct rdma_cgroup *cg, | ||
| 231 | struct rdmacg_device *device, | ||
| 232 | enum rdmacg_resource_type index) | ||
| 233 | { | ||
| 234 | if (index >= RDMACG_RESOURCE_MAX) | ||
| 235 | return; | ||
| 236 | |||
| 237 | rdmacg_uncharge_hierarchy(cg, device, NULL, index); | ||
| 238 | } | ||
| 239 | EXPORT_SYMBOL(rdmacg_uncharge); | ||
| 240 | |||
| 241 | /** | ||
| 242 | * rdmacg_try_charge - hierarchically try to charge the rdma resource | ||
| 243 | * @rdmacg: pointer to rdma cgroup which will own this resource | ||
| 244 | * @device: pointer to rdmacg device | ||
| 245 | * @index: index of the resource to charge in cgroup (resource pool) | ||
| 246 | * | ||
| 247 | * This function follows charging resource in hierarchical way. | ||
| 248 | * It will fail if the charge would cause the new value to exceed the | ||
| 249 | * hierarchical limit. | ||
| 250 | * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. | ||
| 251 | * Returns pointer to rdmacg for this resource when charging is successful. | ||
| 252 | * | ||
| 253 | * Charger needs to account resources on two criteria. | ||
| 254 | * (a) per cgroup & (b) per device resource usage. | ||
| 255 | * Per cgroup resource usage ensures that tasks of cgroup doesn't cross | ||
| 256 | * the configured limits. Per device provides granular configuration | ||
| 257 | * in multi device usage. It allocates resource pool in the hierarchy | ||
| 258 | * for each parent it come across for first resource. Later on resource | ||
| 259 | * pool will be available. Therefore it will be much faster thereon | ||
| 260 | * to charge/uncharge. | ||
| 261 | */ | ||
| 262 | int rdmacg_try_charge(struct rdma_cgroup **rdmacg, | ||
| 263 | struct rdmacg_device *device, | ||
| 264 | enum rdmacg_resource_type index) | ||
| 265 | { | ||
| 266 | struct rdma_cgroup *cg, *p; | ||
| 267 | struct rdmacg_resource_pool *rpool; | ||
| 268 | s64 new; | ||
| 269 | int ret = 0; | ||
| 270 | |||
| 271 | if (index >= RDMACG_RESOURCE_MAX) | ||
| 272 | return -EINVAL; | ||
| 273 | |||
| 274 | /* | ||
| 275 | * hold on to css, as cgroup can be removed but resource | ||
| 276 | * accounting happens on css. | ||
| 277 | */ | ||
| 278 | cg = get_current_rdmacg(); | ||
| 279 | |||
| 280 | mutex_lock(&rdmacg_mutex); | ||
| 281 | for (p = cg; p; p = parent_rdmacg(p)) { | ||
| 282 | rpool = get_cg_rpool_locked(p, device); | ||
| 283 | if (IS_ERR(rpool)) { | ||
| 284 | ret = PTR_ERR(rpool); | ||
| 285 | goto err; | ||
| 286 | } else { | ||
| 287 | new = rpool->resources[index].usage + 1; | ||
| 288 | if (new > rpool->resources[index].max) { | ||
| 289 | ret = -EAGAIN; | ||
| 290 | goto err; | ||
| 291 | } else { | ||
| 292 | rpool->resources[index].usage = new; | ||
| 293 | rpool->usage_sum++; | ||
| 294 | } | ||
| 295 | } | ||
| 296 | } | ||
| 297 | mutex_unlock(&rdmacg_mutex); | ||
| 298 | |||
| 299 | *rdmacg = cg; | ||
| 300 | return 0; | ||
| 301 | |||
| 302 | err: | ||
| 303 | mutex_unlock(&rdmacg_mutex); | ||
| 304 | rdmacg_uncharge_hierarchy(cg, device, p, index); | ||
| 305 | return ret; | ||
| 306 | } | ||
| 307 | EXPORT_SYMBOL(rdmacg_try_charge); | ||
| 308 | |||
| 309 | /** | ||
| 310 | * rdmacg_register_device - register rdmacg device to rdma controller. | ||
| 311 | * @device: pointer to rdmacg device whose resources need to be accounted. | ||
| 312 | * | ||
| 313 | * If IB stack wish a device to participate in rdma cgroup resource | ||
| 314 | * tracking, it must invoke this API to register with rdma cgroup before | ||
| 315 | * any user space application can start using the RDMA resources. | ||
| 316 | * Returns 0 on success or EINVAL when table length given is beyond | ||
| 317 | * supported size. | ||
| 318 | */ | ||
| 319 | int rdmacg_register_device(struct rdmacg_device *device) | ||
| 320 | { | ||
| 321 | INIT_LIST_HEAD(&device->dev_node); | ||
| 322 | INIT_LIST_HEAD(&device->rpools); | ||
| 323 | |||
| 324 | mutex_lock(&rdmacg_mutex); | ||
| 325 | list_add_tail(&device->dev_node, &rdmacg_devices); | ||
| 326 | mutex_unlock(&rdmacg_mutex); | ||
| 327 | return 0; | ||
| 328 | } | ||
| 329 | EXPORT_SYMBOL(rdmacg_register_device); | ||
| 330 | |||
| 331 | /** | ||
| 332 | * rdmacg_unregister_device - unregister rdmacg device from rdma controller. | ||
| 333 | * @device: pointer to rdmacg device which was previously registered with rdma | ||
| 334 | * controller using rdmacg_register_device(). | ||
| 335 | * | ||
| 336 | * IB stack must invoke this after all the resources of the IB device | ||
| 337 | * are destroyed and after ensuring that no more resources will be created | ||
| 338 | * when this API is invoked. | ||
| 339 | */ | ||
| 340 | void rdmacg_unregister_device(struct rdmacg_device *device) | ||
| 341 | { | ||
| 342 | struct rdmacg_resource_pool *rpool, *tmp; | ||
| 343 | |||
| 344 | /* | ||
| 345 | * Synchronize with any active resource settings, | ||
| 346 | * usage query happening via configfs. | ||
| 347 | */ | ||
| 348 | mutex_lock(&rdmacg_mutex); | ||
| 349 | list_del_init(&device->dev_node); | ||
| 350 | |||
| 351 | /* | ||
| 352 | * Now that this device is off the cgroup list, its safe to free | ||
| 353 | * all the rpool resources. | ||
| 354 | */ | ||
| 355 | list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node) | ||
| 356 | free_cg_rpool_locked(rpool); | ||
| 357 | |||
| 358 | mutex_unlock(&rdmacg_mutex); | ||
| 359 | } | ||
| 360 | EXPORT_SYMBOL(rdmacg_unregister_device); | ||
| 361 | |||
| 362 | static int parse_resource(char *c, int *intval) | ||
| 363 | { | ||
| 364 | substring_t argstr; | ||
| 365 | const char **table = &rdmacg_resource_names[0]; | ||
| 366 | char *name, *value = c; | ||
| 367 | size_t len; | ||
| 368 | int ret, i = 0; | ||
| 369 | |||
| 370 | name = strsep(&value, "="); | ||
| 371 | if (!name || !value) | ||
| 372 | return -EINVAL; | ||
| 373 | |||
| 374 | len = strlen(value); | ||
| 375 | |||
| 376 | for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { | ||
| 377 | if (strcmp(table[i], name)) | ||
| 378 | continue; | ||
| 379 | |||
| 380 | argstr.from = value; | ||
| 381 | argstr.to = value + len; | ||
| 382 | |||
| 383 | ret = match_int(&argstr, intval); | ||
| 384 | if (ret >= 0) { | ||
| 385 | if (*intval < 0) | ||
| 386 | break; | ||
| 387 | return i; | ||
| 388 | } | ||
| 389 | if (strncmp(value, RDMACG_MAX_STR, len) == 0) { | ||
| 390 | *intval = S32_MAX; | ||
| 391 | return i; | ||
| 392 | } | ||
| 393 | break; | ||
| 394 | } | ||
| 395 | return -EINVAL; | ||
| 396 | } | ||
| 397 | |||
| 398 | static int rdmacg_parse_limits(char *options, | ||
| 399 | int *new_limits, unsigned long *enables) | ||
| 400 | { | ||
| 401 | char *c; | ||
| 402 | int err = -EINVAL; | ||
| 403 | |||
| 404 | /* parse resource options */ | ||
| 405 | while ((c = strsep(&options, " ")) != NULL) { | ||
| 406 | int index, intval; | ||
| 407 | |||
| 408 | index = parse_resource(c, &intval); | ||
| 409 | if (index < 0) | ||
| 410 | goto err; | ||
| 411 | |||
| 412 | new_limits[index] = intval; | ||
| 413 | *enables |= BIT(index); | ||
| 414 | } | ||
| 415 | return 0; | ||
| 416 | |||
| 417 | err: | ||
| 418 | return err; | ||
| 419 | } | ||
| 420 | |||
| 421 | static struct rdmacg_device *rdmacg_get_device_locked(const char *name) | ||
| 422 | { | ||
| 423 | struct rdmacg_device *device; | ||
| 424 | |||
| 425 | lockdep_assert_held(&rdmacg_mutex); | ||
| 426 | |||
| 427 | list_for_each_entry(device, &rdmacg_devices, dev_node) | ||
| 428 | if (!strcmp(name, device->name)) | ||
| 429 | return device; | ||
| 430 | |||
| 431 | return NULL; | ||
| 432 | } | ||
| 433 | |||
/*
 * rdmacg_resource_set_max - write handler for the cgroup "max" file
 * @of: kernfs open file of the cgroup entry being written
 * @buf: user input of the form "<device_name> <resource>=<limit>..."
 * @nbytes: number of bytes written
 * @off: file offset (unused)
 *
 * Parses a device name followed by per-resource limit assignments and
 * applies them to this cgroup's resource pool for that device.  If after
 * the update the pool is unused and fully unlimited, it is freed.
 *
 * Returns @nbytes on success or a negative errno on failure.
 */
static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
				       char *buf, size_t nbytes, loff_t off)
{
	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
	const char *dev_name;
	struct rdmacg_resource_pool *rpool;
	struct rdmacg_device *device;
	char *options = strstrip(buf);
	int *new_limits;
	unsigned long enables = 0;	/* bitmask of resources named in @buf */
	int i = 0, ret = 0;

	/* extract the device name first */
	dev_name = strsep(&options, " ");
	if (!dev_name) {
		ret = -EINVAL;
		goto err;
	}

	/* scratch array holding the parsed limit for each resource index */
	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
	if (!new_limits) {
		ret = -ENOMEM;
		goto err;
	}

	/* fills new_limits[] and sets a bit in @enables per parsed entry */
	ret = rdmacg_parse_limits(options, new_limits, &enables);
	if (ret)
		goto parse_err;

	/* acquire lock to synchronize with hot plug devices */
	mutex_lock(&rdmacg_mutex);

	device = rdmacg_get_device_locked(dev_name);
	if (!device) {
		ret = -ENODEV;
		goto dev_err;
	}

	/* find or create this cgroup's pool for @device */
	rpool = get_cg_rpool_locked(cg, device);
	if (IS_ERR(rpool)) {
		ret = PTR_ERR(rpool);
		goto dev_err;
	}

	/* now set the new limits of the rpool */
	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
		set_resource_limit(rpool, i, new_limits[i]);

	if (rpool->usage_sum == 0 &&
	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
		/*
		 * No user of the rpool and all entries are set to max, so
		 * safe to delete this rpool.
		 */
		free_cg_rpool_locked(rpool);
	}

	/* unwind in reverse order of acquisition; labels fall through */
dev_err:
	mutex_unlock(&rdmacg_mutex);

parse_err:
	kfree(new_limits);

err:
	return ret ?: nbytes;
}
| 500 | |||
| 501 | static void print_rpool_values(struct seq_file *sf, | ||
| 502 | struct rdmacg_resource_pool *rpool) | ||
| 503 | { | ||
| 504 | enum rdmacg_file_type sf_type; | ||
| 505 | int i; | ||
| 506 | u32 value; | ||
| 507 | |||
| 508 | sf_type = seq_cft(sf)->private; | ||
| 509 | |||
| 510 | for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { | ||
| 511 | seq_puts(sf, rdmacg_resource_names[i]); | ||
| 512 | seq_putc(sf, '='); | ||
| 513 | if (sf_type == RDMACG_RESOURCE_TYPE_MAX) { | ||
| 514 | if (rpool) | ||
| 515 | value = rpool->resources[i].max; | ||
| 516 | else | ||
| 517 | value = S32_MAX; | ||
| 518 | } else { | ||
| 519 | if (rpool) | ||
| 520 | value = rpool->resources[i].usage; | ||
| 521 | else | ||
| 522 | value = 0; | ||
| 523 | } | ||
| 524 | |||
| 525 | if (value == S32_MAX) | ||
| 526 | seq_puts(sf, RDMACG_MAX_STR); | ||
| 527 | else | ||
| 528 | seq_printf(sf, "%d", value); | ||
| 529 | seq_putc(sf, ' '); | ||
| 530 | } | ||
| 531 | } | ||
| 532 | |||
| 533 | static int rdmacg_resource_read(struct seq_file *sf, void *v) | ||
| 534 | { | ||
| 535 | struct rdmacg_device *device; | ||
| 536 | struct rdmacg_resource_pool *rpool; | ||
| 537 | struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); | ||
| 538 | |||
| 539 | mutex_lock(&rdmacg_mutex); | ||
| 540 | |||
| 541 | list_for_each_entry(device, &rdmacg_devices, dev_node) { | ||
| 542 | seq_printf(sf, "%s ", device->name); | ||
| 543 | |||
| 544 | rpool = find_cg_rpool_locked(cg, device); | ||
| 545 | print_rpool_values(sf, rpool); | ||
| 546 | |||
| 547 | seq_putc(sf, '\n'); | ||
| 548 | } | ||
| 549 | |||
| 550 | mutex_unlock(&rdmacg_mutex); | ||
| 551 | return 0; | ||
| 552 | } | ||
| 553 | |||
/*
 * Control files exposed by the rdma controller in each non-root cgroup:
 * "max" is writable and holds the per-device resource limits; "current"
 * is read-only and reports per-device resource usage.  Both share the
 * rdmacg_resource_read show handler, distinguished via .private.
 */
static struct cftype rdmacg_files[] = {
	{
		.name = "max",
		.write = rdmacg_resource_set_max,
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_MAX,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "current",
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_STAT,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};
| 570 | |||
| 571 | static struct cgroup_subsys_state * | ||
| 572 | rdmacg_css_alloc(struct cgroup_subsys_state *parent) | ||
| 573 | { | ||
| 574 | struct rdma_cgroup *cg; | ||
| 575 | |||
| 576 | cg = kzalloc(sizeof(*cg), GFP_KERNEL); | ||
| 577 | if (!cg) | ||
| 578 | return ERR_PTR(-ENOMEM); | ||
| 579 | |||
| 580 | INIT_LIST_HEAD(&cg->rpools); | ||
| 581 | return &cg->css; | ||
| 582 | } | ||
| 583 | |||
/* Free the rdma_cgroup embedding @css; pools were torn down on offline. */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_rdmacg(css));
}
| 590 | |||
| 591 | /** | ||
| 592 | * rdmacg_css_offline - cgroup css_offline callback | ||
| 593 | * @css: css of interest | ||
| 594 | * | ||
| 595 | * This function is called when @css is about to go away and responsible | ||
| 596 | * for shooting down all rdmacg associated with @css. As part of that it | ||
| 597 | * marks all the resource pool entries to max value, so that when resources are | ||
| 598 | * uncharged, associated resource pool can be freed as well. | ||
| 599 | */ | ||
| 600 | static void rdmacg_css_offline(struct cgroup_subsys_state *css) | ||
| 601 | { | ||
| 602 | struct rdma_cgroup *cg = css_rdmacg(css); | ||
| 603 | struct rdmacg_resource_pool *rpool; | ||
| 604 | |||
| 605 | mutex_lock(&rdmacg_mutex); | ||
| 606 | |||
| 607 | list_for_each_entry(rpool, &cg->rpools, cg_node) | ||
| 608 | set_all_resource_max_limit(rpool); | ||
| 609 | |||
| 610 | mutex_unlock(&rdmacg_mutex); | ||
| 611 | } | ||
| 612 | |||
/*
 * The rdma cgroup subsystem descriptor registered via
 * include/linux/cgroup_subsys.h.  The same cftype table is used for both
 * the legacy (v1) and default (v2) hierarchies.  No css_online or
 * can_attach hooks are needed: charging is driven by the IB core.
 */
struct cgroup_subsys rdma_cgrp_subsys = {
	.css_alloc	= rdmacg_css_alloc,
	.css_free	= rdmacg_css_free,
	.css_offline	= rdmacg_css_offline,
	.legacy_cftypes	= rdmacg_files,
	.dfl_cftypes	= rdmacg_files,
};
