summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-07-11 12:54:40 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2019-07-24 13:12:09 -0400
commitd7852fbd0f0423937fa287a598bfde188bb68c22 (patch)
tree45527bac1f3d1805a288e5151b4bc0eaadd7b478
parent0ecfebd2b52404ae0c54a878c872bb93363ada36 (diff)
access: avoid the RCU grace period for the temporary subjective credentials
It turns out that 'access()' (and 'faccessat()') can cause a lot of RCU work because it installs a temporary credential that gets allocated and freed for each system call. The allocation and freeing overhead is mostly benign, but because credentials can be accessed under the RCU read lock, the freeing involves an RCU grace period. Which is not a huge deal normally, but if you have a lot of access() calls, this causes a fair amount of secondary damage: instead of having a nice alloc/free pattern that hits in hot per-CPU slab caches, you have all those delayed frees, and on big machines with hundreds of cores, the RCU overhead can end up being enormous. But it turns out that all of this is entirely unnecessary. Exactly because access() only installs the credential as the thread-local subjective credential, the temporary cred pointer doesn't actually need to be RCU free'd at all. Once we're done using it, we can just free it synchronously and avoid all the RCU overhead. So add a 'non_rcu' flag to 'struct cred', which can be set by users that know they only use it in non-RCU context (there are other potential users for this). We can make it a union with the rcu freeing list head that we need for the RCU case, so this doesn't need any extra storage. Note that this also makes 'get_current_cred()' clear the new non_rcu flag, in case we have filesystems that take a long-term reference to the cred and then expect the RCU delayed freeing afterwards. It's not entirely clear that this is required, but it makes for clear semantics: the subjective cred remains non-RCU as long as you only access it synchronously using the thread-local accessors, but you _can_ use it as a generic cred if you want to. It is possible that we should just remove the whole RCU markings for ->cred entirely. 
Only ->real_cred is really supposed to be accessed through RCU, and the long-term cred copies that nfs uses might want to explicitly re-enable RCU freeing if required, rather than have get_current_cred() do it implicitly. But this is a "minimal semantic changes" change for the immediate problem. Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Eric Dumazet <edumazet@google.com> Acked-by: Paul E. McKenney <paulmck@linux.ibm.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Jan Glauber <jglauber@marvell.com> Cc: Jiri Kosina <jikos@kernel.org> Cc: Jayachandran Chandrasekharan Nair <jnair@marvell.com> Cc: Greg KH <greg@kroah.com> Cc: Kees Cook <keescook@chromium.org> Cc: David Howells <dhowells@redhat.com> Cc: Miklos Szeredi <miklos@szeredi.hu> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--fs/open.c19
-rw-r--r--include/linux/cred.h8
-rw-r--r--kernel/cred.c21
3 files changed, 45 insertions, 3 deletions
diff --git a/fs/open.c b/fs/open.c
index b5b80469b93d..a59abe3c669a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -374,6 +374,25 @@ long do_faccessat(int dfd, const char __user *filename, int mode)
374 override_cred->cap_permitted; 374 override_cred->cap_permitted;
375 } 375 }
376 376
377 /*
378 * The new set of credentials can *only* be used in
379 * task-synchronous circumstances, and does not need
380 * RCU freeing, unless somebody then takes a separate
381 * reference to it.
382 *
383 * NOTE! This is _only_ true because this credential
384 * is used purely for override_creds() that installs
385 * it as the subjective cred. Other threads will be
386 * accessing ->real_cred, not the subjective cred.
387 *
388 * If somebody _does_ make a copy of this (using the
389 * 'get_current_cred()' function), that will clear the
390 * non_rcu field, because now that other user may be
391 * expecting RCU freeing. But normal thread-synchronous
392 * cred accesses will keep things non-RCU.
393 */
394 override_cred->non_rcu = 1;
395
377 old_cred = override_creds(override_cred); 396 old_cred = override_creds(override_cred);
378retry: 397retry:
379 res = user_path_at(dfd, filename, lookup_flags, &path); 398 res = user_path_at(dfd, filename, lookup_flags, &path);
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 7eb43a038330..f7a30e0099be 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -145,7 +145,11 @@ struct cred {
145 struct user_struct *user; /* real user ID subscription */ 145 struct user_struct *user; /* real user ID subscription */
146 struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ 146 struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
147 struct group_info *group_info; /* supplementary groups for euid/fsgid */ 147 struct group_info *group_info; /* supplementary groups for euid/fsgid */
148 struct rcu_head rcu; /* RCU deletion hook */ 148 /* RCU deletion */
149 union {
150 int non_rcu; /* Can we skip RCU deletion? */
151 struct rcu_head rcu; /* RCU deletion hook */
152 };
149} __randomize_layout; 153} __randomize_layout;
150 154
151extern void __put_cred(struct cred *); 155extern void __put_cred(struct cred *);
@@ -246,6 +250,7 @@ static inline const struct cred *get_cred(const struct cred *cred)
246 if (!cred) 250 if (!cred)
247 return cred; 251 return cred;
248 validate_creds(cred); 252 validate_creds(cred);
253 nonconst_cred->non_rcu = 0;
249 return get_new_cred(nonconst_cred); 254 return get_new_cred(nonconst_cred);
250} 255}
251 256
@@ -257,6 +262,7 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
257 if (!atomic_inc_not_zero(&nonconst_cred->usage)) 262 if (!atomic_inc_not_zero(&nonconst_cred->usage))
258 return NULL; 263 return NULL;
259 validate_creds(cred); 264 validate_creds(cred);
265 nonconst_cred->non_rcu = 0;
260 return cred; 266 return cred;
261} 267}
262 268
diff --git a/kernel/cred.c b/kernel/cred.c
index c73a87a4df13..153ae369e024 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -144,7 +144,10 @@ void __put_cred(struct cred *cred)
144 BUG_ON(cred == current->cred); 144 BUG_ON(cred == current->cred);
145 BUG_ON(cred == current->real_cred); 145 BUG_ON(cred == current->real_cred);
146 146
147 call_rcu(&cred->rcu, put_cred_rcu); 147 if (cred->non_rcu)
148 put_cred_rcu(&cred->rcu);
149 else
150 call_rcu(&cred->rcu, put_cred_rcu);
148} 151}
149EXPORT_SYMBOL(__put_cred); 152EXPORT_SYMBOL(__put_cred);
150 153
@@ -256,6 +259,7 @@ struct cred *prepare_creds(void)
256 old = task->cred; 259 old = task->cred;
257 memcpy(new, old, sizeof(struct cred)); 260 memcpy(new, old, sizeof(struct cred));
258 261
262 new->non_rcu = 0;
259 atomic_set(&new->usage, 1); 263 atomic_set(&new->usage, 1);
260 set_cred_subscribers(new, 0); 264 set_cred_subscribers(new, 0);
261 get_group_info(new->group_info); 265 get_group_info(new->group_info);
@@ -535,7 +539,19 @@ const struct cred *override_creds(const struct cred *new)
535 539
536 validate_creds(old); 540 validate_creds(old);
537 validate_creds(new); 541 validate_creds(new);
538 get_cred(new); 542
543 /*
544 * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
545 *
546 * That means that we do not clear the 'non_rcu' flag, since
547 * we are only installing the cred into the thread-synchronous
548 * '->cred' pointer, not the '->real_cred' pointer that is
549 * visible to other threads under RCU.
550 *
551 * Also note that we did validate_creds() manually, not depending
552 * on the validation in 'get_cred()'.
553 */
554 get_new_cred((struct cred *)new);
539 alter_cred_subscribers(new, 1); 555 alter_cred_subscribers(new, 1);
540 rcu_assign_pointer(current->cred, new); 556 rcu_assign_pointer(current->cred, new);
541 alter_cred_subscribers(old, -1); 557 alter_cred_subscribers(old, -1);
@@ -672,6 +688,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
672 validate_creds(old); 688 validate_creds(old);
673 689
674 *new = *old; 690 *new = *old;
691 new->non_rcu = 0;
675 atomic_set(&new->usage, 1); 692 atomic_set(&new->usage, 1);
676 set_cred_subscribers(new, 0); 693 set_cred_subscribers(new, 0);
677 get_uid(new->user); 694 get_uid(new->user);