Diffstat (limited to 'kernel')
108 files changed, 5267 insertions, 3749 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 8d528f9930da..a8a91bd2b2a9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -932,7 +932,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
| 932 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) | 932 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) |
| 933 | return 0; | 933 | return 0; |
| 934 | 934 | ||
| 935 | err = audit_filter_user(msg_type); | 935 | err = audit_filter(msg_type, AUDIT_FILTER_USER); |
| 936 | if (err == 1) { /* match or error */ | 936 | if (err == 1) { /* match or error */ |
| 937 | err = 0; | 937 | err = 0; |
| 938 | if (msg_type == AUDIT_USER_TTY) { | 938 | if (msg_type == AUDIT_USER_TTY) { |
@@ -1379,7 +1379,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
| 1379 | if (audit_initialized != AUDIT_INITIALIZED) | 1379 | if (audit_initialized != AUDIT_INITIALIZED) |
| 1380 | return NULL; | 1380 | return NULL; |
| 1381 | 1381 | ||
| 1382 | if (unlikely(audit_filter_type(type))) | 1382 | if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE))) |
| 1383 | return NULL; | 1383 | return NULL; |
| 1384 | 1384 | ||
| 1385 | if (gfp_mask & __GFP_DIRECT_RECLAIM) { | 1385 | if (gfp_mask & __GFP_DIRECT_RECLAIM) { |
diff --git a/kernel/audit.h b/kernel/audit.h
index a492f4c4e710..431444c3708b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -331,6 +331,8 @@ extern pid_t audit_sig_pid;
| 331 | extern kuid_t audit_sig_uid; | 331 | extern kuid_t audit_sig_uid; |
| 332 | extern u32 audit_sig_sid; | 332 | extern u32 audit_sig_sid; |
| 333 | 333 | ||
| 334 | extern int audit_filter(int msgtype, unsigned int listtype); | ||
| 335 | |||
| 334 | #ifdef CONFIG_AUDITSYSCALL | 336 | #ifdef CONFIG_AUDITSYSCALL |
| 335 | extern int __audit_signal_info(int sig, struct task_struct *t); | 337 | extern int __audit_signal_info(int sig, struct task_struct *t); |
| 336 | static inline int audit_signal_info(int sig, struct task_struct *t) | 338 | static inline int audit_signal_info(int sig, struct task_struct *t) |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 94ca7b1e5e7e..85d9cac497e4 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1290,113 +1290,72 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
| 1290 | return strncmp(p, dname, dlen); | 1290 | return strncmp(p, dname, dlen); |
| 1291 | } | 1291 | } |
| 1292 | 1292 | ||
| 1293 | static int audit_filter_user_rules(struct audit_krule *rule, int type, | 1293 | int audit_filter(int msgtype, unsigned int listtype) |
| 1294 | enum audit_state *state) | ||
| 1295 | { | 1294 | { |
| 1296 | int i; | ||
| 1297 | |||
| 1298 | for (i = 0; i < rule->field_count; i++) { | ||
| 1299 | struct audit_field *f = &rule->fields[i]; | ||
| 1300 | pid_t pid; | ||
| 1301 | int result = 0; | ||
| 1302 | u32 sid; | ||
| 1303 | |||
| 1304 | switch (f->type) { | ||
| 1305 | case AUDIT_PID: | ||
| 1306 | pid = task_pid_nr(current); | ||
| 1307 | result = audit_comparator(pid, f->op, f->val); | ||
| 1308 | break; | ||
| 1309 | case AUDIT_UID: | ||
| 1310 | result = audit_uid_comparator(current_uid(), f->op, f->uid); | ||
| 1311 | break; | ||
| 1312 | case AUDIT_GID: | ||
| 1313 | result = audit_gid_comparator(current_gid(), f->op, f->gid); | ||
| 1314 | break; | ||
| 1315 | case AUDIT_LOGINUID: | ||
| 1316 | result = audit_uid_comparator(audit_get_loginuid(current), | ||
| 1317 | f->op, f->uid); | ||
| 1318 | break; | ||
| 1319 | case AUDIT_LOGINUID_SET: | ||
| 1320 | result = audit_comparator(audit_loginuid_set(current), | ||
| 1321 | f->op, f->val); | ||
| 1322 | break; | ||
| 1323 | case AUDIT_MSGTYPE: | ||
| 1324 | result = audit_comparator(type, f->op, f->val); | ||
| 1325 | break; | ||
| 1326 | case AUDIT_SUBJ_USER: | ||
| 1327 | case AUDIT_SUBJ_ROLE: | ||
| 1328 | case AUDIT_SUBJ_TYPE: | ||
| 1329 | case AUDIT_SUBJ_SEN: | ||
| 1330 | case AUDIT_SUBJ_CLR: | ||
| 1331 | if (f->lsm_rule) { | ||
| 1332 | security_task_getsecid(current, &sid); | ||
| 1333 | result = security_audit_rule_match(sid, | ||
| 1334 | f->type, | ||
| 1335 | f->op, | ||
| 1336 | f->lsm_rule, | ||
| 1337 | NULL); | ||
| 1338 | } | ||
| 1339 | break; | ||
| 1340 | } | ||
| 1341 | |||
| 1342 | if (!result) | ||
| 1343 | return 0; | ||
| 1344 | } | ||
| 1345 | switch (rule->action) { | ||
| 1346 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; | ||
| 1347 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; | ||
| 1348 | } | ||
| 1349 | return 1; | ||
| 1350 | } | ||
| 1351 | |||
| 1352 | int audit_filter_user(int type) | ||
| 1353 | { | ||
| 1354 | enum audit_state state = AUDIT_DISABLED; | ||
| 1355 | struct audit_entry *e; | 1295 | struct audit_entry *e; |
| 1356 | int rc, ret; | 1296 | int ret = 1; /* Audit by default */ |
| 1357 | |||
| 1358 | ret = 1; /* Audit by default */ | ||
| 1359 | |||
| 1360 | rcu_read_lock(); | ||
| 1361 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { | ||
| 1362 | rc = audit_filter_user_rules(&e->rule, type, &state); | ||
| 1363 | if (rc) { | ||
| 1364 | if (rc > 0 && state == AUDIT_DISABLED) | ||
| 1365 | ret = 0; | ||
| 1366 | break; | ||
| 1367 | } | ||
| 1368 | } | ||
| 1369 | rcu_read_unlock(); | ||
| 1370 | |||
| 1371 | return ret; | ||
| 1372 | } | ||
| 1373 | |||
| 1374 | int audit_filter_type(int type) | ||
| 1375 | { | ||
| 1376 | struct audit_entry *e; | ||
| 1377 | int result = 0; | ||
| 1378 | 1297 | ||
| 1379 | rcu_read_lock(); | 1298 | rcu_read_lock(); |
| 1380 | if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE])) | 1299 | if (list_empty(&audit_filter_list[listtype])) |
| 1381 | goto unlock_and_return; | 1300 | goto unlock_and_return; |
| 1301 | list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) { | ||
| 1302 | int i, result = 0; | ||
| 1382 | 1303 | ||
| 1383 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE], | ||
| 1384 | list) { | ||
| 1385 | int i; | ||
| 1386 | for (i = 0; i < e->rule.field_count; i++) { | 1304 | for (i = 0; i < e->rule.field_count; i++) { |
| 1387 | struct audit_field *f = &e->rule.fields[i]; | 1305 | struct audit_field *f = &e->rule.fields[i]; |
| 1388 | if (f->type == AUDIT_MSGTYPE) { | 1306 | pid_t pid; |
| 1389 | result = audit_comparator(type, f->op, f->val); | 1307 | u32 sid; |
| 1390 | if (!result) | 1308 | |
| 1391 | break; | 1309 | switch (f->type) { |
| 1310 | case AUDIT_PID: | ||
| 1311 | pid = task_pid_nr(current); | ||
| 1312 | result = audit_comparator(pid, f->op, f->val); | ||
| 1313 | break; | ||
| 1314 | case AUDIT_UID: | ||
| 1315 | result = audit_uid_comparator(current_uid(), f->op, f->uid); | ||
| 1316 | break; | ||
| 1317 | case AUDIT_GID: | ||
| 1318 | result = audit_gid_comparator(current_gid(), f->op, f->gid); | ||
| 1319 | break; | ||
| 1320 | case AUDIT_LOGINUID: | ||
| 1321 | result = audit_uid_comparator(audit_get_loginuid(current), | ||
| 1322 | f->op, f->uid); | ||
| 1323 | break; | ||
| 1324 | case AUDIT_LOGINUID_SET: | ||
| 1325 | result = audit_comparator(audit_loginuid_set(current), | ||
| 1326 | f->op, f->val); | ||
| 1327 | break; | ||
| 1328 | case AUDIT_MSGTYPE: | ||
| 1329 | result = audit_comparator(msgtype, f->op, f->val); | ||
| 1330 | break; | ||
| 1331 | case AUDIT_SUBJ_USER: | ||
| 1332 | case AUDIT_SUBJ_ROLE: | ||
| 1333 | case AUDIT_SUBJ_TYPE: | ||
| 1334 | case AUDIT_SUBJ_SEN: | ||
| 1335 | case AUDIT_SUBJ_CLR: | ||
| 1336 | if (f->lsm_rule) { | ||
| 1337 | security_task_getsecid(current, &sid); | ||
| 1338 | result = security_audit_rule_match(sid, | ||
| 1339 | f->type, f->op, f->lsm_rule, NULL); | ||
| 1340 | } | ||
| 1341 | break; | ||
| 1342 | default: | ||
| 1343 | goto unlock_and_return; | ||
| 1392 | } | 1344 | } |
| 1345 | if (result < 0) /* error */ | ||
| 1346 | goto unlock_and_return; | ||
| 1347 | if (!result) | ||
| 1348 | break; | ||
| 1349 | } | ||
| 1350 | if (result > 0) { | ||
| 1351 | if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_TYPE) | ||
| 1352 | ret = 0; | ||
| 1353 | break; | ||
| 1393 | } | 1354 | } |
| 1394 | if (result) | ||
| 1395 | goto unlock_and_return; | ||
| 1396 | } | 1355 | } |
| 1397 | unlock_and_return: | 1356 | unlock_and_return: |
| 1398 | rcu_read_unlock(); | 1357 | rcu_read_unlock(); |
| 1399 | return result; | 1358 | return ret; |
| 1400 | } | 1359 | } |
| 1401 | 1360 | ||
| 1402 | static int update_lsm_rule(struct audit_krule *r) | 1361 | static int update_lsm_rule(struct audit_krule *r) |
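
The two old entry points, audit_filter_user() and audit_filter_type(), are folded into the single audit_filter() above. A minimal sketch (not part of the patch) of how the callers in kernel/audit.c now use it; the convention is 1 = audit the message, 0 = filtered out:

/* Sketch only; assumes kernel/audit.h for the audit_filter() prototype. */
static int example_user_msg_allowed(int msg_type)
{
        /* USER list: non-zero means the user message should be processed */
        return audit_filter(msg_type, AUDIT_FILTER_USER);
}

static bool example_record_suppressed(int msg_type)
{
        /* TYPE (exclude) list: audit_log_start() returns NULL when the
         * record type is filtered out */
        return !audit_filter(msg_type, AUDIT_FILTER_TYPE);
}
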
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2672d105cffc..5abf1dc1f91c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -72,6 +72,7 @@
| 72 | #include <linux/compat.h> | 72 | #include <linux/compat.h> |
| 73 | #include <linux/ctype.h> | 73 | #include <linux/ctype.h> |
| 74 | #include <linux/string.h> | 74 | #include <linux/string.h> |
| 75 | #include <linux/uaccess.h> | ||
| 75 | #include <uapi/linux/limits.h> | 76 | #include <uapi/linux/limits.h> |
| 76 | 77 | ||
| 77 | #include "audit.h" | 78 | #include "audit.h" |
@@ -81,7 +82,8 @@
| 81 | #define AUDITSC_SUCCESS 1 | 82 | #define AUDITSC_SUCCESS 1 |
| 82 | #define AUDITSC_FAILURE 2 | 83 | #define AUDITSC_FAILURE 2 |
| 83 | 84 | ||
| 84 | /* no execve audit message should be longer than this (userspace limits) */ | 85 | /* no execve audit message should be longer than this (userspace limits), |
| 86 | * see the note near the top of audit_log_execve_info() about this value */ | ||
| 85 | #define MAX_EXECVE_AUDIT_LEN 7500 | 87 | #define MAX_EXECVE_AUDIT_LEN 7500 |
| 86 | 88 | ||
| 87 | /* max length to print of cmdline/proctitle value during audit */ | 89 | /* max length to print of cmdline/proctitle value during audit */ |
@@ -694,8 +696,12 @@ static int audit_filter_rules(struct task_struct *tsk,
| 694 | ctx->prio = rule->prio; | 696 | ctx->prio = rule->prio; |
| 695 | } | 697 | } |
| 696 | switch (rule->action) { | 698 | switch (rule->action) { |
| 697 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; | 699 | case AUDIT_NEVER: |
| 698 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; | 700 | *state = AUDIT_DISABLED; |
| 701 | break; | ||
| 702 | case AUDIT_ALWAYS: | ||
| 703 | *state = AUDIT_RECORD_CONTEXT; | ||
| 704 | break; | ||
| 699 | } | 705 | } |
| 700 | return 1; | 706 | return 1; |
| 701 | } | 707 | } |
@@ -987,184 +993,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
| 987 | return rc; | 993 | return rc; |
| 988 | } | 994 | } |
| 989 | 995 | ||
| 990 | /* | 996 | static void audit_log_execve_info(struct audit_context *context, |
| 991 | * to_send and len_sent accounting are very loose estimates. We aren't | 997 | struct audit_buffer **ab) |
| 992 | * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being | ||
| 993 | * within about 500 bytes (next page boundary) | ||
| 994 | * | ||
| 995 | * why snprintf? an int is up to 12 digits long. if we just assumed when | ||
| 996 | * logging that a[%d]= was going to be 16 characters long we would be wasting | ||
| 997 | * space in every audit message. In one 7500 byte message we can log up to | ||
| 998 | * about 1000 min size arguments. That comes down to about 50% waste of space | ||
| 999 | * if we didn't do the snprintf to find out how long arg_num_len was. | ||
| 1000 | */ | ||
| 1001 | static int audit_log_single_execve_arg(struct audit_context *context, | ||
| 1002 | struct audit_buffer **ab, | ||
| 1003 | int arg_num, | ||
| 1004 | size_t *len_sent, | ||
| 1005 | const char __user *p, | ||
| 1006 | char *buf) | ||
| 1007 | { | 998 | { |
| 1008 | char arg_num_len_buf[12]; | 999 | long len_max; |
| 1009 | const char __user *tmp_p = p; | 1000 | long len_rem; |
| 1010 | /* how many digits are in arg_num? 5 is the length of ' a=""' */ | 1001 | long len_full; |
| 1011 | size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; | 1002 | long len_buf; |
| 1012 | size_t len, len_left, to_send; | 1003 | long len_abuf; |
| 1013 | size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; | 1004 | long len_tmp; |
| 1014 | unsigned int i, has_cntl = 0, too_long = 0; | 1005 | bool require_data; |
| 1015 | int ret; | 1006 | bool encode; |
| 1016 | 1007 | unsigned int iter; | |
| 1017 | /* strnlen_user includes the null we don't want to send */ | 1008 | unsigned int arg; |
| 1018 | len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1; | 1009 | char *buf_head; |
| 1019 | 1010 | char *buf; | |
| 1020 | /* | 1011 | const char __user *p = (const char __user *)current->mm->arg_start; |
| 1021 | * We just created this mm, if we can't find the strings | 1012 | |
| 1022 | * we just copied into it something is _very_ wrong. Similar | 1013 | /* NOTE: this buffer needs to be large enough to hold all the non-arg |
| 1023 | * for strings that are too long, we should not have created | 1014 | * data we put in the audit record for this argument (see the |
| 1024 | * any. | 1015 | * code below) ... at this point in time 96 is plenty */ |
| 1025 | */ | 1016 | char abuf[96]; |
| 1026 | if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { | 1017 | |
| 1027 | send_sig(SIGKILL, current, 0); | 1018 | /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the |
| 1028 | return -1; | 1019 | * current value of 7500 is not as important as the fact that it |
| 1020 | * is less than 8k, a setting of 7500 gives us plenty of wiggle | ||
| 1021 | * room if we go over a little bit in the logging below */ | ||
| 1022 | WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500); | ||
| 1023 | len_max = MAX_EXECVE_AUDIT_LEN; | ||
| 1024 | |||
| 1025 | /* scratch buffer to hold the userspace args */ | ||
| 1026 | buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); | ||
| 1027 | if (!buf_head) { | ||
| 1028 | audit_panic("out of memory for argv string"); | ||
| 1029 | return; | ||
| 1029 | } | 1030 | } |
| 1031 | buf = buf_head; | ||
| 1030 | 1032 | ||
| 1031 | /* walk the whole argument looking for non-ascii chars */ | 1033 | audit_log_format(*ab, "argc=%d", context->execve.argc); |
| 1034 | |||
| 1035 | len_rem = len_max; | ||
| 1036 | len_buf = 0; | ||
| 1037 | len_full = 0; | ||
| 1038 | require_data = true; | ||
| 1039 | encode = false; | ||
| 1040 | iter = 0; | ||
| 1041 | arg = 0; | ||
| 1032 | do { | 1042 | do { |
| 1033 | if (len_left > MAX_EXECVE_AUDIT_LEN) | 1043 | /* NOTE: we don't ever want to trust this value for anything |
| 1034 | to_send = MAX_EXECVE_AUDIT_LEN; | 1044 | * serious, but the audit record format insists we |
| 1035 | else | 1045 | * provide an argument length for really long arguments, |
| 1036 | to_send = len_left; | 1046 | * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but |
| 1037 | ret = copy_from_user(buf, tmp_p, to_send); | 1047 | * to use strncpy_from_user() to obtain this value for |
| 1038 | /* | 1048 | * recording in the log, although we don't use it |
| 1039 | * There is no reason for this copy to be short. We just | 1049 | * anywhere here to avoid a double-fetch problem */ |
| 1040 | * copied them here, and the mm hasn't been exposed to user- | 1050 | if (len_full == 0) |
| 1041 | * space yet. | 1051 | len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1; |
| 1042 | */ | 1052 | |
| 1043 | if (ret) { | 1053 | /* read more data from userspace */ |
| 1044 | WARN_ON(1); | 1054 | if (require_data) { |
| 1045 | send_sig(SIGKILL, current, 0); | 1055 | /* can we make more room in the buffer? */ |
| 1046 | return -1; | 1056 | if (buf != buf_head) { |
| 1047 | } | 1057 | memmove(buf_head, buf, len_buf); |
| 1048 | buf[to_send] = '\0'; | 1058 | buf = buf_head; |
| 1049 | has_cntl = audit_string_contains_control(buf, to_send); | 1059 | } |
| 1050 | if (has_cntl) { | 1060 | |
| 1051 | /* | 1061 | /* fetch as much as we can of the argument */ |
| 1052 | * hex messages get logged as 2 bytes, so we can only | 1062 | len_tmp = strncpy_from_user(&buf_head[len_buf], p, |
| 1053 | * send half as much in each message | 1063 | len_max - len_buf); |
| 1054 | */ | 1064 | if (len_tmp == -EFAULT) { |
| 1055 | max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2; | 1065 | /* unable to copy from userspace */ |
| 1056 | break; | 1066 | send_sig(SIGKILL, current, 0); |
| 1057 | } | 1067 | goto out; |
| 1058 | len_left -= to_send; | 1068 | } else if (len_tmp == (len_max - len_buf)) { |
| 1059 | tmp_p += to_send; | 1069 | /* buffer is not large enough */ |
| 1060 | } while (len_left > 0); | 1070 | require_data = true; |
| 1061 | 1071 | /* NOTE: if we are going to span multiple | |
| 1062 | len_left = len; | 1072 | * buffers force the encoding so we stand |
| 1063 | 1073 | * a chance at a sane len_full value and | |
| 1064 | if (len > max_execve_audit_len) | 1074 | * consistent record encoding */ |
| 1065 | too_long = 1; | 1075 | encode = true; |
| 1066 | 1076 | len_full = len_full * 2; | |
| 1067 | /* rewalk the argument actually logging the message */ | 1077 | p += len_tmp; |
| 1068 | for (i = 0; len_left > 0; i++) { | 1078 | } else { |
| 1069 | int room_left; | 1079 | require_data = false; |
| 1070 | 1080 | if (!encode) | |
| 1071 | if (len_left > max_execve_audit_len) | 1081 | encode = audit_string_contains_control( |
| 1072 | to_send = max_execve_audit_len; | 1082 | buf, len_tmp); |
| 1073 | else | 1083 | /* try to use a trusted value for len_full */ |
| 1074 | to_send = len_left; | 1084 | if (len_full < len_max) |
| 1075 | 1085 | len_full = (encode ? | |
| 1076 | /* do we have space left to send this argument in this ab? */ | 1086 | len_tmp * 2 : len_tmp); |
| 1077 | room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent; | 1087 | p += len_tmp + 1; |
| 1078 | if (has_cntl) | 1088 | } |
| 1079 | room_left -= (to_send * 2); | 1089 | len_buf += len_tmp; |
| 1080 | else | 1090 | buf_head[len_buf] = '\0'; |
| 1081 | room_left -= to_send; | ||
| 1082 | if (room_left < 0) { | ||
| 1083 | *len_sent = 0; | ||
| 1084 | audit_log_end(*ab); | ||
| 1085 | *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); | ||
| 1086 | if (!*ab) | ||
| 1087 | return 0; | ||
| 1088 | } | ||
| 1089 | 1091 | ||
| 1090 | /* | 1092 | /* length of the buffer in the audit record? */ |
| 1091 | * first record needs to say how long the original string was | 1093 | len_abuf = (encode ? len_buf * 2 : len_buf + 2); |
| 1092 | * so we can be sure nothing was lost. | ||
| 1093 | */ | ||
| 1094 | if ((i == 0) && (too_long)) | ||
| 1095 | audit_log_format(*ab, " a%d_len=%zu", arg_num, | ||
| 1096 | has_cntl ? 2*len : len); | ||
| 1097 | |||
| 1098 | /* | ||
| 1099 | * normally arguments are small enough to fit and we already | ||
| 1100 | * filled buf above when we checked for control characters | ||
| 1101 | * so don't bother with another copy_from_user | ||
| 1102 | */ | ||
| 1103 | if (len >= max_execve_audit_len) | ||
| 1104 | ret = copy_from_user(buf, p, to_send); | ||
| 1105 | else | ||
| 1106 | ret = 0; | ||
| 1107 | if (ret) { | ||
| 1108 | WARN_ON(1); | ||
| 1109 | send_sig(SIGKILL, current, 0); | ||
| 1110 | return -1; | ||
| 1111 | } | 1094 | } |
| 1112 | buf[to_send] = '\0'; | ||
| 1113 | |||
| 1114 | /* actually log it */ | ||
| 1115 | audit_log_format(*ab, " a%d", arg_num); | ||
| 1116 | if (too_long) | ||
| 1117 | audit_log_format(*ab, "[%d]", i); | ||
| 1118 | audit_log_format(*ab, "="); | ||
| 1119 | if (has_cntl) | ||
| 1120 | audit_log_n_hex(*ab, buf, to_send); | ||
| 1121 | else | ||
| 1122 | audit_log_string(*ab, buf); | ||
| 1123 | |||
| 1124 | p += to_send; | ||
| 1125 | len_left -= to_send; | ||
| 1126 | *len_sent += arg_num_len; | ||
| 1127 | if (has_cntl) | ||
| 1128 | *len_sent += to_send * 2; | ||
| 1129 | else | ||
| 1130 | *len_sent += to_send; | ||
| 1131 | } | ||
| 1132 | /* include the null we didn't log */ | ||
| 1133 | return len + 1; | ||
| 1134 | } | ||
| 1135 | 1095 | ||
| 1136 | static void audit_log_execve_info(struct audit_context *context, | 1096 | /* write as much as we can to the audit log */ |
| 1137 | struct audit_buffer **ab) | 1097 | if (len_buf > 0) { |
| 1138 | { | 1098 | /* NOTE: some magic numbers here - basically if we |
| 1139 | int i, len; | 1099 | * can't fit a reasonable amount of data into the |
| 1140 | size_t len_sent = 0; | 1100 | * existing audit buffer, flush it and start with |
| 1141 | const char __user *p; | 1101 | * a new buffer */ |
| 1142 | char *buf; | 1102 | if ((sizeof(abuf) + 8) > len_rem) { |
| 1103 | len_rem = len_max; | ||
| 1104 | audit_log_end(*ab); | ||
| 1105 | *ab = audit_log_start(context, | ||
| 1106 | GFP_KERNEL, AUDIT_EXECVE); | ||
| 1107 | if (!*ab) | ||
| 1108 | goto out; | ||
| 1109 | } | ||
| 1143 | 1110 | ||
| 1144 | p = (const char __user *)current->mm->arg_start; | 1111 | /* create the non-arg portion of the arg record */ |
| 1112 | len_tmp = 0; | ||
| 1113 | if (require_data || (iter > 0) || | ||
| 1114 | ((len_abuf + sizeof(abuf)) > len_rem)) { | ||
| 1115 | if (iter == 0) { | ||
| 1116 | len_tmp += snprintf(&abuf[len_tmp], | ||
| 1117 | sizeof(abuf) - len_tmp, | ||
| 1118 | " a%d_len=%lu", | ||
| 1119 | arg, len_full); | ||
| 1120 | } | ||
| 1121 | len_tmp += snprintf(&abuf[len_tmp], | ||
| 1122 | sizeof(abuf) - len_tmp, | ||
| 1123 | " a%d[%d]=", arg, iter++); | ||
| 1124 | } else | ||
| 1125 | len_tmp += snprintf(&abuf[len_tmp], | ||
| 1126 | sizeof(abuf) - len_tmp, | ||
| 1127 | " a%d=", arg); | ||
| 1128 | WARN_ON(len_tmp >= sizeof(abuf)); | ||
| 1129 | abuf[sizeof(abuf) - 1] = '\0'; | ||
| 1130 | |||
| 1131 | /* log the arg in the audit record */ | ||
| 1132 | audit_log_format(*ab, "%s", abuf); | ||
| 1133 | len_rem -= len_tmp; | ||
| 1134 | len_tmp = len_buf; | ||
| 1135 | if (encode) { | ||
| 1136 | if (len_abuf > len_rem) | ||
| 1137 | len_tmp = len_rem / 2; /* encoding */ | ||
| 1138 | audit_log_n_hex(*ab, buf, len_tmp); | ||
| 1139 | len_rem -= len_tmp * 2; | ||
| 1140 | len_abuf -= len_tmp * 2; | ||
| 1141 | } else { | ||
| 1142 | if (len_abuf > len_rem) | ||
| 1143 | len_tmp = len_rem - 2; /* quotes */ | ||
| 1144 | audit_log_n_string(*ab, buf, len_tmp); | ||
| 1145 | len_rem -= len_tmp + 2; | ||
| 1146 | /* don't subtract the "2" because we still need | ||
| 1147 | * to add quotes to the remaining string */ | ||
| 1148 | len_abuf -= len_tmp; | ||
| 1149 | } | ||
| 1150 | len_buf -= len_tmp; | ||
| 1151 | buf += len_tmp; | ||
| 1152 | } | ||
| 1145 | 1153 | ||
| 1146 | audit_log_format(*ab, "argc=%d", context->execve.argc); | 1154 | /* ready to move to the next argument? */ |
| 1155 | if ((len_buf == 0) && !require_data) { | ||
| 1156 | arg++; | ||
| 1157 | iter = 0; | ||
| 1158 | len_full = 0; | ||
| 1159 | require_data = true; | ||
| 1160 | encode = false; | ||
| 1161 | } | ||
| 1162 | } while (arg < context->execve.argc); | ||
| 1147 | 1163 | ||
| 1148 | /* | 1164 | /* NOTE: the caller handles the final audit_log_end() call */ |
| 1149 | * we need some kernel buffer to hold the userspace args. Just | ||
| 1150 | * allocate one big one rather than allocating one of the right size | ||
| 1151 | * for every single argument inside audit_log_single_execve_arg() | ||
| 1152 | * should be <8k allocation so should be pretty safe. | ||
| 1153 | */ | ||
| 1154 | buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); | ||
| 1155 | if (!buf) { | ||
| 1156 | audit_panic("out of memory for argv string"); | ||
| 1157 | return; | ||
| 1158 | } | ||
| 1159 | 1165 | ||
| 1160 | for (i = 0; i < context->execve.argc; i++) { | 1166 | out: |
| 1161 | len = audit_log_single_execve_arg(context, ab, i, | 1167 | kfree(buf_head); |
| 1162 | &len_sent, p, buf); | ||
| 1163 | if (len <= 0) | ||
| 1164 | break; | ||
| 1165 | p += len; | ||
| 1166 | } | ||
| 1167 | kfree(buf); | ||
| 1168 | } | 1168 | } |
| 1169 | 1169 | ||
| 1170 | static void show_special(struct audit_context *context, int *call_panic) | 1170 | static void show_special(struct audit_context *context, int *call_panic) |
@@ -1425,7 +1425,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
| 1425 | if (context->pwd.dentry && context->pwd.mnt) { | 1425 | if (context->pwd.dentry && context->pwd.mnt) { |
| 1426 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); | 1426 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); |
| 1427 | if (ab) { | 1427 | if (ab) { |
| 1428 | audit_log_d_path(ab, " cwd=", &context->pwd); | 1428 | audit_log_d_path(ab, "cwd=", &context->pwd); |
| 1429 | audit_log_end(ab); | 1429 | audit_log_end(ab); |
| 1430 | } | 1430 | } |
| 1431 | } | 1431 | } |
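
The rewritten audit_log_execve_info() above labels every chunk it emits before writing it out. A simplified, stand-alone restatement of just that labelling decision (not the patch code itself); 'split' stands for the (require_data || iter > 0 || record-would-overflow) condition in the patch:

static int exec_arg_prefix(char *out, size_t len, int arg, int iter,
                           bool split, unsigned long len_full)
{
        int n = 0;

        if (!split)                     /* whole argument fits: " aN=" */
                return snprintf(out, len, " a%d=", arg);
        if (iter == 0)                  /* first fragment: announce the length */
                n += snprintf(out + n, len - n, " a%d_len=%lu", arg, len_full);
        /* every fragment of a split argument: " aN[i]=" */
        n += snprintf(out + n, len - n, " a%d[%d]=", arg, iter);
        return n;
}
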
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 76d5a794e426..633a650d7aeb 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
| 328 | } | 328 | } |
| 329 | 329 | ||
| 330 | /* only called from syscall */ | 330 | /* only called from syscall */ |
| 331 | static int fd_array_map_update_elem(struct bpf_map *map, void *key, | 331 | int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, |
| 332 | void *value, u64 map_flags) | 332 | void *key, void *value, u64 map_flags) |
| 333 | { | 333 | { |
| 334 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 334 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
| 335 | void *new_ptr, *old_ptr; | 335 | void *new_ptr, *old_ptr; |
@@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key,
| 342 | return -E2BIG; | 342 | return -E2BIG; |
| 343 | 343 | ||
| 344 | ufd = *(u32 *)value; | 344 | ufd = *(u32 *)value; |
| 345 | new_ptr = map->ops->map_fd_get_ptr(map, ufd); | 345 | new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); |
| 346 | if (IS_ERR(new_ptr)) | 346 | if (IS_ERR(new_ptr)) |
| 347 | return PTR_ERR(new_ptr); | 347 | return PTR_ERR(new_ptr); |
| 348 | 348 | ||
@@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
| 371 | } | 371 | } |
| 372 | } | 372 | } |
| 373 | 373 | ||
| 374 | static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) | 374 | static void *prog_fd_array_get_ptr(struct bpf_map *map, |
| 375 | struct file *map_file, int fd) | ||
| 375 | { | 376 | { |
| 376 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 377 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
| 377 | struct bpf_prog *prog = bpf_prog_get(fd); | 378 | struct bpf_prog *prog = bpf_prog_get(fd); |
| 379 | |||
| 378 | if (IS_ERR(prog)) | 380 | if (IS_ERR(prog)) |
| 379 | return prog; | 381 | return prog; |
| 380 | 382 | ||
@@ -382,14 +384,13 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
| 382 | bpf_prog_put(prog); | 384 | bpf_prog_put(prog); |
| 383 | return ERR_PTR(-EINVAL); | 385 | return ERR_PTR(-EINVAL); |
| 384 | } | 386 | } |
| 387 | |||
| 385 | return prog; | 388 | return prog; |
| 386 | } | 389 | } |
| 387 | 390 | ||
| 388 | static void prog_fd_array_put_ptr(void *ptr) | 391 | static void prog_fd_array_put_ptr(void *ptr) |
| 389 | { | 392 | { |
| 390 | struct bpf_prog *prog = ptr; | 393 | bpf_prog_put(ptr); |
| 391 | |||
| 392 | bpf_prog_put_rcu(prog); | ||
| 393 | } | 394 | } |
| 394 | 395 | ||
| 395 | /* decrement refcnt of all bpf_progs that are stored in this map */ | 396 | /* decrement refcnt of all bpf_progs that are stored in this map */ |
@@ -407,7 +408,6 @@ static const struct bpf_map_ops prog_array_ops = {
| 407 | .map_free = fd_array_map_free, | 408 | .map_free = fd_array_map_free, |
| 408 | .map_get_next_key = array_map_get_next_key, | 409 | .map_get_next_key = array_map_get_next_key, |
| 409 | .map_lookup_elem = fd_array_map_lookup_elem, | 410 | .map_lookup_elem = fd_array_map_lookup_elem, |
| 410 | .map_update_elem = fd_array_map_update_elem, | ||
| 411 | .map_delete_elem = fd_array_map_delete_elem, | 411 | .map_delete_elem = fd_array_map_delete_elem, |
| 412 | .map_fd_get_ptr = prog_fd_array_get_ptr, | 412 | .map_fd_get_ptr = prog_fd_array_get_ptr, |
| 413 | .map_fd_put_ptr = prog_fd_array_put_ptr, | 413 | .map_fd_put_ptr = prog_fd_array_put_ptr, |
@@ -425,59 +425,105 @@ static int __init register_prog_array_map(void)
| 425 | } | 425 | } |
| 426 | late_initcall(register_prog_array_map); | 426 | late_initcall(register_prog_array_map); |
| 427 | 427 | ||
| 428 | static void perf_event_array_map_free(struct bpf_map *map) | 428 | static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, |
| 429 | struct file *map_file) | ||
| 429 | { | 430 | { |
| 430 | bpf_fd_array_map_clear(map); | 431 | struct bpf_event_entry *ee; |
| 431 | fd_array_map_free(map); | 432 | |
| 433 | ee = kzalloc(sizeof(*ee), GFP_ATOMIC); | ||
| 434 | if (ee) { | ||
| 435 | ee->event = perf_file->private_data; | ||
| 436 | ee->perf_file = perf_file; | ||
| 437 | ee->map_file = map_file; | ||
| 438 | } | ||
| 439 | |||
| 440 | return ee; | ||
| 432 | } | 441 | } |
| 433 | 442 | ||
| 434 | static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) | 443 | static void __bpf_event_entry_free(struct rcu_head *rcu) |
| 435 | { | 444 | { |
| 436 | struct perf_event *event; | 445 | struct bpf_event_entry *ee; |
| 437 | const struct perf_event_attr *attr; | ||
| 438 | struct file *file; | ||
| 439 | 446 | ||
| 440 | file = perf_event_get(fd); | 447 | ee = container_of(rcu, struct bpf_event_entry, rcu); |
| 441 | if (IS_ERR(file)) | 448 | fput(ee->perf_file); |
| 442 | return file; | 449 | kfree(ee); |
| 450 | } | ||
| 443 | 451 | ||
| 444 | event = file->private_data; | 452 | static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) |
| 453 | { | ||
| 454 | call_rcu(&ee->rcu, __bpf_event_entry_free); | ||
| 455 | } | ||
| 445 | 456 | ||
| 446 | attr = perf_event_attrs(event); | 457 | static void *perf_event_fd_array_get_ptr(struct bpf_map *map, |
| 447 | if (IS_ERR(attr)) | 458 | struct file *map_file, int fd) |
| 448 | goto err; | 459 | { |
| 460 | const struct perf_event_attr *attr; | ||
| 461 | struct bpf_event_entry *ee; | ||
| 462 | struct perf_event *event; | ||
| 463 | struct file *perf_file; | ||
| 449 | 464 | ||
| 450 | if (attr->inherit) | 465 | perf_file = perf_event_get(fd); |
| 451 | goto err; | 466 | if (IS_ERR(perf_file)) |
| 467 | return perf_file; | ||
| 452 | 468 | ||
| 453 | if (attr->type == PERF_TYPE_RAW) | 469 | event = perf_file->private_data; |
| 454 | return file; | 470 | ee = ERR_PTR(-EINVAL); |
| 455 | 471 | ||
| 456 | if (attr->type == PERF_TYPE_HARDWARE) | 472 | attr = perf_event_attrs(event); |
| 457 | return file; | 473 | if (IS_ERR(attr) || attr->inherit) |
| 474 | goto err_out; | ||
| 475 | |||
| 476 | switch (attr->type) { | ||
| 477 | case PERF_TYPE_SOFTWARE: | ||
| 478 | if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) | ||
| 479 | goto err_out; | ||
| 480 | /* fall-through */ | ||
| 481 | case PERF_TYPE_RAW: | ||
| 482 | case PERF_TYPE_HARDWARE: | ||
| 483 | ee = bpf_event_entry_gen(perf_file, map_file); | ||
| 484 | if (ee) | ||
| 485 | return ee; | ||
| 486 | ee = ERR_PTR(-ENOMEM); | ||
| 487 | /* fall-through */ | ||
| 488 | default: | ||
| 489 | break; | ||
| 490 | } | ||
| 458 | 491 | ||
| 459 | if (attr->type == PERF_TYPE_SOFTWARE && | 492 | err_out: |
| 460 | attr->config == PERF_COUNT_SW_BPF_OUTPUT) | 493 | fput(perf_file); |
| 461 | return file; | 494 | return ee; |
| 462 | err: | ||
| 463 | fput(file); | ||
| 464 | return ERR_PTR(-EINVAL); | ||
| 465 | } | 495 | } |
| 466 | 496 | ||
| 467 | static void perf_event_fd_array_put_ptr(void *ptr) | 497 | static void perf_event_fd_array_put_ptr(void *ptr) |
| 468 | { | 498 | { |
| 469 | fput((struct file *)ptr); | 499 | bpf_event_entry_free_rcu(ptr); |
| 500 | } | ||
| 501 | |||
| 502 | static void perf_event_fd_array_release(struct bpf_map *map, | ||
| 503 | struct file *map_file) | ||
| 504 | { | ||
| 505 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
| 506 | struct bpf_event_entry *ee; | ||
| 507 | int i; | ||
| 508 | |||
| 509 | rcu_read_lock(); | ||
| 510 | for (i = 0; i < array->map.max_entries; i++) { | ||
| 511 | ee = READ_ONCE(array->ptrs[i]); | ||
| 512 | if (ee && ee->map_file == map_file) | ||
| 513 | fd_array_map_delete_elem(map, &i); | ||
| 514 | } | ||
| 515 | rcu_read_unlock(); | ||
| 470 | } | 516 | } |
| 471 | 517 | ||
| 472 | static const struct bpf_map_ops perf_event_array_ops = { | 518 | static const struct bpf_map_ops perf_event_array_ops = { |
| 473 | .map_alloc = fd_array_map_alloc, | 519 | .map_alloc = fd_array_map_alloc, |
| 474 | .map_free = perf_event_array_map_free, | 520 | .map_free = fd_array_map_free, |
| 475 | .map_get_next_key = array_map_get_next_key, | 521 | .map_get_next_key = array_map_get_next_key, |
| 476 | .map_lookup_elem = fd_array_map_lookup_elem, | 522 | .map_lookup_elem = fd_array_map_lookup_elem, |
| 477 | .map_update_elem = fd_array_map_update_elem, | ||
| 478 | .map_delete_elem = fd_array_map_delete_elem, | 523 | .map_delete_elem = fd_array_map_delete_elem, |
| 479 | .map_fd_get_ptr = perf_event_fd_array_get_ptr, | 524 | .map_fd_get_ptr = perf_event_fd_array_get_ptr, |
| 480 | .map_fd_put_ptr = perf_event_fd_array_put_ptr, | 525 | .map_fd_put_ptr = perf_event_fd_array_put_ptr, |
| 526 | .map_release = perf_event_fd_array_release, | ||
| 481 | }; | 527 | }; |
| 482 | 528 | ||
| 483 | static struct bpf_map_type_list perf_event_array_type __read_mostly = { | 529 | static struct bpf_map_type_list perf_event_array_type __read_mostly = { |
@@ -491,3 +537,46 @@ static int __init register_perf_event_array_map(void)
| 491 | return 0; | 537 | return 0; |
| 492 | } | 538 | } |
| 493 | late_initcall(register_perf_event_array_map); | 539 | late_initcall(register_perf_event_array_map); |
| 540 | |||
| 541 | #ifdef CONFIG_SOCK_CGROUP_DATA | ||
| 542 | static void *cgroup_fd_array_get_ptr(struct bpf_map *map, | ||
| 543 | struct file *map_file /* not used */, | ||
| 544 | int fd) | ||
| 545 | { | ||
| 546 | return cgroup_get_from_fd(fd); | ||
| 547 | } | ||
| 548 | |||
| 549 | static void cgroup_fd_array_put_ptr(void *ptr) | ||
| 550 | { | ||
| 551 | /* cgroup_put() frees cgrp after an RCU grace period */ | ||
| 552 | cgroup_put(ptr); | ||
| 553 | } | ||
| 554 | |||
| 555 | static void cgroup_fd_array_free(struct bpf_map *map) | ||
| 556 | { | ||
| 557 | bpf_fd_array_map_clear(map); | ||
| 558 | fd_array_map_free(map); | ||
| 559 | } | ||
| 560 | |||
| 561 | static const struct bpf_map_ops cgroup_array_ops = { | ||
| 562 | .map_alloc = fd_array_map_alloc, | ||
| 563 | .map_free = cgroup_fd_array_free, | ||
| 564 | .map_get_next_key = array_map_get_next_key, | ||
| 565 | .map_lookup_elem = fd_array_map_lookup_elem, | ||
| 566 | .map_delete_elem = fd_array_map_delete_elem, | ||
| 567 | .map_fd_get_ptr = cgroup_fd_array_get_ptr, | ||
| 568 | .map_fd_put_ptr = cgroup_fd_array_put_ptr, | ||
| 569 | }; | ||
| 570 | |||
| 571 | static struct bpf_map_type_list cgroup_array_type __read_mostly = { | ||
| 572 | .ops = &cgroup_array_ops, | ||
| 573 | .type = BPF_MAP_TYPE_CGROUP_ARRAY, | ||
| 574 | }; | ||
| 575 | |||
| 576 | static int __init register_cgroup_array_map(void) | ||
| 577 | { | ||
| 578 | bpf_register_map_type(&cgroup_array_type); | ||
| 579 | return 0; | ||
| 580 | } | ||
| 581 | late_initcall(register_cgroup_array_map); | ||
| 582 | #endif | ||
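
All three fd-backed arrays above (prog, perf_event, cgroup) follow the same ->map_fd_get_ptr()/->map_fd_put_ptr() contract, and the owning map file is now threaded through to get_ptr(). A hypothetical sketch of that contract; the example_obj_* helpers are made up for illustration:

static void *example_fd_get_ptr(struct bpf_map *map,
                                struct file *map_file, int fd)
{
        /* translate the fd into a reference the map slot will own;
         * return an ERR_PTR() on failure (example_obj_get_from_fd() is
         * hypothetical) */
        return example_obj_get_from_fd(fd);
}

static void example_fd_put_ptr(void *ptr)
{
        /* drop the reference taken above; the perf_event variant defers
         * this to an RCU callback, the cgroup variant uses cgroup_put() */
        example_obj_put(ptr);
}
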
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b94a36550591..03fd23d4d587 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -719,14 +719,13 @@ select_insn:
| 719 | 719 | ||
| 720 | if (unlikely(index >= array->map.max_entries)) | 720 | if (unlikely(index >= array->map.max_entries)) |
| 721 | goto out; | 721 | goto out; |
| 722 | |||
| 723 | if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) | 722 | if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) |
| 724 | goto out; | 723 | goto out; |
| 725 | 724 | ||
| 726 | tail_call_cnt++; | 725 | tail_call_cnt++; |
| 727 | 726 | ||
| 728 | prog = READ_ONCE(array->ptrs[index]); | 727 | prog = READ_ONCE(array->ptrs[index]); |
| 729 | if (unlikely(!prog)) | 728 | if (!prog) |
| 730 | goto out; | 729 | goto out; |
| 731 | 730 | ||
| 732 | /* ARG1 at this point is guaranteed to point to CTX from | 731 | /* ARG1 at this point is guaranteed to point to CTX from |
@@ -1055,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
| 1055 | return NULL; | 1054 | return NULL; |
| 1056 | } | 1055 | } |
| 1057 | 1056 | ||
| 1058 | const struct bpf_func_proto * __weak bpf_get_event_output_proto(void) | 1057 | u64 __weak |
| 1058 | bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, | ||
| 1059 | void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) | ||
| 1059 | { | 1060 | { |
| 1060 | return NULL; | 1061 | return -ENOTSUPP; |
| 1061 | } | 1062 | } |
| 1062 | 1063 | ||
| 1063 | /* Always built-in helper functions. */ | 1064 | /* Always built-in helper functions. */ |
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ad7a0573f71b..1ea3afba1a4f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = {
| 101 | 101 | ||
| 102 | static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 102 | static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) |
| 103 | { | 103 | { |
| 104 | return raw_smp_processor_id(); | 104 | return smp_processor_id(); |
| 105 | } | 105 | } |
| 106 | 106 | ||
| 107 | const struct bpf_func_proto bpf_get_smp_processor_id_proto = { | 107 | const struct bpf_func_proto bpf_get_smp_processor_id_proto = { |
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 318858edb1cd..5967b870a895 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -11,7 +11,7 @@
| 11 | * version 2 as published by the Free Software Foundation. | 11 | * version 2 as published by the Free Software Foundation. |
| 12 | */ | 12 | */ |
| 13 | 13 | ||
| 14 | #include <linux/module.h> | 14 | #include <linux/init.h> |
| 15 | #include <linux/magic.h> | 15 | #include <linux/magic.h> |
| 16 | #include <linux/major.h> | 16 | #include <linux/major.h> |
| 17 | #include <linux/mount.h> | 17 | #include <linux/mount.h> |
@@ -367,8 +367,6 @@ static struct file_system_type bpf_fs_type = {
| 367 | .kill_sb = kill_litter_super, | 367 | .kill_sb = kill_litter_super, |
| 368 | }; | 368 | }; |
| 369 | 369 | ||
| 370 | MODULE_ALIAS_FS("bpf"); | ||
| 371 | |||
| 372 | static int __init bpf_init(void) | 370 | static int __init bpf_init(void) |
| 373 | { | 371 | { |
| 374 | int ret; | 372 | int ret; |
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 080a2dfb5800..bf4495fcd25d 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -99,7 +99,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
| 99 | if (err) | 99 | if (err) |
| 100 | goto free_smap; | 100 | goto free_smap; |
| 101 | 101 | ||
| 102 | err = get_callchain_buffers(); | 102 | err = get_callchain_buffers(sysctl_perf_event_max_stack); |
| 103 | if (err) | 103 | if (err) |
| 104 | goto free_smap; | 104 | goto free_smap; |
| 105 | 105 | ||
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 46ecce4b79ed..228f962447a5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map)
| 124 | 124 | ||
| 125 | static int bpf_map_release(struct inode *inode, struct file *filp) | 125 | static int bpf_map_release(struct inode *inode, struct file *filp) |
| 126 | { | 126 | { |
| 127 | bpf_map_put_with_uref(filp->private_data); | 127 | struct bpf_map *map = filp->private_data; |
| 128 | |||
| 129 | if (map->ops->map_release) | ||
| 130 | map->ops->map_release(map, filp); | ||
| 131 | |||
| 132 | bpf_map_put_with_uref(map); | ||
| 128 | return 0; | 133 | return 0; |
| 129 | } | 134 | } |
| 130 | 135 | ||
@@ -387,6 +392,13 @@ static int map_update_elem(union bpf_attr *attr)
| 387 | err = bpf_percpu_hash_update(map, key, value, attr->flags); | 392 | err = bpf_percpu_hash_update(map, key, value, attr->flags); |
| 388 | } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { | 393 | } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { |
| 389 | err = bpf_percpu_array_update(map, key, value, attr->flags); | 394 | err = bpf_percpu_array_update(map, key, value, attr->flags); |
| 395 | } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || | ||
| 396 | map->map_type == BPF_MAP_TYPE_PROG_ARRAY || | ||
| 397 | map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { | ||
| 398 | rcu_read_lock(); | ||
| 399 | err = bpf_fd_array_map_update_elem(map, f.file, key, value, | ||
| 400 | attr->flags); | ||
| 401 | rcu_read_unlock(); | ||
| 390 | } else { | 402 | } else { |
| 391 | rcu_read_lock(); | 403 | rcu_read_lock(); |
| 392 | err = map->ops->map_update_elem(map, key, value, attr->flags); | 404 | err = map->ops->map_update_elem(map, key, value, attr->flags); |
@@ -612,7 +624,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
| 612 | free_uid(user); | 624 | free_uid(user); |
| 613 | } | 625 | } |
| 614 | 626 | ||
| 615 | static void __prog_put_common(struct rcu_head *rcu) | 627 | static void __bpf_prog_put_rcu(struct rcu_head *rcu) |
| 616 | { | 628 | { |
| 617 | struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); | 629 | struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); |
| 618 | 630 | ||
@@ -621,17 +633,10 @@ static void __prog_put_common(struct rcu_head *rcu)
| 621 | bpf_prog_free(aux->prog); | 633 | bpf_prog_free(aux->prog); |
| 622 | } | 634 | } |
| 623 | 635 | ||
| 624 | /* version of bpf_prog_put() that is called after a grace period */ | ||
| 625 | void bpf_prog_put_rcu(struct bpf_prog *prog) | ||
| 626 | { | ||
| 627 | if (atomic_dec_and_test(&prog->aux->refcnt)) | ||
| 628 | call_rcu(&prog->aux->rcu, __prog_put_common); | ||
| 629 | } | ||
| 630 | |||
| 631 | void bpf_prog_put(struct bpf_prog *prog) | 636 | void bpf_prog_put(struct bpf_prog *prog) |
| 632 | { | 637 | { |
| 633 | if (atomic_dec_and_test(&prog->aux->refcnt)) | 638 | if (atomic_dec_and_test(&prog->aux->refcnt)) |
| 634 | __prog_put_common(&prog->aux->rcu); | 639 | call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); |
| 635 | } | 640 | } |
| 636 | EXPORT_SYMBOL_GPL(bpf_prog_put); | 641 | EXPORT_SYMBOL_GPL(bpf_prog_put); |
| 637 | 642 | ||
@@ -639,7 +644,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
| 639 | { | 644 | { |
| 640 | struct bpf_prog *prog = filp->private_data; | 645 | struct bpf_prog *prog = filp->private_data; |
| 641 | 646 | ||
| 642 | bpf_prog_put_rcu(prog); | 647 | bpf_prog_put(prog); |
| 643 | return 0; | 648 | return 0; |
| 644 | } | 649 | } |
| 645 | 650 | ||
@@ -653,7 +658,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog)
| 653 | O_RDWR | O_CLOEXEC); | 658 | O_RDWR | O_CLOEXEC); |
| 654 | } | 659 | } |
| 655 | 660 | ||
| 656 | static struct bpf_prog *__bpf_prog_get(struct fd f) | 661 | static struct bpf_prog *____bpf_prog_get(struct fd f) |
| 657 | { | 662 | { |
| 658 | if (!f.file) | 663 | if (!f.file) |
| 659 | return ERR_PTR(-EBADF); | 664 | return ERR_PTR(-EBADF); |
@@ -665,33 +670,50 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
| 665 | return f.file->private_data; | 670 | return f.file->private_data; |
| 666 | } | 671 | } |
| 667 | 672 | ||
| 668 | struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) | 673 | struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) |
| 669 | { | 674 | { |
| 670 | if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { | 675 | if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { |
| 671 | atomic_dec(&prog->aux->refcnt); | 676 | atomic_sub(i, &prog->aux->refcnt); |
| 672 | return ERR_PTR(-EBUSY); | 677 | return ERR_PTR(-EBUSY); |
| 673 | } | 678 | } |
| 674 | return prog; | 679 | return prog; |
| 675 | } | 680 | } |
| 681 | EXPORT_SYMBOL_GPL(bpf_prog_add); | ||
| 676 | 682 | ||
| 677 | /* called by sockets/tracing/seccomp before attaching program to an event | 683 | struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) |
| 678 | * pairs with bpf_prog_put() | 684 | { |
| 679 | */ | 685 | return bpf_prog_add(prog, 1); |
| 680 | struct bpf_prog *bpf_prog_get(u32 ufd) | 686 | } |
| 687 | |||
| 688 | static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) | ||
| 681 | { | 689 | { |
| 682 | struct fd f = fdget(ufd); | 690 | struct fd f = fdget(ufd); |
| 683 | struct bpf_prog *prog; | 691 | struct bpf_prog *prog; |
| 684 | 692 | ||
| 685 | prog = __bpf_prog_get(f); | 693 | prog = ____bpf_prog_get(f); |
| 686 | if (IS_ERR(prog)) | 694 | if (IS_ERR(prog)) |
| 687 | return prog; | 695 | return prog; |
| 696 | if (type && prog->type != *type) { | ||
| 697 | prog = ERR_PTR(-EINVAL); | ||
| 698 | goto out; | ||
| 699 | } | ||
| 688 | 700 | ||
| 689 | prog = bpf_prog_inc(prog); | 701 | prog = bpf_prog_inc(prog); |
| 702 | out: | ||
| 690 | fdput(f); | 703 | fdput(f); |
| 691 | |||
| 692 | return prog; | 704 | return prog; |
| 693 | } | 705 | } |
| 694 | EXPORT_SYMBOL_GPL(bpf_prog_get); | 706 | |
| 707 | struct bpf_prog *bpf_prog_get(u32 ufd) | ||
| 708 | { | ||
| 709 | return __bpf_prog_get(ufd, NULL); | ||
| 710 | } | ||
| 711 | |||
| 712 | struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) | ||
| 713 | { | ||
| 714 | return __bpf_prog_get(ufd, &type); | ||
| 715 | } | ||
| 716 | EXPORT_SYMBOL_GPL(bpf_prog_get_type); | ||
| 695 | 717 | ||
| 696 | /* last field in 'union bpf_attr' used by this command */ | 718 | /* last field in 'union bpf_attr' used by this command */ |
| 697 | #define BPF_PROG_LOAD_LAST_FIELD kern_version | 719 | #define BPF_PROG_LOAD_LAST_FIELD kern_version |
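
bpf_prog_get_type() above fetches and type-checks a program fd in one step; a minimal sketch of the calling pattern (the program type chosen here is only an example):

static int example_attach(u32 ufd)
{
        struct bpf_prog *prog;

        prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SCHED_CLS);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        /* ... wire the program up to its hook ... */

        bpf_prog_put(prog);     /* now the only put variant; frees via RCU */
        return 0;
}
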
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eec9f90ba030..f72f23b8fdab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -653,6 +653,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
| 653 | 653 | ||
| 654 | #define MAX_PACKET_OFF 0xffff | 654 | #define MAX_PACKET_OFF 0xffff |
| 655 | 655 | ||
| 656 | static bool may_write_pkt_data(enum bpf_prog_type type) | ||
| 657 | { | ||
| 658 | switch (type) { | ||
| 659 | case BPF_PROG_TYPE_XDP: | ||
| 660 | return true; | ||
| 661 | default: | ||
| 662 | return false; | ||
| 663 | } | ||
| 664 | } | ||
| 665 | |||
| 656 | static int check_packet_access(struct verifier_env *env, u32 regno, int off, | 666 | static int check_packet_access(struct verifier_env *env, u32 regno, int off, |
| 657 | int size) | 667 | int size) |
| 658 | { | 668 | { |
@@ -713,6 +723,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
| 713 | switch (env->prog->type) { | 723 | switch (env->prog->type) { |
| 714 | case BPF_PROG_TYPE_SCHED_CLS: | 724 | case BPF_PROG_TYPE_SCHED_CLS: |
| 715 | case BPF_PROG_TYPE_SCHED_ACT: | 725 | case BPF_PROG_TYPE_SCHED_ACT: |
| 726 | case BPF_PROG_TYPE_XDP: | ||
| 716 | break; | 727 | break; |
| 717 | default: | 728 | default: |
| 718 | verbose("verifier is misconfigured\n"); | 729 | verbose("verifier is misconfigured\n"); |
@@ -805,10 +816,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
| 805 | err = check_stack_read(state, off, size, value_regno); | 816 | err = check_stack_read(state, off, size, value_regno); |
| 806 | } | 817 | } |
| 807 | } else if (state->regs[regno].type == PTR_TO_PACKET) { | 818 | } else if (state->regs[regno].type == PTR_TO_PACKET) { |
| 808 | if (t == BPF_WRITE) { | 819 | if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { |
| 809 | verbose("cannot write into packet\n"); | 820 | verbose("cannot write into packet\n"); |
| 810 | return -EACCES; | 821 | return -EACCES; |
| 811 | } | 822 | } |
| 823 | if (t == BPF_WRITE && value_regno >= 0 && | ||
| 824 | is_pointer_value(env, value_regno)) { | ||
| 825 | verbose("R%d leaks addr into packet\n", value_regno); | ||
| 826 | return -EACCES; | ||
| 827 | } | ||
| 812 | err = check_packet_access(env, regno, off, size); | 828 | err = check_packet_access(env, regno, off, size); |
| 813 | if (!err && t == BPF_READ && value_regno >= 0) | 829 | if (!err && t == BPF_READ && value_regno >= 0) |
| 814 | mark_reg_unknown_value(state->regs, value_regno); | 830 | mark_reg_unknown_value(state->regs, value_regno); |
@@ -1035,6 +1051,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
| 1035 | if (func_id != BPF_FUNC_get_stackid) | 1051 | if (func_id != BPF_FUNC_get_stackid) |
| 1036 | goto error; | 1052 | goto error; |
| 1037 | break; | 1053 | break; |
| 1054 | case BPF_MAP_TYPE_CGROUP_ARRAY: | ||
| 1055 | if (func_id != BPF_FUNC_skb_in_cgroup) | ||
| 1056 | goto error; | ||
| 1057 | break; | ||
| 1038 | default: | 1058 | default: |
| 1039 | break; | 1059 | break; |
| 1040 | } | 1060 | } |
@@ -1054,6 +1074,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
| 1054 | if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) | 1074 | if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) |
| 1055 | goto error; | 1075 | goto error; |
| 1056 | break; | 1076 | break; |
| 1077 | case BPF_FUNC_skb_in_cgroup: | ||
| 1078 | if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) | ||
| 1079 | goto error; | ||
| 1080 | break; | ||
| 1057 | default: | 1081 | default: |
| 1058 | break; | 1082 | break; |
| 1059 | } | 1083 | } |
diff --git a/kernel/capability.c b/kernel/capability.c
index 45432b54d5c6..00411c82dac5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -361,6 +361,24 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
| 361 | return has_ns_capability_noaudit(t, &init_user_ns, cap); | 361 | return has_ns_capability_noaudit(t, &init_user_ns, cap); |
| 362 | } | 362 | } |
| 363 | 363 | ||
| 364 | static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) | ||
| 365 | { | ||
| 366 | int capable; | ||
| 367 | |||
| 368 | if (unlikely(!cap_valid(cap))) { | ||
| 369 | pr_crit("capable() called with invalid cap=%u\n", cap); | ||
| 370 | BUG(); | ||
| 371 | } | ||
| 372 | |||
| 373 | capable = audit ? security_capable(current_cred(), ns, cap) : | ||
| 374 | security_capable_noaudit(current_cred(), ns, cap); | ||
| 375 | if (capable == 0) { | ||
| 376 | current->flags |= PF_SUPERPRIV; | ||
| 377 | return true; | ||
| 378 | } | ||
| 379 | return false; | ||
| 380 | } | ||
| 381 | |||
| 364 | /** | 382 | /** |
| 365 | * ns_capable - Determine if the current task has a superior capability in effect | 383 | * ns_capable - Determine if the current task has a superior capability in effect |
| 366 | * @ns: The usernamespace we want the capability in | 384 | * @ns: The usernamespace we want the capability in |
@@ -374,19 +392,27 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
| 374 | */ | 392 | */ |
| 375 | bool ns_capable(struct user_namespace *ns, int cap) | 393 | bool ns_capable(struct user_namespace *ns, int cap) |
| 376 | { | 394 | { |
| 377 | if (unlikely(!cap_valid(cap))) { | 395 | return ns_capable_common(ns, cap, true); |
| 378 | pr_crit("capable() called with invalid cap=%u\n", cap); | ||
| 379 | BUG(); | ||
| 380 | } | ||
| 381 | |||
| 382 | if (security_capable(current_cred(), ns, cap) == 0) { | ||
| 383 | current->flags |= PF_SUPERPRIV; | ||
| 384 | return true; | ||
| 385 | } | ||
| 386 | return false; | ||
| 387 | } | 396 | } |
| 388 | EXPORT_SYMBOL(ns_capable); | 397 | EXPORT_SYMBOL(ns_capable); |
| 389 | 398 | ||
| 399 | /** | ||
| 400 | * ns_capable_noaudit - Determine if the current task has a superior capability | ||
| 401 | * (unaudited) in effect | ||
| 402 | * @ns: The usernamespace we want the capability in | ||
| 403 | * @cap: The capability to be tested for | ||
| 404 | * | ||
| 405 | * Return true if the current task has the given superior capability currently | ||
| 406 | * available for use, false if not. | ||
| 407 | * | ||
| 408 | * This sets PF_SUPERPRIV on the task if the capability is available on the | ||
| 409 | * assumption that it's about to be used. | ||
| 410 | */ | ||
| 411 | bool ns_capable_noaudit(struct user_namespace *ns, int cap) | ||
| 412 | { | ||
| 413 | return ns_capable_common(ns, cap, false); | ||
| 414 | } | ||
| 415 | EXPORT_SYMBOL(ns_capable_noaudit); | ||
| 390 | 416 | ||
| 391 | /** | 417 | /** |
| 392 | * capable - Determine if the current task has a superior capability in effect | 418 | * capable - Determine if the current task has a superior capability in effect |
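
ns_capable_noaudit() above is the non-auditing twin of ns_capable(), intended for probes that are expected to fail during normal operation. A minimal sketch of a call site:

static bool example_may_admin(struct user_namespace *ns)
{
        /* same semantics as ns_capable(), including setting PF_SUPERPRIV
         * on success, but no audit record is generated for the check */
        return ns_capable_noaudit(ns, CAP_SYS_ADMIN);
}
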
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 75c0ff00aca6..d1c51b7f5221 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -61,7 +61,7 @@
| 61 | #include <linux/cpuset.h> | 61 | #include <linux/cpuset.h> |
| 62 | #include <linux/proc_ns.h> | 62 | #include <linux/proc_ns.h> |
| 63 | #include <linux/nsproxy.h> | 63 | #include <linux/nsproxy.h> |
| 64 | #include <linux/proc_ns.h> | 64 | #include <linux/file.h> |
| 65 | #include <net/sock.h> | 65 | #include <net/sock.h> |
| 66 | 66 | ||
| 67 | /* | 67 | /* |
@@ -1160,18 +1160,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
| 1160 | { | 1160 | { |
| 1161 | lockdep_assert_held(&cgroup_mutex); | 1161 | lockdep_assert_held(&cgroup_mutex); |
| 1162 | 1162 | ||
| 1163 | if (root->hierarchy_id) { | 1163 | idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); |
| 1164 | idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); | ||
| 1165 | root->hierarchy_id = 0; | ||
| 1166 | } | ||
| 1167 | } | 1164 | } |
| 1168 | 1165 | ||
| 1169 | static void cgroup_free_root(struct cgroup_root *root) | 1166 | static void cgroup_free_root(struct cgroup_root *root) |
| 1170 | { | 1167 | { |
| 1171 | if (root) { | 1168 | if (root) { |
| 1172 | /* hierarchy ID should already have been released */ | ||
| 1173 | WARN_ON_ONCE(root->hierarchy_id); | ||
| 1174 | |||
| 1175 | idr_destroy(&root->cgroup_idr); | 1169 | idr_destroy(&root->cgroup_idr); |
| 1176 | kfree(root); | 1170 | kfree(root); |
| 1177 | } | 1171 | } |
@@ -2215,12 +2209,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
| 2215 | goto out_unlock; | 2209 | goto out_unlock; |
| 2216 | } | 2210 | } |
| 2217 | 2211 | ||
| 2218 | /* | 2212 | /* Hierarchies may only be created in the initial cgroup namespace. */ |
| 2219 | * We know this subsystem has not yet been bound. Users in a non-init | 2213 | if (ns != &init_cgroup_ns) { |
| 2220 | * user namespace may only mount hierarchies with no bound subsystems, | ||
| 2221 | * i.e. 'none,name=user1' | ||
| 2222 | */ | ||
| 2223 | if (!opts.none && !capable(CAP_SYS_ADMIN)) { | ||
| 2224 | ret = -EPERM; | 2214 | ret = -EPERM; |
| 2225 | goto out_unlock; | 2215 | goto out_unlock; |
| 2226 | } | 2216 | } |
@@ -2962,6 +2952,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
| 2962 | int retval = 0; | 2952 | int retval = 0; |
| 2963 | 2953 | ||
| 2964 | mutex_lock(&cgroup_mutex); | 2954 | mutex_lock(&cgroup_mutex); |
| 2955 | percpu_down_write(&cgroup_threadgroup_rwsem); | ||
| 2965 | for_each_root(root) { | 2956 | for_each_root(root) { |
| 2966 | struct cgroup *from_cgrp; | 2957 | struct cgroup *from_cgrp; |
| 2967 | 2958 | ||
@@ -2976,6 +2967,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
| 2976 | if (retval) | 2967 | if (retval) |
| 2977 | break; | 2968 | break; |
| 2978 | } | 2969 | } |
| 2970 | percpu_up_write(&cgroup_threadgroup_rwsem); | ||
| 2979 | mutex_unlock(&cgroup_mutex); | 2971 | mutex_unlock(&cgroup_mutex); |
| 2980 | 2972 | ||
| 2981 | return retval; | 2973 | return retval; |
@@ -4343,6 +4335,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
| 4343 | 4335 | ||
| 4344 | mutex_lock(&cgroup_mutex); | 4336 | mutex_lock(&cgroup_mutex); |
| 4345 | 4337 | ||
| 4338 | percpu_down_write(&cgroup_threadgroup_rwsem); | ||
| 4339 | |||
| 4346 | /* all tasks in @from are being moved, all csets are source */ | 4340 | /* all tasks in @from are being moved, all csets are source */ |
| 4347 | spin_lock_irq(&css_set_lock); | 4341 | spin_lock_irq(&css_set_lock); |
| 4348 | list_for_each_entry(link, &from->cset_links, cset_link) | 4342 | list_for_each_entry(link, &from->cset_links, cset_link) |
@@ -4371,6 +4365,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
| 4371 | } while (task && !ret); | 4365 | } while (task && !ret); |
| 4372 | out_err: | 4366 | out_err: |
| 4373 | cgroup_migrate_finish(&preloaded_csets); | 4367 | cgroup_migrate_finish(&preloaded_csets); |
| 4368 | percpu_up_write(&cgroup_threadgroup_rwsem); | ||
| 4374 | mutex_unlock(&cgroup_mutex); | 4369 | mutex_unlock(&cgroup_mutex); |
| 4375 | return ret; | 4370 | return ret; |
| 4376 | } | 4371 | } |
| @@ -5146,6 +5141,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, | |||
| 5146 | lockdep_assert_held(&cgroup_mutex); | 5141 | lockdep_assert_held(&cgroup_mutex); |
| 5147 | 5142 | ||
| 5148 | css = ss->css_alloc(parent_css); | 5143 | css = ss->css_alloc(parent_css); |
| 5144 | if (!css) | ||
| 5145 | css = ERR_PTR(-ENOMEM); | ||
| 5149 | if (IS_ERR(css)) | 5146 | if (IS_ERR(css)) |
| 5150 | return css; | 5147 | return css; |
| 5151 | 5148 | ||
| @@ -6172,7 +6169,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, | |||
| 6172 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) | 6169 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) |
| 6173 | { | 6170 | { |
| 6174 | WARN_ON_ONCE(!rcu_read_lock_held()); | 6171 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| 6175 | return id > 0 ? idr_find(&ss->css_idr, id) : NULL; | 6172 | return idr_find(&ss->css_idr, id); |
| 6176 | } | 6173 | } |
| 6177 | 6174 | ||
| 6178 | /** | 6175 | /** |
| @@ -6209,6 +6206,40 @@ struct cgroup *cgroup_get_from_path(const char *path) | |||
| 6209 | } | 6206 | } |
| 6210 | EXPORT_SYMBOL_GPL(cgroup_get_from_path); | 6207 | EXPORT_SYMBOL_GPL(cgroup_get_from_path); |
| 6211 | 6208 | ||
| 6209 | /** | ||
| 6210 | * cgroup_get_from_fd - get a cgroup pointer from a fd | ||
| 6211 | * @fd: fd obtained by open(cgroup2_dir) | ||
| 6212 | * | ||
| 6213 | * Find the cgroup from a fd which should be obtained | ||
| 6214 | * by opening a cgroup directory. Returns a pointer to the | ||
| 6215 | * cgroup on success. ERR_PTR is returned if the cgroup | ||
| 6216 | * cannot be found. | ||
| 6217 | */ | ||
| 6218 | struct cgroup *cgroup_get_from_fd(int fd) | ||
| 6219 | { | ||
| 6220 | struct cgroup_subsys_state *css; | ||
| 6221 | struct cgroup *cgrp; | ||
| 6222 | struct file *f; | ||
| 6223 | |||
| 6224 | f = fget_raw(fd); | ||
| 6225 | if (!f) | ||
| 6226 | return ERR_PTR(-EBADF); | ||
| 6227 | |||
| 6228 | css = css_tryget_online_from_dir(f->f_path.dentry, NULL); | ||
| 6229 | fput(f); | ||
| 6230 | if (IS_ERR(css)) | ||
| 6231 | return ERR_CAST(css); | ||
| 6232 | |||
| 6233 | cgrp = css->cgroup; | ||
| 6234 | if (!cgroup_on_dfl(cgrp)) { | ||
| 6235 | cgroup_put(cgrp); | ||
| 6236 | return ERR_PTR(-EBADF); | ||
| 6237 | } | ||
| 6238 | |||
| 6239 | return cgrp; | ||
| 6240 | } | ||
| 6241 | EXPORT_SYMBOL_GPL(cgroup_get_from_fd); | ||
| 6242 | |||
| 6212 | /* | 6243 | /* |
| 6213 | * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data | 6244 | * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data |
| 6214 | * definition in cgroup-defs.h. | 6245 | * definition in cgroup-defs.h. |
| @@ -6309,14 +6340,11 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, | |||
| 6309 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) | 6340 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) |
| 6310 | return ERR_PTR(-EPERM); | 6341 | return ERR_PTR(-EPERM); |
| 6311 | 6342 | ||
| 6312 | mutex_lock(&cgroup_mutex); | 6343 | /* It is not safe to take cgroup_mutex here */ |
| 6313 | spin_lock_irq(&css_set_lock); | 6344 | spin_lock_irq(&css_set_lock); |
| 6314 | |||
| 6315 | cset = task_css_set(current); | 6345 | cset = task_css_set(current); |
| 6316 | get_css_set(cset); | 6346 | get_css_set(cset); |
| 6317 | |||
| 6318 | spin_unlock_irq(&css_set_lock); | 6347 | spin_unlock_irq(&css_set_lock); |
| 6319 | mutex_unlock(&cgroup_mutex); | ||
| 6320 | 6348 | ||
| 6321 | new_ns = alloc_cgroup_ns(); | 6349 | new_ns = alloc_cgroup_ns(); |
| 6322 | if (IS_ERR(new_ns)) { | 6350 | if (IS_ERR(new_ns)) { |
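The cgroup.c hunks above add cgroup_get_from_fd(), which resolves a file descriptor for an opened cgroup2 directory into a referenced struct cgroup, or an ERR_PTR on failure (-EBADF for a bad fd or a v1 hierarchy). A minimal, hedged sketch of a kernel-side caller; the function name example_lookup_cgroup and what it does with the cgroup are illustrative, not part of the patch:

#include <linux/cgroup.h>
#include <linux/err.h>

/* Sketch only: resolve a user-supplied cgroup2 directory fd. The
 * returned cgroup carries a reference that must be dropped again. */
static int example_lookup_cgroup(int fd)
{
	struct cgroup *cgrp;

	cgrp = cgroup_get_from_fd(fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);	/* bad fd or not on the v2 hierarchy */

	/* ... use cgrp here, e.g. remember an identifier ... */

	cgroup_put(cgrp);
	return 0;
}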
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index 303097b37429..2bd673783f1a 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c | |||
| @@ -49,6 +49,12 @@ struct pids_cgroup { | |||
| 49 | */ | 49 | */ |
| 50 | atomic64_t counter; | 50 | atomic64_t counter; |
| 51 | int64_t limit; | 51 | int64_t limit; |
| 52 | |||
| 53 | /* Handle for "pids.events" */ | ||
| 54 | struct cgroup_file events_file; | ||
| 55 | |||
| 56 | /* Number of times fork failed because limit was hit. */ | ||
| 57 | atomic64_t events_limit; | ||
| 52 | }; | 58 | }; |
| 53 | 59 | ||
| 54 | static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) | 60 | static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) |
| @@ -72,6 +78,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent) | |||
| 72 | 78 | ||
| 73 | pids->limit = PIDS_MAX; | 79 | pids->limit = PIDS_MAX; |
| 74 | atomic64_set(&pids->counter, 0); | 80 | atomic64_set(&pids->counter, 0); |
| 81 | atomic64_set(&pids->events_limit, 0); | ||
| 75 | return &pids->css; | 82 | return &pids->css; |
| 76 | } | 83 | } |
| 77 | 84 | ||
| @@ -213,10 +220,21 @@ static int pids_can_fork(struct task_struct *task) | |||
| 213 | { | 220 | { |
| 214 | struct cgroup_subsys_state *css; | 221 | struct cgroup_subsys_state *css; |
| 215 | struct pids_cgroup *pids; | 222 | struct pids_cgroup *pids; |
| 223 | int err; | ||
| 216 | 224 | ||
| 217 | css = task_css_check(current, pids_cgrp_id, true); | 225 | css = task_css_check(current, pids_cgrp_id, true); |
| 218 | pids = css_pids(css); | 226 | pids = css_pids(css); |
| 219 | return pids_try_charge(pids, 1); | 227 | err = pids_try_charge(pids, 1); |
| 228 | if (err) { | ||
| 229 | /* Only log the first time events_limit is incremented. */ | ||
| 230 | if (atomic64_inc_return(&pids->events_limit) == 1) { | ||
| 231 | pr_info("cgroup: fork rejected by pids controller in "); | ||
| 232 | pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id)); | ||
| 233 | pr_cont("\n"); | ||
| 234 | } | ||
| 235 | cgroup_file_notify(&pids->events_file); | ||
| 236 | } | ||
| 237 | return err; | ||
| 220 | } | 238 | } |
| 221 | 239 | ||
| 222 | static void pids_cancel_fork(struct task_struct *task) | 240 | static void pids_cancel_fork(struct task_struct *task) |
| @@ -288,6 +306,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css, | |||
| 288 | return atomic64_read(&pids->counter); | 306 | return atomic64_read(&pids->counter); |
| 289 | } | 307 | } |
| 290 | 308 | ||
| 309 | static int pids_events_show(struct seq_file *sf, void *v) | ||
| 310 | { | ||
| 311 | struct pids_cgroup *pids = css_pids(seq_css(sf)); | ||
| 312 | |||
| 313 | seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit)); | ||
| 314 | return 0; | ||
| 315 | } | ||
| 316 | |||
| 291 | static struct cftype pids_files[] = { | 317 | static struct cftype pids_files[] = { |
| 292 | { | 318 | { |
| 293 | .name = "max", | 319 | .name = "max", |
| @@ -300,6 +326,12 @@ static struct cftype pids_files[] = { | |||
| 300 | .read_s64 = pids_current_read, | 326 | .read_s64 = pids_current_read, |
| 301 | .flags = CFTYPE_NOT_ON_ROOT, | 327 | .flags = CFTYPE_NOT_ON_ROOT, |
| 302 | }, | 328 | }, |
| 329 | { | ||
| 330 | .name = "events", | ||
| 331 | .seq_show = pids_events_show, | ||
| 332 | .file_offset = offsetof(struct pids_cgroup, events_file), | ||
| 333 | .flags = CFTYPE_NOT_ON_ROOT, | ||
| 334 | }, | ||
| 303 | { } /* terminate */ | 335 | { } /* terminate */ |
| 304 | }; | 336 | }; |
| 305 | 337 | ||
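The pids controller hunks add an events_limit counter and a "pids.events" file that reports it as a single "max <count>" line; the counter is bumped (and the file notified via cgroup_file_notify()) each time a fork is rejected by the limit. A small userspace sketch that reads the counter; the mount point and group name below are assumptions for illustration:

#include <stdio.h>

/* Sketch: read the number of forks rejected by the pids limit.
 * Adjust the path to your cgroup layout. */
int main(void)
{
	unsigned long long failcnt = 0;
	FILE *f = fopen("/sys/fs/cgroup/pids/mygroup/pids.events", "r");

	if (!f)
		return 1;
	if (fscanf(f, "max %llu", &failcnt) == 1)
		printf("forks rejected: %llu\n", failcnt);
	fclose(f);
	return 0;
}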
diff --git a/kernel/cpu.c b/kernel/cpu.c index 7b61887f7ccd..341bf80f80bd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -517,6 +517,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, | |||
| 517 | if (!cpu_online(cpu)) | 517 | if (!cpu_online(cpu)) |
| 518 | return 0; | 518 | return 0; |
| 519 | 519 | ||
| 520 | /* | ||
| 521 | * If we are up and running, use the hotplug thread. For early calls | ||
| 522 | * we invoke the thread function directly. | ||
| 523 | */ | ||
| 524 | if (!st->thread) | ||
| 525 | return cpuhp_invoke_callback(cpu, state, cb); | ||
| 526 | |||
| 520 | st->cb_state = state; | 527 | st->cb_state = state; |
| 521 | st->cb = cb; | 528 | st->cb = cb; |
| 522 | /* | 529 | /* |
| @@ -1173,6 +1180,31 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
| 1173 | .teardown = NULL, | 1180 | .teardown = NULL, |
| 1174 | .cant_stop = true, | 1181 | .cant_stop = true, |
| 1175 | }, | 1182 | }, |
| 1183 | [CPUHP_PERF_PREPARE] = { | ||
| 1184 | .name = "perf prepare", | ||
| 1185 | .startup = perf_event_init_cpu, | ||
| 1186 | .teardown = perf_event_exit_cpu, | ||
| 1187 | }, | ||
| 1188 | [CPUHP_WORKQUEUE_PREP] = { | ||
| 1189 | .name = "workqueue prepare", | ||
| 1190 | .startup = workqueue_prepare_cpu, | ||
| 1191 | .teardown = NULL, | ||
| 1192 | }, | ||
| 1193 | [CPUHP_HRTIMERS_PREPARE] = { | ||
| 1194 | .name = "hrtimers prepare", | ||
| 1195 | .startup = hrtimers_prepare_cpu, | ||
| 1196 | .teardown = hrtimers_dead_cpu, | ||
| 1197 | }, | ||
| 1198 | [CPUHP_SMPCFD_PREPARE] = { | ||
| 1199 | .name = "SMPCFD prepare", | ||
| 1200 | .startup = smpcfd_prepare_cpu, | ||
| 1201 | .teardown = smpcfd_dead_cpu, | ||
| 1202 | }, | ||
| 1203 | [CPUHP_RCUTREE_PREP] = { | ||
| 1204 | .name = "RCU-tree prepare", | ||
| 1205 | .startup = rcutree_prepare_cpu, | ||
| 1206 | .teardown = rcutree_dead_cpu, | ||
| 1207 | }, | ||
| 1176 | /* | 1208 | /* |
| 1177 | * Preparatory and dead notifiers. Will be replaced once the notifiers | 1209 | * Preparatory and dead notifiers. Will be replaced once the notifiers |
| 1178 | * are converted to states. | 1210 | * are converted to states. |
| @@ -1184,6 +1216,16 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
| 1184 | .skip_onerr = true, | 1216 | .skip_onerr = true, |
| 1185 | .cant_stop = true, | 1217 | .cant_stop = true, |
| 1186 | }, | 1218 | }, |
| 1219 | /* | ||
| 1220 | * On the tear-down path, timers_dead_cpu() must be invoked | ||
| 1221 | * before blk_mq_queue_reinit_notify() from notify_dead(), | ||
| 1222 | * otherwise an RCU stall occurs. | ||
| 1223 | */ | ||
| 1224 | [CPUHP_TIMERS_DEAD] = { | ||
| 1225 | .name = "timers dead", | ||
| 1226 | .startup = NULL, | ||
| 1227 | .teardown = timers_dead_cpu, | ||
| 1228 | }, | ||
| 1187 | /* Kicks the plugged cpu into life */ | 1229 | /* Kicks the plugged cpu into life */ |
| 1188 | [CPUHP_BRINGUP_CPU] = { | 1230 | [CPUHP_BRINGUP_CPU] = { |
| 1189 | .name = "cpu:bringup", | 1231 | .name = "cpu:bringup", |
| @@ -1191,6 +1233,10 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
| 1191 | .teardown = NULL, | 1233 | .teardown = NULL, |
| 1192 | .cant_stop = true, | 1234 | .cant_stop = true, |
| 1193 | }, | 1235 | }, |
| 1236 | [CPUHP_AP_SMPCFD_DYING] = { | ||
| 1237 | .startup = NULL, | ||
| 1238 | .teardown = smpcfd_dying_cpu, | ||
| 1239 | }, | ||
| 1194 | /* | 1240 | /* |
| 1195 | * Handled on control processor until the plugged processor manages | 1241 | * Handled on control processor until the plugged processor manages |
| 1196 | * this itself. | 1242 | * this itself. |
| @@ -1227,6 +1273,10 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
| 1227 | .startup = sched_cpu_starting, | 1273 | .startup = sched_cpu_starting, |
| 1228 | .teardown = sched_cpu_dying, | 1274 | .teardown = sched_cpu_dying, |
| 1229 | }, | 1275 | }, |
| 1276 | [CPUHP_AP_RCUTREE_DYING] = { | ||
| 1277 | .startup = NULL, | ||
| 1278 | .teardown = rcutree_dying_cpu, | ||
| 1279 | }, | ||
| 1230 | /* | 1280 | /* |
| 1231 | * Low level startup/teardown notifiers. Run with interrupts | 1281 | * Low level startup/teardown notifiers. Run with interrupts |
| 1232 | * disabled. Will be removed once the notifiers are converted to | 1282 | * disabled. Will be removed once the notifiers are converted to |
| @@ -1250,6 +1300,22 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
| 1250 | .startup = smpboot_unpark_threads, | 1300 | .startup = smpboot_unpark_threads, |
| 1251 | .teardown = NULL, | 1301 | .teardown = NULL, |
| 1252 | }, | 1302 | }, |
| 1303 | [CPUHP_AP_PERF_ONLINE] = { | ||
| 1304 | .name = "perf online", | ||
| 1305 | .startup = perf_event_init_cpu, | ||
| 1306 | .teardown = perf_event_exit_cpu, | ||
| 1307 | }, | ||
| 1308 | [CPUHP_AP_WORKQUEUE_ONLINE] = { | ||
| 1309 | .name = "workqueue online", | ||
| 1310 | .startup = workqueue_online_cpu, | ||
| 1311 | .teardown = workqueue_offline_cpu, | ||
| 1312 | }, | ||
| 1313 | [CPUHP_AP_RCUTREE_ONLINE] = { | ||
| 1314 | .name = "RCU-tree online", | ||
| 1315 | .startup = rcutree_online_cpu, | ||
| 1316 | .teardown = rcutree_offline_cpu, | ||
| 1317 | }, | ||
| 1318 | |||
| 1253 | /* | 1319 | /* |
| 1254 | * Online/down_prepare notifiers. Will be removed once the notifiers | 1320 | * Online/down_prepare notifiers. Will be removed once the notifiers |
| 1255 | * are converted to states. | 1321 | * are converted to states. |
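The cpu.c hunks fill in more entries of the cpuhp state tables (perf, workqueue, hrtimers, SMP call-function data, RCU tree, timers), each pairing a startup callback with a symmetric teardown callback instead of a multiplexed notifier. A hedged sketch of the callback shape such an entry expects; the "foo" names and the CPUHP_FOO_PREPARE slot are hypothetical:

/* Illustrative only: the callbacks referenced from cpuhp_bp_states[]
 * take the CPU number and return 0 on success or a negative errno. */
static int foo_prepare_cpu(unsigned int cpu)
{
	/* allocate per-cpu state before the CPU is brought up */
	return 0;
}

static int foo_dead_cpu(unsigned int cpu)
{
	/* tear the per-cpu state down after the CPU is gone */
	return 0;
}

/* A (hypothetical) table entry pairing the two would look like:
 *
 *	[CPUHP_FOO_PREPARE] = {
 *		.name		= "foo prepare",
 *		.startup	= foo_prepare_cpu,
 *		.teardown	= foo_dead_cpu,
 *	},
 */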
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 73e93e53884d..c7fd2778ed50 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
| 1034 | { | 1034 | { |
| 1035 | bool need_loop; | 1035 | bool need_loop; |
| 1036 | 1036 | ||
| 1037 | /* | ||
| 1038 | * Allow tasks that have access to memory reserves because they have | ||
| 1039 | * been OOM killed to get memory anywhere. | ||
| 1040 | */ | ||
| 1041 | if (unlikely(test_thread_flag(TIF_MEMDIE))) | ||
| 1042 | return; | ||
| 1043 | if (current->flags & PF_EXITING) /* Let dying task have memory */ | ||
| 1044 | return; | ||
| 1045 | |||
| 1046 | task_lock(tsk); | 1037 | task_lock(tsk); |
| 1047 | /* | 1038 | /* |
| 1048 | * Determine if a loop is necessary if another thread is doing | 1039 | * Determine if a loop is necessary if another thread is doing |
diff --git a/kernel/cred.c b/kernel/cred.c index 0c0cd8a62285..5f264fb5737d 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
| @@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx); | |||
| 689 | */ | 689 | */ |
| 690 | int set_create_files_as(struct cred *new, struct inode *inode) | 690 | int set_create_files_as(struct cred *new, struct inode *inode) |
| 691 | { | 691 | { |
| 692 | if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) | ||
| 693 | return -EINVAL; | ||
| 692 | new->fsuid = inode->i_uid; | 694 | new->fsuid = inode->i_uid; |
| 693 | new->fsgid = inode->i_gid; | 695 | new->fsgid = inode->i_gid; |
| 694 | return security_kernel_create_files_as(new, inode); | 696 | return security_kernel_create_files_as(new, inode); |
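With the cred.c change, set_create_files_as() rejects inodes whose uid or gid is invalid (for instance, ids with no mapping in the current user namespace) with -EINVAL, so callers can no longer ignore its return value. A minimal hedged sketch; example_set_fs_creds() and its use of commit_creds() are illustrative only:

#include <linux/cred.h>
#include <linux/fs.h>

/* Sketch: prepare credentials for kernel-internal file creation and
 * handle the new -EINVAL failure from set_create_files_as(). */
static int example_set_fs_creds(struct inode *dir)
{
	struct cred *new;
	int err;

	new = prepare_creds();
	if (!new)
		return -ENOMEM;

	err = set_create_files_as(new, dir);	/* may now fail with -EINVAL */
	if (err) {
		abort_creds(new);
		return err;
	}

	return commit_creds(new);
}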
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 179ef4640964..e9fdb5203de5 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c | |||
| @@ -104,7 +104,7 @@ fail: | |||
| 104 | return -ENOMEM; | 104 | return -ENOMEM; |
| 105 | } | 105 | } |
| 106 | 106 | ||
| 107 | int get_callchain_buffers(void) | 107 | int get_callchain_buffers(int event_max_stack) |
| 108 | { | 108 | { |
| 109 | int err = 0; | 109 | int err = 0; |
| 110 | int count; | 110 | int count; |
| @@ -121,6 +121,15 @@ int get_callchain_buffers(void) | |||
| 121 | /* If the allocation failed, give up */ | 121 | /* If the allocation failed, give up */ |
| 122 | if (!callchain_cpus_entries) | 122 | if (!callchain_cpus_entries) |
| 123 | err = -ENOMEM; | 123 | err = -ENOMEM; |
| 124 | /* | ||
| 125 | * If the per-event request exceeds the global cap, | ||
| 126 | * return a distinct error so that userspace can tell | ||
| 127 | * the two failure modes apart. | ||
| 128 | * | ||
| 129 | * And also do it here so that we have &callchain_mutex held. | ||
| 130 | */ | ||
| 131 | if (event_max_stack > sysctl_perf_event_max_stack) | ||
| 132 | err = -EOVERFLOW; | ||
| 124 | goto exit; | 133 | goto exit; |
| 125 | } | 134 | } |
| 126 | 135 | ||
| @@ -174,11 +183,12 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) | |||
| 174 | bool user = !event->attr.exclude_callchain_user; | 183 | bool user = !event->attr.exclude_callchain_user; |
| 175 | /* Disallow cross-task user callchains. */ | 184 | /* Disallow cross-task user callchains. */ |
| 176 | bool crosstask = event->ctx->task && event->ctx->task != current; | 185 | bool crosstask = event->ctx->task && event->ctx->task != current; |
| 186 | const u32 max_stack = event->attr.sample_max_stack; | ||
| 177 | 187 | ||
| 178 | if (!kernel && !user) | 188 | if (!kernel && !user) |
| 179 | return NULL; | 189 | return NULL; |
| 180 | 190 | ||
| 181 | return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true); | 191 | return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); |
| 182 | } | 192 | } |
| 183 | 193 | ||
| 184 | struct perf_callchain_entry * | 194 | struct perf_callchain_entry * |
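get_callchain_buffers() now takes the per-event stack depth and fails with -EOVERFLOW when it exceeds the kernel.perf_event_max_stack sysctl, and perf_callchain() honours attr.sample_max_stack instead of the global value. A hedged userspace sketch of requesting a per-event cap; it assumes a uapi perf_event.h that already carries the sample_max_stack field, and the event configuration is purely illustrative:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

/* Sketch: sample cycles on the calling thread with a 32-frame
 * callchain cap. If 32 exceeded kernel.perf_event_max_stack, the
 * open would be expected to fail with EOVERFLOW. */
static int open_sampling_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_CALLCHAIN;
	attr.sample_max_stack = 32;	/* 0 would mean "use the sysctl default" */

	return syscall(__NR_perf_event_open, &attr,
		       0 /* pid: self */, -1 /* cpu: any */,
		       -1 /* group fd */, 0 /* flags */);
}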
diff --git a/kernel/events/core.c b/kernel/events/core.c index 43d43a2d5811..356a6c7cb52a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -335,6 +335,7 @@ static atomic_t perf_sched_count; | |||
| 335 | 335 | ||
| 336 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | 336 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); |
| 337 | static DEFINE_PER_CPU(int, perf_sched_cb_usages); | 337 | static DEFINE_PER_CPU(int, perf_sched_cb_usages); |
| 338 | static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); | ||
| 338 | 339 | ||
| 339 | static atomic_t nr_mmap_events __read_mostly; | 340 | static atomic_t nr_mmap_events __read_mostly; |
| 340 | static atomic_t nr_comm_events __read_mostly; | 341 | static atomic_t nr_comm_events __read_mostly; |
| @@ -396,6 +397,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write, | |||
| 396 | if (ret || !write) | 397 | if (ret || !write) |
| 397 | return ret; | 398 | return ret; |
| 398 | 399 | ||
| 400 | /* | ||
| 401 | * If throttling is disabled, don't allow the write: | ||
| 402 | */ | ||
| 403 | if (sysctl_perf_cpu_time_max_percent == 100 || | ||
| 404 | sysctl_perf_cpu_time_max_percent == 0) | ||
| 405 | return -EINVAL; | ||
| 406 | |||
| 399 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); | 407 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); |
| 400 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; | 408 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; |
| 401 | update_perf_cpu_limits(); | 409 | update_perf_cpu_limits(); |
| @@ -3686,6 +3694,39 @@ static void free_event_rcu(struct rcu_head *head) | |||
| 3686 | static void ring_buffer_attach(struct perf_event *event, | 3694 | static void ring_buffer_attach(struct perf_event *event, |
| 3687 | struct ring_buffer *rb); | 3695 | struct ring_buffer *rb); |
| 3688 | 3696 | ||
| 3697 | static void detach_sb_event(struct perf_event *event) | ||
| 3698 | { | ||
| 3699 | struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); | ||
| 3700 | |||
| 3701 | raw_spin_lock(&pel->lock); | ||
| 3702 | list_del_rcu(&event->sb_list); | ||
| 3703 | raw_spin_unlock(&pel->lock); | ||
| 3704 | } | ||
| 3705 | |||
| 3706 | static bool is_sb_event(struct perf_event *event) | ||
| 3707 | { | ||
| 3708 | struct perf_event_attr *attr = &event->attr; | ||
| 3709 | |||
| 3710 | if (event->parent) | ||
| 3711 | return false; | ||
| 3712 | |||
| 3713 | if (event->attach_state & PERF_ATTACH_TASK) | ||
| 3714 | return false; | ||
| 3715 | |||
| 3716 | if (attr->mmap || attr->mmap_data || attr->mmap2 || | ||
| 3717 | attr->comm || attr->comm_exec || | ||
| 3718 | attr->task || | ||
| 3719 | attr->context_switch) | ||
| 3720 | return true; | ||
| 3721 | return false; | ||
| 3722 | } | ||
| 3723 | |||
| 3724 | static void unaccount_pmu_sb_event(struct perf_event *event) | ||
| 3725 | { | ||
| 3726 | if (is_sb_event(event)) | ||
| 3727 | detach_sb_event(event); | ||
| 3728 | } | ||
| 3729 | |||
| 3689 | static void unaccount_event_cpu(struct perf_event *event, int cpu) | 3730 | static void unaccount_event_cpu(struct perf_event *event, int cpu) |
| 3690 | { | 3731 | { |
| 3691 | if (event->parent) | 3732 | if (event->parent) |
| @@ -3749,6 +3790,8 @@ static void unaccount_event(struct perf_event *event) | |||
| 3749 | } | 3790 | } |
| 3750 | 3791 | ||
| 3751 | unaccount_event_cpu(event, event->cpu); | 3792 | unaccount_event_cpu(event, event->cpu); |
| 3793 | |||
| 3794 | unaccount_pmu_sb_event(event); | ||
| 3752 | } | 3795 | } |
| 3753 | 3796 | ||
| 3754 | static void perf_sched_delayed(struct work_struct *work) | 3797 | static void perf_sched_delayed(struct work_struct *work) |
| @@ -5574,16 +5617,26 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
| 5574 | } | 5617 | } |
| 5575 | 5618 | ||
| 5576 | if (sample_type & PERF_SAMPLE_RAW) { | 5619 | if (sample_type & PERF_SAMPLE_RAW) { |
| 5577 | if (data->raw) { | 5620 | struct perf_raw_record *raw = data->raw; |
| 5578 | u32 raw_size = data->raw->size; | 5621 | |
| 5579 | u32 real_size = round_up(raw_size + sizeof(u32), | 5622 | if (raw) { |
| 5580 | sizeof(u64)) - sizeof(u32); | 5623 | struct perf_raw_frag *frag = &raw->frag; |
| 5581 | u64 zero = 0; | 5624 | |
| 5582 | 5625 | perf_output_put(handle, raw->size); | |
| 5583 | perf_output_put(handle, real_size); | 5626 | do { |
| 5584 | __output_copy(handle, data->raw->data, raw_size); | 5627 | if (frag->copy) { |
| 5585 | if (real_size - raw_size) | 5628 | __output_custom(handle, frag->copy, |
| 5586 | __output_copy(handle, &zero, real_size - raw_size); | 5629 | frag->data, frag->size); |
| 5630 | } else { | ||
| 5631 | __output_copy(handle, frag->data, | ||
| 5632 | frag->size); | ||
| 5633 | } | ||
| 5634 | if (perf_raw_frag_last(frag)) | ||
| 5635 | break; | ||
| 5636 | frag = frag->next; | ||
| 5637 | } while (1); | ||
| 5638 | if (frag->pad) | ||
| 5639 | __output_skip(handle, NULL, frag->pad); | ||
| 5587 | } else { | 5640 | } else { |
| 5588 | struct { | 5641 | struct { |
| 5589 | u32 size; | 5642 | u32 size; |
| @@ -5708,14 +5761,28 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
| 5708 | } | 5761 | } |
| 5709 | 5762 | ||
| 5710 | if (sample_type & PERF_SAMPLE_RAW) { | 5763 | if (sample_type & PERF_SAMPLE_RAW) { |
| 5711 | int size = sizeof(u32); | 5764 | struct perf_raw_record *raw = data->raw; |
| 5712 | 5765 | int size; | |
| 5713 | if (data->raw) | 5766 | |
| 5714 | size += data->raw->size; | 5767 | if (raw) { |
| 5715 | else | 5768 | struct perf_raw_frag *frag = &raw->frag; |
| 5716 | size += sizeof(u32); | 5769 | u32 sum = 0; |
| 5770 | |||
| 5771 | do { | ||
| 5772 | sum += frag->size; | ||
| 5773 | if (perf_raw_frag_last(frag)) | ||
| 5774 | break; | ||
| 5775 | frag = frag->next; | ||
| 5776 | } while (1); | ||
| 5777 | |||
| 5778 | size = round_up(sum + sizeof(u32), sizeof(u64)); | ||
| 5779 | raw->size = size - sizeof(u32); | ||
| 5780 | frag->pad = raw->size - sum; | ||
| 5781 | } else { | ||
| 5782 | size = sizeof(u64); | ||
| 5783 | } | ||
| 5717 | 5784 | ||
| 5718 | header->size += round_up(size, sizeof(u64)); | 5785 | header->size += size; |
| 5719 | } | 5786 | } |
| 5720 | 5787 | ||
| 5721 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { | 5788 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { |
| @@ -5875,11 +5942,11 @@ perf_event_read_event(struct perf_event *event, | |||
| 5875 | perf_output_end(&handle); | 5942 | perf_output_end(&handle); |
| 5876 | } | 5943 | } |
| 5877 | 5944 | ||
| 5878 | typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); | 5945 | typedef void (perf_iterate_f)(struct perf_event *event, void *data); |
| 5879 | 5946 | ||
| 5880 | static void | 5947 | static void |
| 5881 | perf_event_aux_ctx(struct perf_event_context *ctx, | 5948 | perf_iterate_ctx(struct perf_event_context *ctx, |
| 5882 | perf_event_aux_output_cb output, | 5949 | perf_iterate_f output, |
| 5883 | void *data, bool all) | 5950 | void *data, bool all) |
| 5884 | { | 5951 | { |
| 5885 | struct perf_event *event; | 5952 | struct perf_event *event; |
| @@ -5896,52 +5963,55 @@ perf_event_aux_ctx(struct perf_event_context *ctx, | |||
| 5896 | } | 5963 | } |
| 5897 | } | 5964 | } |
| 5898 | 5965 | ||
| 5899 | static void | 5966 | static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) |
| 5900 | perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, | ||
| 5901 | struct perf_event_context *task_ctx) | ||
| 5902 | { | 5967 | { |
| 5903 | rcu_read_lock(); | 5968 | struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); |
| 5904 | preempt_disable(); | 5969 | struct perf_event *event; |
| 5905 | perf_event_aux_ctx(task_ctx, output, data, false); | 5970 | |
| 5906 | preempt_enable(); | 5971 | list_for_each_entry_rcu(event, &pel->list, sb_list) { |
| 5907 | rcu_read_unlock(); | 5972 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
| 5973 | continue; | ||
| 5974 | if (!event_filter_match(event)) | ||
| 5975 | continue; | ||
| 5976 | output(event, data); | ||
| 5977 | } | ||
| 5908 | } | 5978 | } |
| 5909 | 5979 | ||
| 5980 | /* | ||
| 5981 | * Iterate all events that need to receive side-band events. | ||
| 5982 | * | ||
| 5983 | * For new callers, ensure that account_pmu_sb_event() includes | ||
| 5984 | * your event, otherwise it might not get delivered. | ||
| 5985 | */ | ||
| 5910 | static void | 5986 | static void |
| 5911 | perf_event_aux(perf_event_aux_output_cb output, void *data, | 5987 | perf_iterate_sb(perf_iterate_f output, void *data, |
| 5912 | struct perf_event_context *task_ctx) | 5988 | struct perf_event_context *task_ctx) |
| 5913 | { | 5989 | { |
| 5914 | struct perf_cpu_context *cpuctx; | ||
| 5915 | struct perf_event_context *ctx; | 5990 | struct perf_event_context *ctx; |
| 5916 | struct pmu *pmu; | ||
| 5917 | int ctxn; | 5991 | int ctxn; |
| 5918 | 5992 | ||
| 5993 | rcu_read_lock(); | ||
| 5994 | preempt_disable(); | ||
| 5995 | |||
| 5919 | /* | 5996 | /* |
| 5920 | * If we have task_ctx != NULL we only notify | 5997 | * If we have task_ctx != NULL we only notify the task context itself. |
| 5921 | * the task context itself. The task_ctx is set | 5998 | * The task_ctx is set only for EXIT events before releasing task |
| 5922 | * only for EXIT events before releasing task | ||
| 5923 | * context. | 5999 | * context. |
| 5924 | */ | 6000 | */ |
| 5925 | if (task_ctx) { | 6001 | if (task_ctx) { |
| 5926 | perf_event_aux_task_ctx(output, data, task_ctx); | 6002 | perf_iterate_ctx(task_ctx, output, data, false); |
| 5927 | return; | 6003 | goto done; |
| 5928 | } | 6004 | } |
| 5929 | 6005 | ||
| 5930 | rcu_read_lock(); | 6006 | perf_iterate_sb_cpu(output, data); |
| 5931 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 6007 | |
| 5932 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 6008 | for_each_task_context_nr(ctxn) { |
| 5933 | if (cpuctx->unique_pmu != pmu) | ||
| 5934 | goto next; | ||
| 5935 | perf_event_aux_ctx(&cpuctx->ctx, output, data, false); | ||
| 5936 | ctxn = pmu->task_ctx_nr; | ||
| 5937 | if (ctxn < 0) | ||
| 5938 | goto next; | ||
| 5939 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | 6009 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); |
| 5940 | if (ctx) | 6010 | if (ctx) |
| 5941 | perf_event_aux_ctx(ctx, output, data, false); | 6011 | perf_iterate_ctx(ctx, output, data, false); |
| 5942 | next: | ||
| 5943 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
| 5944 | } | 6012 | } |
| 6013 | done: | ||
| 6014 | preempt_enable(); | ||
| 5945 | rcu_read_unlock(); | 6015 | rcu_read_unlock(); |
| 5946 | } | 6016 | } |
| 5947 | 6017 | ||
| @@ -5990,7 +6060,7 @@ void perf_event_exec(void) | |||
| 5990 | 6060 | ||
| 5991 | perf_event_enable_on_exec(ctxn); | 6061 | perf_event_enable_on_exec(ctxn); |
| 5992 | 6062 | ||
| 5993 | perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL, | 6063 | perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, |
| 5994 | true); | 6064 | true); |
| 5995 | } | 6065 | } |
| 5996 | rcu_read_unlock(); | 6066 | rcu_read_unlock(); |
| @@ -6034,9 +6104,9 @@ static int __perf_pmu_output_stop(void *info) | |||
| 6034 | }; | 6104 | }; |
| 6035 | 6105 | ||
| 6036 | rcu_read_lock(); | 6106 | rcu_read_lock(); |
| 6037 | perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); | 6107 | perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); |
| 6038 | if (cpuctx->task_ctx) | 6108 | if (cpuctx->task_ctx) |
| 6039 | perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, | 6109 | perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, |
| 6040 | &ro, false); | 6110 | &ro, false); |
| 6041 | rcu_read_unlock(); | 6111 | rcu_read_unlock(); |
| 6042 | 6112 | ||
| @@ -6165,7 +6235,7 @@ static void perf_event_task(struct task_struct *task, | |||
| 6165 | }, | 6235 | }, |
| 6166 | }; | 6236 | }; |
| 6167 | 6237 | ||
| 6168 | perf_event_aux(perf_event_task_output, | 6238 | perf_iterate_sb(perf_event_task_output, |
| 6169 | &task_event, | 6239 | &task_event, |
| 6170 | task_ctx); | 6240 | task_ctx); |
| 6171 | } | 6241 | } |
| @@ -6244,7 +6314,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
| 6244 | 6314 | ||
| 6245 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 6315 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
| 6246 | 6316 | ||
| 6247 | perf_event_aux(perf_event_comm_output, | 6317 | perf_iterate_sb(perf_event_comm_output, |
| 6248 | comm_event, | 6318 | comm_event, |
| 6249 | NULL); | 6319 | NULL); |
| 6250 | } | 6320 | } |
| @@ -6475,7 +6545,7 @@ got_name: | |||
| 6475 | 6545 | ||
| 6476 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; | 6546 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; |
| 6477 | 6547 | ||
| 6478 | perf_event_aux(perf_event_mmap_output, | 6548 | perf_iterate_sb(perf_event_mmap_output, |
| 6479 | mmap_event, | 6549 | mmap_event, |
| 6480 | NULL); | 6550 | NULL); |
| 6481 | 6551 | ||
| @@ -6558,7 +6628,7 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) | |||
| 6558 | if (!ctx) | 6628 | if (!ctx) |
| 6559 | continue; | 6629 | continue; |
| 6560 | 6630 | ||
| 6561 | perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true); | 6631 | perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); |
| 6562 | } | 6632 | } |
| 6563 | rcu_read_unlock(); | 6633 | rcu_read_unlock(); |
| 6564 | } | 6634 | } |
| @@ -6745,7 +6815,7 @@ static void perf_event_switch(struct task_struct *task, | |||
| 6745 | }, | 6815 | }, |
| 6746 | }; | 6816 | }; |
| 6747 | 6817 | ||
| 6748 | perf_event_aux(perf_event_switch_output, | 6818 | perf_iterate_sb(perf_event_switch_output, |
| 6749 | &switch_event, | 6819 | &switch_event, |
| 6750 | NULL); | 6820 | NULL); |
| 6751 | } | 6821 | } |
| @@ -7352,7 +7422,7 @@ static struct pmu perf_swevent = { | |||
| 7352 | static int perf_tp_filter_match(struct perf_event *event, | 7422 | static int perf_tp_filter_match(struct perf_event *event, |
| 7353 | struct perf_sample_data *data) | 7423 | struct perf_sample_data *data) |
| 7354 | { | 7424 | { |
| 7355 | void *record = data->raw->data; | 7425 | void *record = data->raw->frag.data; |
| 7356 | 7426 | ||
| 7357 | /* only top level events have filters set */ | 7427 | /* only top level events have filters set */ |
| 7358 | if (event->parent) | 7428 | if (event->parent) |
| @@ -7408,8 +7478,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, | |||
| 7408 | struct perf_event *event; | 7478 | struct perf_event *event; |
| 7409 | 7479 | ||
| 7410 | struct perf_raw_record raw = { | 7480 | struct perf_raw_record raw = { |
| 7411 | .size = entry_size, | 7481 | .frag = { |
| 7412 | .data = record, | 7482 | .size = entry_size, |
| 7483 | .data = record, | ||
| 7484 | }, | ||
| 7413 | }; | 7485 | }; |
| 7414 | 7486 | ||
| 7415 | perf_sample_data_init(&data, 0, 0); | 7487 | perf_sample_data_init(&data, 0, 0); |
| @@ -7550,7 +7622,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) | |||
| 7550 | prog = event->tp_event->prog; | 7622 | prog = event->tp_event->prog; |
| 7551 | if (prog) { | 7623 | if (prog) { |
| 7552 | event->tp_event->prog = NULL; | 7624 | event->tp_event->prog = NULL; |
| 7553 | bpf_prog_put_rcu(prog); | 7625 | bpf_prog_put(prog); |
| 7554 | } | 7626 | } |
| 7555 | } | 7627 | } |
| 7556 | 7628 | ||
| @@ -8667,6 +8739,28 @@ unlock: | |||
| 8667 | return pmu; | 8739 | return pmu; |
| 8668 | } | 8740 | } |
| 8669 | 8741 | ||
| 8742 | static void attach_sb_event(struct perf_event *event) | ||
| 8743 | { | ||
| 8744 | struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); | ||
| 8745 | |||
| 8746 | raw_spin_lock(&pel->lock); | ||
| 8747 | list_add_rcu(&event->sb_list, &pel->list); | ||
| 8748 | raw_spin_unlock(&pel->lock); | ||
| 8749 | } | ||
| 8750 | |||
| 8751 | /* | ||
| 8752 | * We keep a list of all !task (and therefore per-cpu) events | ||
| 8753 | * that need to receive side-band records. | ||
| 8754 | * | ||
| 8755 | * This avoids having to scan all the various PMU per-cpu contexts | ||
| 8756 | * looking for them. | ||
| 8757 | */ | ||
| 8758 | static void account_pmu_sb_event(struct perf_event *event) | ||
| 8759 | { | ||
| 8760 | if (is_sb_event(event)) | ||
| 8761 | attach_sb_event(event); | ||
| 8762 | } | ||
| 8763 | |||
| 8670 | static void account_event_cpu(struct perf_event *event, int cpu) | 8764 | static void account_event_cpu(struct perf_event *event, int cpu) |
| 8671 | { | 8765 | { |
| 8672 | if (event->parent) | 8766 | if (event->parent) |
| @@ -8747,6 +8841,8 @@ static void account_event(struct perf_event *event) | |||
| 8747 | enabled: | 8841 | enabled: |
| 8748 | 8842 | ||
| 8749 | account_event_cpu(event, event->cpu); | 8843 | account_event_cpu(event, event->cpu); |
| 8844 | |||
| 8845 | account_pmu_sb_event(event); | ||
| 8750 | } | 8846 | } |
| 8751 | 8847 | ||
| 8752 | /* | 8848 | /* |
| @@ -8895,7 +8991,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 8895 | 8991 | ||
| 8896 | if (!event->parent) { | 8992 | if (!event->parent) { |
| 8897 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | 8993 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { |
| 8898 | err = get_callchain_buffers(); | 8994 | err = get_callchain_buffers(attr->sample_max_stack); |
| 8899 | if (err) | 8995 | if (err) |
| 8900 | goto err_addr_filters; | 8996 | goto err_addr_filters; |
| 8901 | } | 8997 | } |
| @@ -9217,6 +9313,9 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 9217 | return -EINVAL; | 9313 | return -EINVAL; |
| 9218 | } | 9314 | } |
| 9219 | 9315 | ||
| 9316 | if (!attr.sample_max_stack) | ||
| 9317 | attr.sample_max_stack = sysctl_perf_event_max_stack; | ||
| 9318 | |||
| 9220 | /* | 9319 | /* |
| 9221 | * In cgroup mode, the pid argument is used to pass the fd | 9320 | * In cgroup mode, the pid argument is used to pass the fd |
| 9222 | * opened to the cgroup directory in cgroupfs. The cpu argument | 9321 | * opened to the cgroup directory in cgroupfs. The cpu argument |
| @@ -9290,7 +9389,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 9290 | 9389 | ||
| 9291 | if (is_sampling_event(event)) { | 9390 | if (is_sampling_event(event)) { |
| 9292 | if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { | 9391 | if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { |
| 9293 | err = -ENOTSUPP; | 9392 | err = -EOPNOTSUPP; |
| 9294 | goto err_alloc; | 9393 | goto err_alloc; |
| 9295 | } | 9394 | } |
| 9296 | } | 9395 | } |
| @@ -10252,10 +10351,13 @@ static void __init perf_event_init_all_cpus(void) | |||
| 10252 | swhash = &per_cpu(swevent_htable, cpu); | 10351 | swhash = &per_cpu(swevent_htable, cpu); |
| 10253 | mutex_init(&swhash->hlist_mutex); | 10352 | mutex_init(&swhash->hlist_mutex); |
| 10254 | INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); | 10353 | INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); |
| 10354 | |||
| 10355 | INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); | ||
| 10356 | raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); | ||
| 10255 | } | 10357 | } |
| 10256 | } | 10358 | } |
| 10257 | 10359 | ||
| 10258 | static void perf_event_init_cpu(int cpu) | 10360 | int perf_event_init_cpu(unsigned int cpu) |
| 10259 | { | 10361 | { |
| 10260 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); | 10362 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
| 10261 | 10363 | ||
| @@ -10268,6 +10370,7 @@ static void perf_event_init_cpu(int cpu) | |||
| 10268 | rcu_assign_pointer(swhash->swevent_hlist, hlist); | 10370 | rcu_assign_pointer(swhash->swevent_hlist, hlist); |
| 10269 | } | 10371 | } |
| 10270 | mutex_unlock(&swhash->hlist_mutex); | 10372 | mutex_unlock(&swhash->hlist_mutex); |
| 10373 | return 0; | ||
| 10271 | } | 10374 | } |
| 10272 | 10375 | ||
| 10273 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE | 10376 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE |
| @@ -10299,14 +10402,17 @@ static void perf_event_exit_cpu_context(int cpu) | |||
| 10299 | } | 10402 | } |
| 10300 | srcu_read_unlock(&pmus_srcu, idx); | 10403 | srcu_read_unlock(&pmus_srcu, idx); |
| 10301 | } | 10404 | } |
| 10405 | #else | ||
| 10406 | |||
| 10407 | static void perf_event_exit_cpu_context(int cpu) { } | ||
| 10302 | 10408 | ||
| 10303 | static void perf_event_exit_cpu(int cpu) | 10409 | #endif |
| 10410 | |||
| 10411 | int perf_event_exit_cpu(unsigned int cpu) | ||
| 10304 | { | 10412 | { |
| 10305 | perf_event_exit_cpu_context(cpu); | 10413 | perf_event_exit_cpu_context(cpu); |
| 10414 | return 0; | ||
| 10306 | } | 10415 | } |
| 10307 | #else | ||
| 10308 | static inline void perf_event_exit_cpu(int cpu) { } | ||
| 10309 | #endif | ||
| 10310 | 10416 | ||
| 10311 | static int | 10417 | static int |
| 10312 | perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) | 10418 | perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) |
| @@ -10328,46 +10434,6 @@ static struct notifier_block perf_reboot_notifier = { | |||
| 10328 | .priority = INT_MIN, | 10434 | .priority = INT_MIN, |
| 10329 | }; | 10435 | }; |
| 10330 | 10436 | ||
| 10331 | static int | ||
| 10332 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | ||
| 10333 | { | ||
| 10334 | unsigned int cpu = (long)hcpu; | ||
| 10335 | |||
| 10336 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 10337 | |||
| 10338 | case CPU_UP_PREPARE: | ||
| 10339 | /* | ||
| 10340 | * This must be done before the CPU comes alive, because the | ||
| 10341 | * moment we can run tasks we can encounter (software) events. | ||
| 10342 | * | ||
| 10343 | * Specifically, someone can have inherited events on kthreadd | ||
| 10344 | * or a pre-existing worker thread that gets re-bound. | ||
| 10345 | */ | ||
| 10346 | perf_event_init_cpu(cpu); | ||
| 10347 | break; | ||
| 10348 | |||
| 10349 | case CPU_DOWN_PREPARE: | ||
| 10350 | /* | ||
| 10351 | * This must be done before the CPU dies because after that an | ||
| 10352 | * active event might want to IPI the CPU and that'll not work | ||
| 10353 | * so great for dead CPUs. | ||
| 10354 | * | ||
| 10355 | * XXX smp_call_function_single() return -ENXIO without a warn | ||
| 10356 | * so we could possibly deal with this. | ||
| 10357 | * | ||
| 10358 | * This is safe against new events arriving because | ||
| 10359 | * sys_perf_event_open() serializes against hotplug using | ||
| 10360 | * get_online_cpus(). | ||
| 10361 | */ | ||
| 10362 | perf_event_exit_cpu(cpu); | ||
| 10363 | break; | ||
| 10364 | default: | ||
| 10365 | break; | ||
| 10366 | } | ||
| 10367 | |||
| 10368 | return NOTIFY_OK; | ||
| 10369 | } | ||
| 10370 | |||
| 10371 | void __init perf_event_init(void) | 10437 | void __init perf_event_init(void) |
| 10372 | { | 10438 | { |
| 10373 | int ret; | 10439 | int ret; |
| @@ -10380,7 +10446,7 @@ void __init perf_event_init(void) | |||
| 10380 | perf_pmu_register(&perf_cpu_clock, NULL, -1); | 10446 | perf_pmu_register(&perf_cpu_clock, NULL, -1); |
| 10381 | perf_pmu_register(&perf_task_clock, NULL, -1); | 10447 | perf_pmu_register(&perf_task_clock, NULL, -1); |
| 10382 | perf_tp_register(); | 10448 | perf_tp_register(); |
| 10383 | perf_cpu_notifier(perf_cpu_notify); | 10449 | perf_event_init_cpu(smp_processor_id()); |
| 10384 | register_reboot_notifier(&perf_reboot_notifier); | 10450 | register_reboot_notifier(&perf_reboot_notifier); |
| 10385 | 10451 | ||
| 10386 | ret = init_hw_breakpoint(); | 10452 | ret = init_hw_breakpoint(); |
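Among the core.c changes, struct perf_raw_record grows a chain of perf_raw_frag fragments: perf_prepare_sample() sums the fragment sizes and stashes the u64-alignment padding in the last fragment, and perf_output_sample() walks the chain, using an optional per-fragment copy callback. A heavily hedged sketch of building a two-fragment record in the style of the perf_tp_event() hunk; the header/payload buffers are made up, and leaving the tail's next/pad union zeroed is assumed to be what perf_raw_frag_last() treats as end-of-chain (the header side of the change is not shown in this hunk):

#include <linux/perf_event.h>

/* Sketch only: emit one sample whose raw payload is split over two
 * fragments. Requires an event with PERF_SAMPLE_RAW in sample_type. */
static void example_output_two_frags(struct perf_event *event,
				     struct pt_regs *regs,
				     void *hdr, u32 hdr_len,
				     void *payload, u32 payload_len)
{
	struct perf_sample_data data;
	struct perf_raw_frag tail = {
		.data	= payload,
		.size	= payload_len,
		/* next/pad left zero: treated as the last fragment */
	};
	struct perf_raw_record raw = {
		.frag = {
			.next	= &tail,
			.data	= hdr,
			.size	= hdr_len,
			/* .copy left NULL: plain __output_copy() is used */
		},
	};

	perf_sample_data_init(&data, 0, 0);
	data.raw = &raw;
	perf_event_output(event, &data, regs);
}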
diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 05f9f6d626df..486fd78eb8d5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
| @@ -123,21 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb) | |||
| 123 | return rb->aux_nr_pages << PAGE_SHIFT; | 123 | return rb->aux_nr_pages << PAGE_SHIFT; |
| 124 | } | 124 | } |
| 125 | 125 | ||
| 126 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ | 126 | #define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \ |
| 127 | static inline unsigned long \ | ||
| 128 | func_name(struct perf_output_handle *handle, \ | ||
| 129 | const void *buf, unsigned long len) \ | ||
| 130 | { \ | 127 | { \ |
| 131 | unsigned long size, written; \ | 128 | unsigned long size, written; \ |
| 132 | \ | 129 | \ |
| 133 | do { \ | 130 | do { \ |
| 134 | size = min(handle->size, len); \ | 131 | size = min(handle->size, len); \ |
| 135 | written = memcpy_func(handle->addr, buf, size); \ | 132 | written = memcpy_func(__VA_ARGS__); \ |
| 136 | written = size - written; \ | 133 | written = size - written; \ |
| 137 | \ | 134 | \ |
| 138 | len -= written; \ | 135 | len -= written; \ |
| 139 | handle->addr += written; \ | 136 | handle->addr += written; \ |
| 140 | buf += written; \ | 137 | if (advance_buf) \ |
| 138 | buf += written; \ | ||
| 141 | handle->size -= written; \ | 139 | handle->size -= written; \ |
| 142 | if (!handle->size) { \ | 140 | if (!handle->size) { \ |
| 143 | struct ring_buffer *rb = handle->rb; \ | 141 | struct ring_buffer *rb = handle->rb; \ |
| @@ -152,6 +150,21 @@ func_name(struct perf_output_handle *handle, \ | |||
| 152 | return len; \ | 150 | return len; \ |
| 153 | } | 151 | } |
| 154 | 152 | ||
| 153 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ | ||
| 154 | static inline unsigned long \ | ||
| 155 | func_name(struct perf_output_handle *handle, \ | ||
| 156 | const void *buf, unsigned long len) \ | ||
| 157 | __DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size) | ||
| 158 | |||
| 159 | static inline unsigned long | ||
| 160 | __output_custom(struct perf_output_handle *handle, perf_copy_f copy_func, | ||
| 161 | const void *buf, unsigned long len) | ||
| 162 | { | ||
| 163 | unsigned long orig_len = len; | ||
| 164 | __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf, | ||
| 165 | orig_len - len, size) | ||
| 166 | } | ||
| 167 | |||
| 155 | static inline unsigned long | 168 | static inline unsigned long |
| 156 | memcpy_common(void *dst, const void *src, unsigned long n) | 169 | memcpy_common(void *dst, const void *src, unsigned long n) |
| 157 | { | 170 | { |
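The DEFINE_OUTPUT_COPY body is factored out so that __output_custom() can drive a caller-supplied copy callback (the fragment ->copy hook above) instead of advancing a flat source buffer. Judging from the call site, the callback receives the destination, an opaque source cookie, the running offset and the chunk size, and returns the number of bytes it could not copy; the exact perf_copy_f typedef is not part of this hunk, so the prototype below is inferred and the function itself is purely illustrative:

#include <linux/string.h>

/* Illustrative only: a copy callback of the shape
 *	copy_func(handle->addr, buf, orig_len - len, size)
 * used by __output_custom(). "src" is treated as an opaque cookie;
 * here the output is simply zero-filled. Returning 0 reports that
 * the whole chunk was written. */
static unsigned long example_copy_zero(void *dst, const void *src,
				       unsigned long off, unsigned long len)
{
	memset(dst, 0, len);
	return 0;
}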
diff --git a/kernel/exit.c b/kernel/exit.c index 9e6e1356e6bb..84ae830234f8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -211,6 +211,82 @@ repeat: | |||
| 211 | } | 211 | } |
| 212 | 212 | ||
| 213 | /* | 213 | /* |
| 214 | * Note that if this function returns a valid task_struct pointer (!NULL) | ||
| 215 | * task->usage must remain >0 for the duration of the RCU critical section. | ||
| 216 | */ | ||
| 217 | struct task_struct *task_rcu_dereference(struct task_struct **ptask) | ||
| 218 | { | ||
| 219 | struct sighand_struct *sighand; | ||
| 220 | struct task_struct *task; | ||
| 221 | |||
| 222 | /* | ||
| 223 | * We need to verify that release_task() was not called and thus | ||
| 224 | * delayed_put_task_struct() can't run and drop the last reference | ||
| 225 | * before rcu_read_unlock(). We check task->sighand != NULL, | ||
| 226 | * but we can read the already freed and reused memory. | ||
| 227 | */ | ||
| 228 | retry: | ||
| 229 | task = rcu_dereference(*ptask); | ||
| 230 | if (!task) | ||
| 231 | return NULL; | ||
| 232 | |||
| 233 | probe_kernel_address(&task->sighand, sighand); | ||
| 234 | |||
| 235 | /* | ||
| 236 | * Pairs with atomic_dec_and_test() in put_task_struct(). If this task | ||
| 237 | * was already freed we can not miss the preceding update of this | ||
| 238 | * pointer. | ||
| 239 | */ | ||
| 240 | smp_rmb(); | ||
| 241 | if (unlikely(task != READ_ONCE(*ptask))) | ||
| 242 | goto retry; | ||
| 243 | |||
| 244 | /* | ||
| 245 | * We've re-checked that "task == *ptask", now we have two different | ||
| 246 | * cases: | ||
| 247 | * | ||
| 248 | * 1. This is actually the same task/task_struct. In this case | ||
| 249 | * sighand != NULL tells us it is still alive. | ||
| 250 | * | ||
| 251 | * 2. This is another task which got the same memory for task_struct. | ||
| 252 | * We can't know this of course, and we can not trust | ||
| 253 | * sighand != NULL. | ||
| 254 | * | ||
| 255 | * In this case we actually return a random value, but this is | ||
| 256 | * correct. | ||
| 257 | * | ||
| 258 | * If we return NULL - we can pretend that we actually noticed that | ||
| 259 | * *ptask was updated when the previous task has exited. Or pretend | ||
| 260 | * that probe_kernel_address(&sighand) reads NULL. | ||
| 261 | * | ||
| 262 | * If we return the new task (because sighand is not NULL for any | ||
| 263 | * reason) - this is fine too. This (new) task can't go away before | ||
| 264 | * another gp pass. | ||
| 265 | * | ||
| 266 | * And note: we could even eliminate the false positive if we re-read | ||
| 267 | * task->sighand once again to avoid the false NULL. But this case | ||
| 268 | * is very unlikely so we don't care. | ||
| 269 | */ | ||
| 270 | if (!sighand) | ||
| 271 | return NULL; | ||
| 272 | |||
| 273 | return task; | ||
| 274 | } | ||
| 275 | |||
| 276 | struct task_struct *try_get_task_struct(struct task_struct **ptask) | ||
| 277 | { | ||
| 278 | struct task_struct *task; | ||
| 279 | |||
| 280 | rcu_read_lock(); | ||
| 281 | task = task_rcu_dereference(ptask); | ||
| 282 | if (task) | ||
| 283 | get_task_struct(task); | ||
| 284 | rcu_read_unlock(); | ||
| 285 | |||
| 286 | return task; | ||
| 287 | } | ||
| 288 | |||
| 289 | /* | ||
| 214 | * Determine if a process group is "orphaned", according to the POSIX | 290 | * Determine if a process group is "orphaned", according to the POSIX |
| 215 | * definition in 2.2.2.52. Orphaned process groups are not to be affected | 291 | * definition in 2.2.2.52. Orphaned process groups are not to be affected |
| 216 | * by terminal-generated stop signals. Newly orphaned process groups are | 292 | * by terminal-generated stop signals. Newly orphaned process groups are |
| @@ -700,10 +776,14 @@ void do_exit(long code) | |||
| 700 | 776 | ||
| 701 | exit_signals(tsk); /* sets PF_EXITING */ | 777 | exit_signals(tsk); /* sets PF_EXITING */ |
| 702 | /* | 778 | /* |
| 703 | * tsk->flags are checked in the futex code to protect against | 779 | * Ensure that all new tsk->pi_lock acquisitions must observe |
| 704 | * an exiting task cleaning up the robust pi futexes. | 780 | * PF_EXITING. Serializes against futex.c:attach_to_pi_owner(). |
| 705 | */ | 781 | */ |
| 706 | smp_mb(); | 782 | smp_mb(); |
| 783 | /* | ||
| 784 | * Ensure that we must observe the pi_state in exit_mm() -> | ||
| 785 | * mm_release() -> exit_pi_state_list(). | ||
| 786 | */ | ||
| 707 | raw_spin_unlock_wait(&tsk->pi_lock); | 787 | raw_spin_unlock_wait(&tsk->pi_lock); |
| 708 | 788 | ||
| 709 | if (unlikely(in_atomic())) { | 789 | if (unlikely(in_atomic())) { |
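The exit.c hunk adds task_rcu_dereference() and try_get_task_struct() for dereferencing a task pointer that another CPU may be concurrently clearing and freeing (the classic rwsem/mutex ->owner problem); the latter additionally pins the task with a reference. A minimal hedged usage sketch; the published pointer and the helper name are hypothetical:

#include <linux/sched.h>

/* Sketch: safely pin a task published through a shared pointer that
 * another CPU may be releasing concurrently. */
static void example_kick_published_task(struct task_struct **ptask)
{
	struct task_struct *p;

	p = try_get_task_struct(ptask);
	if (!p)
		return;		/* pointer was NULL or the task is gone */

	wake_up_process(p);	/* safe: we now hold our own reference */
	put_task_struct(p);
}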
diff --git a/kernel/fork.c b/kernel/fork.c index 4a7ec0c6c88c..52e725d4a866 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -162,23 +162,15 @@ void __weak arch_release_thread_stack(unsigned long *stack) | |||
| 162 | static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, | 162 | static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, |
| 163 | int node) | 163 | int node) |
| 164 | { | 164 | { |
| 165 | struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, | 165 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, |
| 166 | THREAD_SIZE_ORDER); | 166 | THREAD_SIZE_ORDER); |
| 167 | |||
| 168 | if (page) | ||
| 169 | memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, | ||
| 170 | 1 << THREAD_SIZE_ORDER); | ||
| 171 | 167 | ||
| 172 | return page ? page_address(page) : NULL; | 168 | return page ? page_address(page) : NULL; |
| 173 | } | 169 | } |
| 174 | 170 | ||
| 175 | static inline void free_thread_stack(unsigned long *stack) | 171 | static inline void free_thread_stack(unsigned long *stack) |
| 176 | { | 172 | { |
| 177 | struct page *page = virt_to_page(stack); | 173 | __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); |
| 178 | |||
| 179 | memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, | ||
| 180 | -(1 << THREAD_SIZE_ORDER)); | ||
| 181 | __free_kmem_pages(page, THREAD_SIZE_ORDER); | ||
| 182 | } | 174 | } |
| 183 | # else | 175 | # else |
| 184 | static struct kmem_cache *thread_stack_cache; | 176 | static struct kmem_cache *thread_stack_cache; |
| @@ -223,9 +215,15 @@ static struct kmem_cache *mm_cachep; | |||
| 223 | 215 | ||
| 224 | static void account_kernel_stack(unsigned long *stack, int account) | 216 | static void account_kernel_stack(unsigned long *stack, int account) |
| 225 | { | 217 | { |
| 226 | struct zone *zone = page_zone(virt_to_page(stack)); | 218 | /* All stack pages are in the same zone and belong to the same memcg. */ |
| 219 | struct page *first_page = virt_to_page(stack); | ||
| 220 | |||
| 221 | mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, | ||
| 222 | THREAD_SIZE / 1024 * account); | ||
| 227 | 223 | ||
| 228 | mod_zone_page_state(zone, NR_KERNEL_STACK, account); | 224 | memcg_kmem_update_page_stat( |
| 225 | first_page, MEMCG_KERNEL_STACK_KB, | ||
| 226 | account * (THREAD_SIZE / 1024)); | ||
| 229 | } | 227 | } |
| 230 | 228 | ||
| 231 | void free_task(struct task_struct *tsk) | 229 | void free_task(struct task_struct *tsk) |
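account_kernel_stack() now accounts kernel stacks in kilobytes (NR_KERNEL_STACK_KB and MEMCG_KERNEL_STACK_KB) rather than counting stacks, so each fork/exit moves the counters by THREAD_SIZE / 1024. A tiny worked example of the new delta, assuming 16 KiB stacks (the constant is an assumption; the real code uses THREAD_SIZE):

/* Worked example, not kernel code: with 16 KiB stacks each task
 * contributes +16 to NR_KERNEL_STACK_KB on allocation and -16 on
 * free, i.e. account * (THREAD_SIZE / 1024). */
#define EXAMPLE_THREAD_SIZE	(16 * 1024)

static long example_stack_kb_delta(int account)	/* +1 alloc, -1 free */
{
	return (long)account * (EXAMPLE_THREAD_SIZE / 1024);
}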
diff --git a/kernel/freezer.c b/kernel/freezer.c index a8900a3bc27a..6f56a9e219fa 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
| @@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p) | |||
| 42 | if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) | 42 | if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) |
| 43 | return false; | 43 | return false; |
| 44 | 44 | ||
| 45 | if (test_thread_flag(TIF_MEMDIE)) | 45 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) |
| 46 | return false; | 46 | return false; |
| 47 | 47 | ||
| 48 | if (pm_nosig_freezing || cgroup_freezing(p)) | 48 | if (pm_nosig_freezing || cgroup_freezing(p)) |
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2ee42e95a3ce..1d3ee3169202 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
| @@ -9,3 +9,4 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o | |||
| 9 | obj-$(CONFIG_PM_SLEEP) += pm.o | 9 | obj-$(CONFIG_PM_SLEEP) += pm.o |
| 10 | obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o | 10 | obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o |
| 11 | obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o | 11 | obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o |
| 12 | obj-$(CONFIG_SMP) += affinity.o | ||
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c new file mode 100644 index 000000000000..f68959341c0f --- /dev/null +++ b/kernel/irq/affinity.c | |||
| @@ -0,0 +1,61 @@ | |||
| 1 | |||
| 2 | #include <linux/interrupt.h> | ||
| 3 | #include <linux/kernel.h> | ||
| 4 | #include <linux/slab.h> | ||
| 5 | #include <linux/cpu.h> | ||
| 6 | |||
| 7 | static int get_first_sibling(unsigned int cpu) | ||
| 8 | { | ||
| 9 | unsigned int ret; | ||
| 10 | |||
| 11 | ret = cpumask_first(topology_sibling_cpumask(cpu)); | ||
| 12 | if (ret < nr_cpu_ids) | ||
| 13 | return ret; | ||
| 14 | return cpu; | ||
| 15 | } | ||
| 16 | |||
| 17 | /* | ||
| 18 | * Take a map of online CPUs and the number of available interrupt vectors | ||
| 19 | * and generate an output cpumask suitable for spreading MSI/MSI-X vectors | ||
| 20 | * so that they are distributed as well as possible around the CPUs. If | ||
| 21 | * more vectors than CPUs are available we'll map one to each CPU, | ||
| 22 | * otherwise we map one to the first sibling of each core. | ||
| 23 | * | ||
| 24 | * If there are more vectors than CPUs we will still only have one bit | ||
| 25 | * set per CPU, but interrupt code will keep on assigning the vectors from | ||
| 26 | * the start of the bitmap until we run out of vectors. | ||
| 27 | */ | ||
| 28 | struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) | ||
| 29 | { | ||
| 30 | struct cpumask *affinity_mask; | ||
| 31 | unsigned int max_vecs = *nr_vecs; | ||
| 32 | |||
| 33 | if (max_vecs == 1) | ||
| 34 | return NULL; | ||
| 35 | |||
| 36 | affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); | ||
| 37 | if (!affinity_mask) { | ||
| 38 | *nr_vecs = 1; | ||
| 39 | return NULL; | ||
| 40 | } | ||
| 41 | |||
| 42 | if (max_vecs >= num_online_cpus()) { | ||
| 43 | cpumask_copy(affinity_mask, cpu_online_mask); | ||
| 44 | *nr_vecs = num_online_cpus(); | ||
| 45 | } else { | ||
| 46 | unsigned int vecs = 0, cpu; | ||
| 47 | |||
| 48 | for_each_online_cpu(cpu) { | ||
| 49 | if (cpu == get_first_sibling(cpu)) { | ||
| 50 | cpumask_set_cpu(cpu, affinity_mask); | ||
| 51 | vecs++; | ||
| 52 | } | ||
| 53 | |||
| 54 | if (--max_vecs == 0) | ||
| 55 | break; | ||
| 56 | } | ||
| 57 | *nr_vecs = vecs; | ||
| 58 | } | ||
| 59 | |||
| 60 | return affinity_mask; | ||
| 61 | } | ||
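irq_create_affinity_mask() returns a spread mask for MSI/MSI-X vector allocation and may shrink the requested vector count (forcing it to 1 if the allocation fails). A hedged sketch of a driver-side caller; the function name and what is done with the mask are illustrative, since the real consumers hand it to the PCI/MSI core:

#include <linux/interrupt.h>
#include <linux/slab.h>

/* Sketch: ask for up to 8 vectors and let the core spread them over
 * the online CPUs (one bit per first sibling when CPUs outnumber
 * vectors). A NULL return is valid: either nr_vecs == 1 or the
 * allocation failed, in which case nr_vecs is forced back to 1. */
static unsigned int example_pick_vectors(void)
{
	unsigned int nr_vecs = 8;
	struct cpumask *affinity;

	affinity = irq_create_affinity_mask(&nr_vecs);

	/* ... pass "affinity" and "nr_vecs" to the MSI allocation path ... */

	kfree(affinity);	/* kfree(NULL) is a no-op */
	return nr_vecs;
}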
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2f9f2b0e79f2..b4c1bc7c9ca2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -426,6 +426,49 @@ out_unlock: | |||
| 426 | } | 426 | } |
| 427 | EXPORT_SYMBOL_GPL(handle_simple_irq); | 427 | EXPORT_SYMBOL_GPL(handle_simple_irq); |
| 428 | 428 | ||
| 429 | /** | ||
| 430 | * handle_untracked_irq - Simple and software-decoded IRQs. | ||
| 431 | * @desc: the interrupt description structure for this irq | ||
| 432 | * | ||
| 433 | * Untracked interrupts are sent from a demultiplexing interrupt | ||
| 434 | * handler when the demultiplexer does not know which device in its | ||
| 435 | * multiplexed irq domain generated the interrupt. IRQs handled | ||
| 436 | * through here are not subjected to stats tracking, randomness, or | ||
| 437 | * spurious interrupt detection. | ||
| 438 | * | ||
| 439 | * Note: Like handle_simple_irq, the caller is expected to handle | ||
| 440 | * the ack, clear, mask and unmask issues if necessary. | ||
| 441 | */ | ||
| 442 | void handle_untracked_irq(struct irq_desc *desc) | ||
| 443 | { | ||
| 444 | unsigned int flags = 0; | ||
| 445 | |||
| 446 | raw_spin_lock(&desc->lock); | ||
| 447 | |||
| 448 | if (!irq_may_run(desc)) | ||
| 449 | goto out_unlock; | ||
| 450 | |||
| 451 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | ||
| 452 | |||
| 453 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { | ||
| 454 | desc->istate |= IRQS_PENDING; | ||
| 455 | goto out_unlock; | ||
| 456 | } | ||
| 457 | |||
| 458 | desc->istate &= ~IRQS_PENDING; | ||
| 459 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); | ||
| 460 | raw_spin_unlock(&desc->lock); | ||
| 461 | |||
| 462 | __handle_irq_event_percpu(desc, &flags); | ||
| 463 | |||
| 464 | raw_spin_lock(&desc->lock); | ||
| 465 | irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); | ||
| 466 | |||
| 467 | out_unlock: | ||
| 468 | raw_spin_unlock(&desc->lock); | ||
| 469 | } | ||
| 470 | EXPORT_SYMBOL_GPL(handle_untracked_irq); | ||
| 471 | |||
| 429 | /* | 472 | /* |
| 430 | * Called unconditionally from handle_level_irq() and only for oneshot | 473 | * Called unconditionally from handle_level_irq() and only for oneshot |
| 431 | * interrupts from handle_fasteoi_irq() | 474 | * interrupts from handle_fasteoi_irq() |
| @@ -1093,3 +1136,43 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) | |||
| 1093 | 1136 | ||
| 1094 | return 0; | 1137 | return 0; |
| 1095 | } | 1138 | } |
| 1139 | |||
| 1140 | /** | ||
| 1141 | * irq_chip_pm_get - Enable power for an IRQ chip | ||
| 1142 | * @data: Pointer to interrupt specific data | ||
| 1143 | * | ||
| 1144 | * Enable the power to the IRQ chip referenced by the interrupt data | ||
| 1145 | * structure. | ||
| 1146 | */ | ||
| 1147 | int irq_chip_pm_get(struct irq_data *data) | ||
| 1148 | { | ||
| 1149 | int retval; | ||
| 1150 | |||
| 1151 | if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) { | ||
| 1152 | retval = pm_runtime_get_sync(data->chip->parent_device); | ||
| 1153 | if (retval < 0) { | ||
| 1154 | pm_runtime_put_noidle(data->chip->parent_device); | ||
| 1155 | return retval; | ||
| 1156 | } | ||
| 1157 | } | ||
| 1158 | |||
| 1159 | return 0; | ||
| 1160 | } | ||
| 1161 | |||
| 1162 | /** | ||
| 1163 | * irq_chip_pm_put - Disable power for an IRQ chip | ||
| 1164 | * @data: Pointer to interrupt specific data | ||
| 1165 | * | ||
| 1166 | * Disable the power to the IRQ chip referenced by the interrupt data | ||
| 1167 | * structure. Note that power will only be disabled once this | ||
| 1168 | * function has been called for all IRQs that have called irq_chip_pm_get(). | ||
| 1169 | */ | ||
| 1170 | int irq_chip_pm_put(struct irq_data *data) | ||
| 1171 | { | ||
| 1172 | int retval = 0; | ||
| 1173 | |||
| 1174 | if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) | ||
| 1175 | retval = pm_runtime_put(data->chip->parent_device); | ||
| 1176 | |||
| 1177 | return (retval < 0) ? retval : 0; | ||
| 1178 | } | ||
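These helpers assume the irqchip driver has populated irq_chip::parent_device (added alongside them) and enabled runtime PM on that device; the IRQ core then powers the chip up around request/setup and back down on free. A hedged sketch of the driver side, with the my_irqchip_* names purely illustrative:

#include <linux/irq.h>
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>

static struct irq_chip my_irqchip = {   /* hypothetical chip */
        .name = "my-irqchip",
};

static int my_irqchip_probe(struct platform_device *pdev)
{
        pm_runtime_enable(&pdev->dev);
        /* Point the core at the device that must be runtime-resumed before
         * the chip is touched; request_irq()/setup_irq() then bracket the
         * registration with irq_chip_pm_get(), and the free paths call
         * irq_chip_pm_put(). */
        my_irqchip.parent_device = &pdev->dev;
        return 0;
}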
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a15b5485b446..d3f24905852c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
| @@ -132,10 +132,10 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) | |||
| 132 | wake_up_process(action->thread); | 132 | wake_up_process(action->thread); |
| 133 | } | 133 | } |
| 134 | 134 | ||
| 135 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) | 135 | irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags) |
| 136 | { | 136 | { |
| 137 | irqreturn_t retval = IRQ_NONE; | 137 | irqreturn_t retval = IRQ_NONE; |
| 138 | unsigned int flags = 0, irq = desc->irq_data.irq; | 138 | unsigned int irq = desc->irq_data.irq; |
| 139 | struct irqaction *action; | 139 | struct irqaction *action; |
| 140 | 140 | ||
| 141 | for_each_action_of_desc(desc, action) { | 141 | for_each_action_of_desc(desc, action) { |
| @@ -164,7 +164,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) | |||
| 164 | 164 | ||
| 165 | /* Fall through to add to randomness */ | 165 | /* Fall through to add to randomness */ |
| 166 | case IRQ_HANDLED: | 166 | case IRQ_HANDLED: |
| 167 | flags |= action->flags; | 167 | *flags |= action->flags; |
| 168 | break; | 168 | break; |
| 169 | 169 | ||
| 170 | default: | 170 | default: |
| @@ -174,7 +174,17 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) | |||
| 174 | retval |= res; | 174 | retval |= res; |
| 175 | } | 175 | } |
| 176 | 176 | ||
| 177 | add_interrupt_randomness(irq, flags); | 177 | return retval; |
| 178 | } | ||
| 179 | |||
| 180 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) | ||
| 181 | { | ||
| 182 | irqreturn_t retval; | ||
| 183 | unsigned int flags = 0; | ||
| 184 | |||
| 185 | retval = __handle_irq_event_percpu(desc, &flags); | ||
| 186 | |||
| 187 | add_interrupt_randomness(desc->irq_data.irq, flags); | ||
| 178 | 188 | ||
| 179 | if (!noirqdebug) | 189 | if (!noirqdebug) |
| 180 | note_interrupt(desc, retval); | 190 | note_interrupt(desc, retval); |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 09be2c903c6d..bc226e783bd2 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
| @@ -7,6 +7,7 @@ | |||
| 7 | */ | 7 | */ |
| 8 | #include <linux/irqdesc.h> | 8 | #include <linux/irqdesc.h> |
| 9 | #include <linux/kernel_stat.h> | 9 | #include <linux/kernel_stat.h> |
| 10 | #include <linux/pm_runtime.h> | ||
| 10 | 11 | ||
| 11 | #ifdef CONFIG_SPARSE_IRQ | 12 | #ifdef CONFIG_SPARSE_IRQ |
| 12 | # define IRQ_BITMAP_BITS (NR_IRQS + 8196) | 13 | # define IRQ_BITMAP_BITS (NR_IRQS + 8196) |
| @@ -83,6 +84,7 @@ extern void irq_mark_irq(unsigned int irq); | |||
| 83 | 84 | ||
| 84 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); | 85 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); |
| 85 | 86 | ||
| 87 | irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); | ||
| 86 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); | 88 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); |
| 87 | irqreturn_t handle_irq_event(struct irq_desc *desc); | 89 | irqreturn_t handle_irq_event(struct irq_desc *desc); |
| 88 | 90 | ||
| @@ -105,6 +107,8 @@ static inline void unregister_handler_proc(unsigned int irq, | |||
| 105 | struct irqaction *action) { } | 107 | struct irqaction *action) { } |
| 106 | #endif | 108 | #endif |
| 107 | 109 | ||
| 110 | extern bool irq_can_set_affinity_usr(unsigned int irq); | ||
| 111 | |||
| 108 | extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); | 112 | extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); |
| 109 | 113 | ||
| 110 | extern void irq_set_thread_affinity(struct irq_desc *desc); | 114 | extern void irq_set_thread_affinity(struct irq_desc *desc); |
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 89b49f6773f0..1a9abc1c8ea0 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c | |||
| @@ -76,14 +76,14 @@ int irq_reserve_ipi(struct irq_domain *domain, | |||
| 76 | } | 76 | } |
| 77 | } | 77 | } |
| 78 | 78 | ||
| 79 | virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); | 79 | virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL); |
| 80 | if (virq <= 0) { | 80 | if (virq <= 0) { |
| 81 | pr_warn("Can't reserve IPI, failed to alloc descs\n"); | 81 | pr_warn("Can't reserve IPI, failed to alloc descs\n"); |
| 82 | return -ENOMEM; | 82 | return -ENOMEM; |
| 83 | } | 83 | } |
| 84 | 84 | ||
| 85 | virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, | 85 | virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, |
| 86 | (void *) dest, true); | 86 | (void *) dest, true, NULL); |
| 87 | 87 | ||
| 88 | if (virq <= 0) { | 88 | if (virq <= 0) { |
| 89 | pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); | 89 | pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8731e1c5d1e7..a623b44f2d4b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
| @@ -68,9 +68,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) | |||
| 68 | return 0; | 68 | return 0; |
| 69 | } | 69 | } |
| 70 | 70 | ||
| 71 | static void desc_smp_init(struct irq_desc *desc, int node) | 71 | static void desc_smp_init(struct irq_desc *desc, int node, |
| 72 | const struct cpumask *affinity) | ||
| 72 | { | 73 | { |
| 73 | cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity); | 74 | if (!affinity) |
| 75 | affinity = irq_default_affinity; | ||
| 76 | cpumask_copy(desc->irq_common_data.affinity, affinity); | ||
| 77 | |||
| 74 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 78 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
| 75 | cpumask_clear(desc->pending_mask); | 79 | cpumask_clear(desc->pending_mask); |
| 76 | #endif | 80 | #endif |
| @@ -82,11 +86,12 @@ static void desc_smp_init(struct irq_desc *desc, int node) | |||
| 82 | #else | 86 | #else |
| 83 | static inline int | 87 | static inline int |
| 84 | alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } | 88 | alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } |
| 85 | static inline void desc_smp_init(struct irq_desc *desc, int node) { } | 89 | static inline void |
| 90 | desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } | ||
| 86 | #endif | 91 | #endif |
| 87 | 92 | ||
| 88 | static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, | 93 | static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, |
| 89 | struct module *owner) | 94 | const struct cpumask *affinity, struct module *owner) |
| 90 | { | 95 | { |
| 91 | int cpu; | 96 | int cpu; |
| 92 | 97 | ||
| @@ -107,7 +112,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, | |||
| 107 | desc->owner = owner; | 112 | desc->owner = owner; |
| 108 | for_each_possible_cpu(cpu) | 113 | for_each_possible_cpu(cpu) |
| 109 | *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; | 114 | *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; |
| 110 | desc_smp_init(desc, node); | 115 | desc_smp_init(desc, node, affinity); |
| 111 | } | 116 | } |
| 112 | 117 | ||
| 113 | int nr_irqs = NR_IRQS; | 118 | int nr_irqs = NR_IRQS; |
| @@ -158,7 +163,9 @@ void irq_unlock_sparse(void) | |||
| 158 | mutex_unlock(&sparse_irq_lock); | 163 | mutex_unlock(&sparse_irq_lock); |
| 159 | } | 164 | } |
| 160 | 165 | ||
| 161 | static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) | 166 | static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, |
| 167 | const struct cpumask *affinity, | ||
| 168 | struct module *owner) | ||
| 162 | { | 169 | { |
| 163 | struct irq_desc *desc; | 170 | struct irq_desc *desc; |
| 164 | gfp_t gfp = GFP_KERNEL; | 171 | gfp_t gfp = GFP_KERNEL; |
| @@ -178,7 +185,8 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) | |||
| 178 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | 185 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); |
| 179 | init_rcu_head(&desc->rcu); | 186 | init_rcu_head(&desc->rcu); |
| 180 | 187 | ||
| 181 | desc_set_defaults(irq, desc, node, owner); | 188 | desc_set_defaults(irq, desc, node, affinity, owner); |
| 189 | irqd_set(&desc->irq_data, flags); | ||
| 182 | 190 | ||
| 183 | return desc; | 191 | return desc; |
| 184 | 192 | ||
| @@ -223,13 +231,32 @@ static void free_desc(unsigned int irq) | |||
| 223 | } | 231 | } |
| 224 | 232 | ||
| 225 | static int alloc_descs(unsigned int start, unsigned int cnt, int node, | 233 | static int alloc_descs(unsigned int start, unsigned int cnt, int node, |
| 226 | struct module *owner) | 234 | const struct cpumask *affinity, struct module *owner) |
| 227 | { | 235 | { |
| 236 | const struct cpumask *mask = NULL; | ||
| 228 | struct irq_desc *desc; | 237 | struct irq_desc *desc; |
| 229 | int i; | 238 | unsigned int flags; |
| 239 | int i, cpu = -1; | ||
| 240 | |||
| 241 | if (affinity && cpumask_empty(affinity)) | ||
| 242 | return -EINVAL; | ||
| 243 | |||
| 244 | flags = affinity ? IRQD_AFFINITY_MANAGED : 0; | ||
| 230 | 245 | ||
| 231 | for (i = 0; i < cnt; i++) { | 246 | for (i = 0; i < cnt; i++) { |
| 232 | desc = alloc_desc(start + i, node, owner); | 247 | if (affinity) { |
| 248 | cpu = cpumask_next(cpu, affinity); | ||
| 249 | if (cpu >= nr_cpu_ids) | ||
| 250 | cpu = cpumask_first(affinity); | ||
| 251 | node = cpu_to_node(cpu); | ||
| 252 | |||
| 253 | /* | ||
| 254 | * For single allocations we use the caller provided | ||
| 255 | * mask, otherwise we use the mask of the target cpu | ||
| 256 | */ | ||
| 257 | mask = cnt == 1 ? affinity : cpumask_of(cpu); | ||
| 258 | } | ||
| 259 | desc = alloc_desc(start + i, node, flags, mask, owner); | ||
| 233 | if (!desc) | 260 | if (!desc) |
| 234 | goto err; | 261 | goto err; |
| 235 | mutex_lock(&sparse_irq_lock); | 262 | mutex_lock(&sparse_irq_lock); |
| @@ -277,7 +304,7 @@ int __init early_irq_init(void) | |||
| 277 | nr_irqs = initcnt; | 304 | nr_irqs = initcnt; |
| 278 | 305 | ||
| 279 | for (i = 0; i < initcnt; i++) { | 306 | for (i = 0; i < initcnt; i++) { |
| 280 | desc = alloc_desc(i, node, NULL); | 307 | desc = alloc_desc(i, node, 0, NULL, NULL); |
| 281 | set_bit(i, allocated_irqs); | 308 | set_bit(i, allocated_irqs); |
| 282 | irq_insert_desc(i, desc); | 309 | irq_insert_desc(i, desc); |
| 283 | } | 310 | } |
| @@ -311,7 +338,7 @@ int __init early_irq_init(void) | |||
| 311 | alloc_masks(&desc[i], GFP_KERNEL, node); | 338 | alloc_masks(&desc[i], GFP_KERNEL, node); |
| 312 | raw_spin_lock_init(&desc[i].lock); | 339 | raw_spin_lock_init(&desc[i].lock); |
| 313 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 340 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
| 314 | desc_set_defaults(i, &desc[i], node, NULL); | 341 | desc_set_defaults(i, &desc[i], node, NULL, NULL); |
| 315 | } | 342 | } |
| 316 | return arch_early_irq_init(); | 343 | return arch_early_irq_init(); |
| 317 | } | 344 | } |
| @@ -328,11 +355,12 @@ static void free_desc(unsigned int irq) | |||
| 328 | unsigned long flags; | 355 | unsigned long flags; |
| 329 | 356 | ||
| 330 | raw_spin_lock_irqsave(&desc->lock, flags); | 357 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 331 | desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL); | 358 | desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); |
| 332 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 359 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 333 | } | 360 | } |
| 334 | 361 | ||
| 335 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, | 362 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, |
| 363 | const struct cpumask *affinity, | ||
| 336 | struct module *owner) | 364 | struct module *owner) |
| 337 | { | 365 | { |
| 338 | u32 i; | 366 | u32 i; |
| @@ -453,12 +481,15 @@ EXPORT_SYMBOL_GPL(irq_free_descs); | |||
| 453 | * @cnt: Number of consecutive irqs to allocate. | 481 | * @cnt: Number of consecutive irqs to allocate. |
| 454 | * @node: Preferred node on which the irq descriptor should be allocated | 482 | * @node: Preferred node on which the irq descriptor should be allocated |
| 455 | * @owner: Owning module (can be NULL) | 483 | * @owner: Owning module (can be NULL) |
| 484 | * @affinity: Optional pointer to an affinity mask which hints where the | ||
| 485 | * irq descriptors should be allocated and which default | ||
| 486 | * affinities to use | ||
| 456 | * | 487 | * |
| 457 | * Returns the first irq number or error code | 488 | * Returns the first irq number or error code |
| 458 | */ | 489 | */ |
| 459 | int __ref | 490 | int __ref |
| 460 | __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, | 491 | __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, |
| 461 | struct module *owner) | 492 | struct module *owner, const struct cpumask *affinity) |
| 462 | { | 493 | { |
| 463 | int start, ret; | 494 | int start, ret; |
| 464 | 495 | ||
| @@ -494,7 +525,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, | |||
| 494 | 525 | ||
| 495 | bitmap_set(allocated_irqs, start, cnt); | 526 | bitmap_set(allocated_irqs, start, cnt); |
| 496 | mutex_unlock(&sparse_irq_lock); | 527 | mutex_unlock(&sparse_irq_lock); |
| 497 | return alloc_descs(start, cnt, node, owner); | 528 | return alloc_descs(start, cnt, node, affinity, owner); |
| 498 | 529 | ||
| 499 | err: | 530 | err: |
| 500 | mutex_unlock(&sparse_irq_lock); | 531 | mutex_unlock(&sparse_irq_lock); |
| @@ -512,7 +543,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs); | |||
| 512 | */ | 543 | */ |
| 513 | unsigned int irq_alloc_hwirqs(int cnt, int node) | 544 | unsigned int irq_alloc_hwirqs(int cnt, int node) |
| 514 | { | 545 | { |
| 515 | int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); | 546 | int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL); |
| 516 | 547 | ||
| 517 | if (irq < 0) | 548 | if (irq < 0) |
| 518 | return 0; | 549 | return 0; |
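With the new affinity argument, a multi-vector allocation can spread its descriptors over the CPUs in the mask, place each descriptor on the node of its target CPU, and have them marked IRQD_AFFINITY_MANAGED. A sketch under those assumptions (the helper name is hypothetical; real multiqueue users reach this through the MSI path further down):

#include <linux/cpumask.h>
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/numa.h>

/* Hypothetical helper: one descriptor per online CPU, NUMA-local to the
 * CPU that will service it, and flagged IRQD_AFFINITY_MANAGED so writes
 * to /proc/irq/<n>/smp_affinity are rejected. */
static int alloc_queue_irqs(unsigned int nr_queues)
{
        return __irq_alloc_descs(-1, 0, nr_queues, NUMA_NO_NODE,
                                 THIS_MODULE, cpu_online_mask);
}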
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8798b6c9e945..4752b43662e0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
| @@ -481,7 +481,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
| 481 | } | 481 | } |
| 482 | 482 | ||
| 483 | /* Allocate a virtual interrupt number */ | 483 | /* Allocate a virtual interrupt number */ |
| 484 | virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node)); | 484 | virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL); |
| 485 | if (virq <= 0) { | 485 | if (virq <= 0) { |
| 486 | pr_debug("-> virq allocation failed\n"); | 486 | pr_debug("-> virq allocation failed\n"); |
| 487 | return 0; | 487 | return 0; |
| @@ -567,6 +567,7 @@ static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data, | |||
| 567 | unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) | 567 | unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) |
| 568 | { | 568 | { |
| 569 | struct irq_domain *domain; | 569 | struct irq_domain *domain; |
| 570 | struct irq_data *irq_data; | ||
| 570 | irq_hw_number_t hwirq; | 571 | irq_hw_number_t hwirq; |
| 571 | unsigned int type = IRQ_TYPE_NONE; | 572 | unsigned int type = IRQ_TYPE_NONE; |
| 572 | int virq; | 573 | int virq; |
| @@ -588,15 +589,46 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) | |||
| 588 | if (irq_domain_translate(domain, fwspec, &hwirq, &type)) | 589 | if (irq_domain_translate(domain, fwspec, &hwirq, &type)) |
| 589 | return 0; | 590 | return 0; |
| 590 | 591 | ||
| 591 | if (irq_domain_is_hierarchy(domain)) { | 592 | /* |
| 593 | * WARN if the irqchip returns a type with bits | ||
| 594 | * outside the sense mask set and clear these bits. | ||
| 595 | */ | ||
| 596 | if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK)) | ||
| 597 | type &= IRQ_TYPE_SENSE_MASK; | ||
| 598 | |||
| 599 | /* | ||
| 600 | * If we've already configured this interrupt, | ||
| 601 | * don't do it again, or hell will break loose. | ||
| 602 | */ | ||
| 603 | virq = irq_find_mapping(domain, hwirq); | ||
| 604 | if (virq) { | ||
| 605 | /* | ||
| 606 | * If the trigger type is not specified or matches the | ||
| 607 | * current trigger type then we are done so return the | ||
| 608 | * interrupt number. | ||
| 609 | */ | ||
| 610 | if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) | ||
| 611 | return virq; | ||
| 612 | |||
| 592 | /* | 613 | /* |
| 593 | * If we've already configured this interrupt, | 614 | * If the trigger type has not been set yet, then set |
| 594 | * don't do it again, or hell will break loose. | 615 | * it now and return the interrupt number. |
| 595 | */ | 616 | */ |
| 596 | virq = irq_find_mapping(domain, hwirq); | 617 | if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) { |
| 597 | if (virq) | 618 | irq_data = irq_get_irq_data(virq); |
| 619 | if (!irq_data) | ||
| 620 | return 0; | ||
| 621 | |||
| 622 | irqd_set_trigger_type(irq_data, type); | ||
| 598 | return virq; | 623 | return virq; |
| 624 | } | ||
| 599 | 625 | ||
| 626 | pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n", | ||
| 627 | hwirq, of_node_full_name(to_of_node(fwspec->fwnode))); | ||
| 628 | return 0; | ||
| 629 | } | ||
| 630 | |||
| 631 | if (irq_domain_is_hierarchy(domain)) { | ||
| 600 | virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); | 632 | virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); |
| 601 | if (virq <= 0) | 633 | if (virq <= 0) |
| 602 | return 0; | 634 | return 0; |
| @@ -607,10 +639,18 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) | |||
| 607 | return virq; | 639 | return virq; |
| 608 | } | 640 | } |
| 609 | 641 | ||
| 610 | /* Set type if specified and different than the current one */ | 642 | irq_data = irq_get_irq_data(virq); |
| 611 | if (type != IRQ_TYPE_NONE && | 643 | if (!irq_data) { |
| 612 | type != irq_get_trigger_type(virq)) | 644 | if (irq_domain_is_hierarchy(domain)) |
| 613 | irq_set_irq_type(virq, type); | 645 | irq_domain_free_irqs(virq, 1); |
| 646 | else | ||
| 647 | irq_dispose_mapping(virq); | ||
| 648 | return 0; | ||
| 649 | } | ||
| 650 | |||
| 651 | /* Store trigger type */ | ||
| 652 | irqd_set_trigger_type(irq_data, type); | ||
| 653 | |||
| 614 | return virq; | 654 | return virq; |
| 615 | } | 655 | } |
| 616 | EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); | 656 | EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); |
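For callers, the visible change is that remapping an already-mapped hwirq with a conflicting trigger type now fails instead of silently reprogramming it. A hedged sketch of a fwspec-based mapping; the two-cell (hwirq, trigger) layout is illustrative and depends entirely on the controller's translate callback:

#include <linux/irq.h>
#include <linux/irqdomain.h>

/* Hypothetical caller: map hwirq 42 as level-high through a firmware node. */
static unsigned int map_my_irq(struct fwnode_handle *fwnode)
{
        struct irq_fwspec fwspec = {
                .fwnode      = fwnode,
                .param_count = 2,
                .param       = { 42, IRQ_TYPE_LEVEL_HIGH },
        };

        /* Returns the existing virq when hwirq 42 is already mapped with a
         * matching (or unset) trigger type, and 0 on a type mismatch. */
        return irq_create_fwspec_mapping(&fwspec);
}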
| @@ -640,8 +680,12 @@ void irq_dispose_mapping(unsigned int virq) | |||
| 640 | if (WARN_ON(domain == NULL)) | 680 | if (WARN_ON(domain == NULL)) |
| 641 | return; | 681 | return; |
| 642 | 682 | ||
| 643 | irq_domain_disassociate(domain, virq); | 683 | if (irq_domain_is_hierarchy(domain)) { |
| 644 | irq_free_desc(virq); | 684 | irq_domain_free_irqs(virq, 1); |
| 685 | } else { | ||
| 686 | irq_domain_disassociate(domain, virq); | ||
| 687 | irq_free_desc(virq); | ||
| 688 | } | ||
| 645 | } | 689 | } |
| 646 | EXPORT_SYMBOL_GPL(irq_dispose_mapping); | 690 | EXPORT_SYMBOL_GPL(irq_dispose_mapping); |
| 647 | 691 | ||
| @@ -835,19 +879,23 @@ const struct irq_domain_ops irq_domain_simple_ops = { | |||
| 835 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | 879 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); |
| 836 | 880 | ||
| 837 | int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, | 881 | int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, |
| 838 | int node) | 882 | int node, const struct cpumask *affinity) |
| 839 | { | 883 | { |
| 840 | unsigned int hint; | 884 | unsigned int hint; |
| 841 | 885 | ||
| 842 | if (virq >= 0) { | 886 | if (virq >= 0) { |
| 843 | virq = irq_alloc_descs(virq, virq, cnt, node); | 887 | virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE, |
| 888 | affinity); | ||
| 844 | } else { | 889 | } else { |
| 845 | hint = hwirq % nr_irqs; | 890 | hint = hwirq % nr_irqs; |
| 846 | if (hint == 0) | 891 | if (hint == 0) |
| 847 | hint++; | 892 | hint++; |
| 848 | virq = irq_alloc_descs_from(hint, cnt, node); | 893 | virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE, |
| 849 | if (virq <= 0 && hint > 1) | 894 | affinity); |
| 850 | virq = irq_alloc_descs_from(1, cnt, node); | 895 | if (virq <= 0 && hint > 1) { |
| 896 | virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE, | ||
| 897 | affinity); | ||
| 898 | } | ||
| 851 | } | 899 | } |
| 852 | 900 | ||
| 853 | return virq; | 901 | return virq; |
| @@ -1144,8 +1192,10 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, | |||
| 1144 | if (recursive) | 1192 | if (recursive) |
| 1145 | ret = irq_domain_alloc_irqs_recursive(parent, irq_base, | 1193 | ret = irq_domain_alloc_irqs_recursive(parent, irq_base, |
| 1146 | nr_irqs, arg); | 1194 | nr_irqs, arg); |
| 1147 | if (ret >= 0) | 1195 | if (ret < 0) |
| 1148 | ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); | 1196 | return ret; |
| 1197 | |||
| 1198 | ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); | ||
| 1149 | if (ret < 0 && recursive) | 1199 | if (ret < 0 && recursive) |
| 1150 | irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); | 1200 | irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); |
| 1151 | 1201 | ||
| @@ -1160,6 +1210,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, | |||
| 1160 | * @node: NUMA node id for memory allocation | 1210 | * @node: NUMA node id for memory allocation |
| 1161 | * @arg: domain specific argument | 1211 | * @arg: domain specific argument |
| 1162 | * @realloc: IRQ descriptors have already been allocated if true | 1212 | * @realloc: IRQ descriptors have already been allocated if true |
| 1213 | * @affinity: Optional irq affinity mask for multiqueue devices | ||
| 1163 | * | 1214 | * |
| 1164 | * Allocate IRQ numbers and initialized all data structures to support | 1215 | * Allocate IRQ numbers and initialized all data structures to support |
| 1165 | * hierarchy IRQ domains. | 1216 | * hierarchy IRQ domains. |
| @@ -1175,7 +1226,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, | |||
| 1175 | */ | 1226 | */ |
| 1176 | int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, | 1227 | int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, |
| 1177 | unsigned int nr_irqs, int node, void *arg, | 1228 | unsigned int nr_irqs, int node, void *arg, |
| 1178 | bool realloc) | 1229 | bool realloc, const struct cpumask *affinity) |
| 1179 | { | 1230 | { |
| 1180 | int i, ret, virq; | 1231 | int i, ret, virq; |
| 1181 | 1232 | ||
| @@ -1193,7 +1244,8 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, | |||
| 1193 | if (realloc && irq_base >= 0) { | 1244 | if (realloc && irq_base >= 0) { |
| 1194 | virq = irq_base; | 1245 | virq = irq_base; |
| 1195 | } else { | 1246 | } else { |
| 1196 | virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); | 1247 | virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node, |
| 1248 | affinity); | ||
| 1197 | if (virq < 0) { | 1249 | if (virq < 0) { |
| 1198 | pr_debug("cannot allocate IRQ(base %d, count %d)\n", | 1250 | pr_debug("cannot allocate IRQ(base %d, count %d)\n", |
| 1199 | irq_base, nr_irqs); | 1251 | irq_base, nr_irqs); |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ef0bc02c3a70..73a2b786b5e9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq); | |||
| 115 | #ifdef CONFIG_SMP | 115 | #ifdef CONFIG_SMP |
| 116 | cpumask_var_t irq_default_affinity; | 116 | cpumask_var_t irq_default_affinity; |
| 117 | 117 | ||
| 118 | static int __irq_can_set_affinity(struct irq_desc *desc) | 118 | static bool __irq_can_set_affinity(struct irq_desc *desc) |
| 119 | { | 119 | { |
| 120 | if (!desc || !irqd_can_balance(&desc->irq_data) || | 120 | if (!desc || !irqd_can_balance(&desc->irq_data) || |
| 121 | !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) | 121 | !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) |
| 122 | return 0; | 122 | return false; |
| 123 | return 1; | 123 | return true; |
| 124 | } | 124 | } |
| 125 | 125 | ||
| 126 | /** | 126 | /** |
| @@ -134,6 +134,21 @@ int irq_can_set_affinity(unsigned int irq) | |||
| 134 | } | 134 | } |
| 135 | 135 | ||
| 136 | /** | 136 | /** |
| 137 | * irq_can_set_affinity_usr - Check if affinity of an irq can be set from user space | ||
| 138 | * @irq: Interrupt to check | ||
| 139 | * | ||
| 140 | * Like irq_can_set_affinity() above, but additionally checks for the | ||
| 141 | * AFFINITY_MANAGED flag. | ||
| 142 | */ | ||
| 143 | bool irq_can_set_affinity_usr(unsigned int irq) | ||
| 144 | { | ||
| 145 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 146 | |||
| 147 | return __irq_can_set_affinity(desc) && | ||
| 148 | !irqd_affinity_is_managed(&desc->irq_data); | ||
| 149 | } | ||
| 150 | |||
| 151 | /** | ||
| 137 | * irq_set_thread_affinity - Notify irq threads to adjust affinity | 152 | * irq_set_thread_affinity - Notify irq threads to adjust affinity |
| 138 | * @desc: irq descriptor which has affitnity changed | 153 | * @desc: irq descriptor which has affitnity changed |
| 139 | * | 154 | * |
| @@ -338,10 +353,11 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) | |||
| 338 | return 0; | 353 | return 0; |
| 339 | 354 | ||
| 340 | /* | 355 | /* |
| 341 | * Preserve an userspace affinity setup, but make sure that | 356 | * Preserve the managed affinity setting and a userspace affinity |
| 342 | * one of the targets is online. | 357 | * setup, but make sure that one of the targets is online. |
| 343 | */ | 358 | */ |
| 344 | if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { | 359 | if (irqd_affinity_is_managed(&desc->irq_data) || |
| 360 | irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { | ||
| 345 | if (cpumask_intersects(desc->irq_common_data.affinity, | 361 | if (cpumask_intersects(desc->irq_common_data.affinity, |
| 346 | cpu_online_mask)) | 362 | cpu_online_mask)) |
| 347 | set = desc->irq_common_data.affinity; | 363 | set = desc->irq_common_data.affinity; |
| @@ -1117,6 +1133,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1117 | new->irq = irq; | 1133 | new->irq = irq; |
| 1118 | 1134 | ||
| 1119 | /* | 1135 | /* |
| 1136 | * If the trigger type is not specified by the caller, | ||
| 1137 | * then use the default for this interrupt. | ||
| 1138 | */ | ||
| 1139 | if (!(new->flags & IRQF_TRIGGER_MASK)) | ||
| 1140 | new->flags |= irqd_get_trigger_type(&desc->irq_data); | ||
| 1141 | |||
| 1142 | /* | ||
| 1120 | * Check whether the interrupt nests into another interrupt | 1143 | * Check whether the interrupt nests into another interrupt |
| 1121 | * thread. | 1144 | * thread. |
| 1122 | */ | 1145 | */ |
| @@ -1409,10 +1432,18 @@ int setup_irq(unsigned int irq, struct irqaction *act) | |||
| 1409 | 1432 | ||
| 1410 | if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) | 1433 | if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) |
| 1411 | return -EINVAL; | 1434 | return -EINVAL; |
| 1435 | |||
| 1436 | retval = irq_chip_pm_get(&desc->irq_data); | ||
| 1437 | if (retval < 0) | ||
| 1438 | return retval; | ||
| 1439 | |||
| 1412 | chip_bus_lock(desc); | 1440 | chip_bus_lock(desc); |
| 1413 | retval = __setup_irq(irq, desc, act); | 1441 | retval = __setup_irq(irq, desc, act); |
| 1414 | chip_bus_sync_unlock(desc); | 1442 | chip_bus_sync_unlock(desc); |
| 1415 | 1443 | ||
| 1444 | if (retval) | ||
| 1445 | irq_chip_pm_put(&desc->irq_data); | ||
| 1446 | |||
| 1416 | return retval; | 1447 | return retval; |
| 1417 | } | 1448 | } |
| 1418 | EXPORT_SYMBOL_GPL(setup_irq); | 1449 | EXPORT_SYMBOL_GPL(setup_irq); |
| @@ -1506,6 +1537,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
| 1506 | } | 1537 | } |
| 1507 | } | 1538 | } |
| 1508 | 1539 | ||
| 1540 | irq_chip_pm_put(&desc->irq_data); | ||
| 1509 | module_put(desc->owner); | 1541 | module_put(desc->owner); |
| 1510 | kfree(action->secondary); | 1542 | kfree(action->secondary); |
| 1511 | return action; | 1543 | return action; |
| @@ -1648,11 +1680,16 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
| 1648 | action->name = devname; | 1680 | action->name = devname; |
| 1649 | action->dev_id = dev_id; | 1681 | action->dev_id = dev_id; |
| 1650 | 1682 | ||
| 1683 | retval = irq_chip_pm_get(&desc->irq_data); | ||
| 1684 | if (retval < 0) | ||
| 1685 | return retval; | ||
| 1686 | |||
| 1651 | chip_bus_lock(desc); | 1687 | chip_bus_lock(desc); |
| 1652 | retval = __setup_irq(irq, desc, action); | 1688 | retval = __setup_irq(irq, desc, action); |
| 1653 | chip_bus_sync_unlock(desc); | 1689 | chip_bus_sync_unlock(desc); |
| 1654 | 1690 | ||
| 1655 | if (retval) { | 1691 | if (retval) { |
| 1692 | irq_chip_pm_put(&desc->irq_data); | ||
| 1656 | kfree(action->secondary); | 1693 | kfree(action->secondary); |
| 1657 | kfree(action); | 1694 | kfree(action); |
| 1658 | } | 1695 | } |
| @@ -1730,7 +1767,14 @@ void enable_percpu_irq(unsigned int irq, unsigned int type) | |||
| 1730 | if (!desc) | 1767 | if (!desc) |
| 1731 | return; | 1768 | return; |
| 1732 | 1769 | ||
| 1770 | /* | ||
| 1771 | * If the trigger type is not specified by the caller, then | ||
| 1772 | * use the default for this interrupt. | ||
| 1773 | */ | ||
| 1733 | type &= IRQ_TYPE_SENSE_MASK; | 1774 | type &= IRQ_TYPE_SENSE_MASK; |
| 1775 | if (type == IRQ_TYPE_NONE) | ||
| 1776 | type = irqd_get_trigger_type(&desc->irq_data); | ||
| 1777 | |||
| 1734 | if (type != IRQ_TYPE_NONE) { | 1778 | if (type != IRQ_TYPE_NONE) { |
| 1735 | int ret; | 1779 | int ret; |
| 1736 | 1780 | ||
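A per-CPU user can likewise pass IRQ_TYPE_NONE and rely on the recorded default; a minimal sketch:

#include <linux/interrupt.h>
#include <linux/irq.h>

/* Hypothetical CPU-startup callback for a per-CPU interrupt. */
static void my_percpu_irq_starting(unsigned int irq)
{
        /* IRQ_TYPE_NONE now means "use the default trigger type recorded
         * for this irq" instead of skipping trigger configuration. */
        enable_percpu_irq(irq, IRQ_TYPE_NONE);
}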
| @@ -1822,6 +1866,7 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_ | |||
| 1822 | 1866 | ||
| 1823 | unregister_handler_proc(irq, action); | 1867 | unregister_handler_proc(irq, action); |
| 1824 | 1868 | ||
| 1869 | irq_chip_pm_put(&desc->irq_data); | ||
| 1825 | module_put(desc->owner); | 1870 | module_put(desc->owner); |
| 1826 | return action; | 1871 | return action; |
| 1827 | 1872 | ||
| @@ -1884,10 +1929,18 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) | |||
| 1884 | 1929 | ||
| 1885 | if (!desc || !irq_settings_is_per_cpu_devid(desc)) | 1930 | if (!desc || !irq_settings_is_per_cpu_devid(desc)) |
| 1886 | return -EINVAL; | 1931 | return -EINVAL; |
| 1932 | |||
| 1933 | retval = irq_chip_pm_get(&desc->irq_data); | ||
| 1934 | if (retval < 0) | ||
| 1935 | return retval; | ||
| 1936 | |||
| 1887 | chip_bus_lock(desc); | 1937 | chip_bus_lock(desc); |
| 1888 | retval = __setup_irq(irq, desc, act); | 1938 | retval = __setup_irq(irq, desc, act); |
| 1889 | chip_bus_sync_unlock(desc); | 1939 | chip_bus_sync_unlock(desc); |
| 1890 | 1940 | ||
| 1941 | if (retval) | ||
| 1942 | irq_chip_pm_put(&desc->irq_data); | ||
| 1943 | |||
| 1891 | return retval; | 1944 | return retval; |
| 1892 | } | 1945 | } |
| 1893 | 1946 | ||
| @@ -1931,12 +1984,18 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, | |||
| 1931 | action->name = devname; | 1984 | action->name = devname; |
| 1932 | action->percpu_dev_id = dev_id; | 1985 | action->percpu_dev_id = dev_id; |
| 1933 | 1986 | ||
| 1987 | retval = irq_chip_pm_get(&desc->irq_data); | ||
| 1988 | if (retval < 0) | ||
| 1989 | return retval; | ||
| 1990 | |||
| 1934 | chip_bus_lock(desc); | 1991 | chip_bus_lock(desc); |
| 1935 | retval = __setup_irq(irq, desc, action); | 1992 | retval = __setup_irq(irq, desc, action); |
| 1936 | chip_bus_sync_unlock(desc); | 1993 | chip_bus_sync_unlock(desc); |
| 1937 | 1994 | ||
| 1938 | if (retval) | 1995 | if (retval) { |
| 1996 | irq_chip_pm_put(&desc->irq_data); | ||
| 1939 | kfree(action); | 1997 | kfree(action); |
| 1998 | } | ||
| 1940 | 1999 | ||
| 1941 | return retval; | 2000 | return retval; |
| 1942 | } | 2001 | } |
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 38e89ce7b071..54999350162c 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c | |||
| @@ -324,7 +324,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, | |||
| 324 | struct msi_domain_ops *ops = info->ops; | 324 | struct msi_domain_ops *ops = info->ops; |
| 325 | msi_alloc_info_t arg; | 325 | msi_alloc_info_t arg; |
| 326 | struct msi_desc *desc; | 326 | struct msi_desc *desc; |
| 327 | int i, ret, virq = -1; | 327 | int i, ret, virq; |
| 328 | 328 | ||
| 329 | ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); | 329 | ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); |
| 330 | if (ret) | 330 | if (ret) |
| @@ -332,13 +332,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, | |||
| 332 | 332 | ||
| 333 | for_each_msi_entry(desc, dev) { | 333 | for_each_msi_entry(desc, dev) { |
| 334 | ops->set_desc(&arg, desc); | 334 | ops->set_desc(&arg, desc); |
| 335 | if (info->flags & MSI_FLAG_IDENTITY_MAP) | ||
| 336 | virq = (int)ops->get_hwirq(info, &arg); | ||
| 337 | else | ||
| 338 | virq = -1; | ||
| 339 | 335 | ||
| 340 | virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, | 336 | virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, |
| 341 | dev_to_node(dev), &arg, false); | 337 | dev_to_node(dev), &arg, false, |
| 338 | desc->affinity); | ||
| 342 | if (virq < 0) { | 339 | if (virq < 0) { |
| 343 | ret = -ENOSPC; | 340 | ret = -ENOSPC; |
| 344 | if (ops->handle_error) | 341 | if (ops->handle_error) |
| @@ -356,6 +353,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, | |||
| 356 | ops->msi_finish(&arg, 0); | 353 | ops->msi_finish(&arg, 0); |
| 357 | 354 | ||
| 358 | for_each_msi_entry(desc, dev) { | 355 | for_each_msi_entry(desc, dev) { |
| 356 | virq = desc->irq; | ||
| 359 | if (desc->nvec_used == 1) | 357 | if (desc->nvec_used == 1) |
| 360 | dev_dbg(dev, "irq %d for MSI\n", virq); | 358 | dev_dbg(dev, "irq %d for MSI\n", virq); |
| 361 | else | 359 | else |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4e1b94726818..feaa813b84a9 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
| @@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file, | |||
| 96 | cpumask_var_t new_value; | 96 | cpumask_var_t new_value; |
| 97 | int err; | 97 | int err; |
| 98 | 98 | ||
| 99 | if (!irq_can_set_affinity(irq) || no_irq_affinity) | 99 | if (!irq_can_set_affinity_usr(irq) || no_irq_affinity) |
| 100 | return -EIO; | 100 | return -EIO; |
| 101 | 101 | ||
| 102 | if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) | 102 | if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) |
| @@ -311,7 +311,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) | |||
| 311 | !name_unique(irq, action)) | 311 | !name_unique(irq, action)) |
| 312 | return; | 312 | return; |
| 313 | 313 | ||
| 314 | memset(name, 0, MAX_NAMELEN); | ||
| 315 | snprintf(name, MAX_NAMELEN, "%s", action->name); | 314 | snprintf(name, MAX_NAMELEN, "%s", action->name); |
| 316 | 315 | ||
| 317 | /* create /proc/irq/1234/handler/ */ | 316 | /* create /proc/irq/1234/handler/ */ |
| @@ -340,7 +339,6 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
| 340 | if (desc->dir) | 339 | if (desc->dir) |
| 341 | goto out_unlock; | 340 | goto out_unlock; |
| 342 | 341 | ||
| 343 | memset(name, 0, MAX_NAMELEN); | ||
| 344 | sprintf(name, "%d", irq); | 342 | sprintf(name, "%d", irq); |
| 345 | 343 | ||
| 346 | /* create /proc/irq/1234 */ | 344 | /* create /proc/irq/1234 */ |
| @@ -386,7 +384,6 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
| 386 | #endif | 384 | #endif |
| 387 | remove_proc_entry("spurious", desc->dir); | 385 | remove_proc_entry("spurious", desc->dir); |
| 388 | 386 | ||
| 389 | memset(name, 0, MAX_NAMELEN); | ||
| 390 | sprintf(name, "%u", irq); | 387 | sprintf(name, "%u", irq); |
| 391 | remove_proc_entry(name, root_irq_dir); | 388 | remove_proc_entry(name, root_irq_dir); |
| 392 | } | 389 | } |
| @@ -421,12 +418,8 @@ void init_irq_proc(void) | |||
| 421 | /* | 418 | /* |
| 422 | * Create entries for all existing IRQs. | 419 | * Create entries for all existing IRQs. |
| 423 | */ | 420 | */ |
| 424 | for_each_irq_desc(irq, desc) { | 421 | for_each_irq_desc(irq, desc) |
| 425 | if (!desc) | ||
| 426 | continue; | ||
| 427 | |||
| 428 | register_irq_proc(irq, desc); | 422 | register_irq_proc(irq, desc); |
| 429 | } | ||
| 430 | } | 423 | } |
| 431 | 424 | ||
| 432 | #ifdef CONFIG_GENERIC_IRQ_SHOW | 425 | #ifdef CONFIG_GENERIC_IRQ_SHOW |
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 4b353e0be121..0dbea887d625 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
| @@ -452,7 +452,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, | |||
| 452 | return notifier_from_errno(ret); | 452 | return notifier_from_errno(ret); |
| 453 | } | 453 | } |
| 454 | 454 | ||
| 455 | struct notifier_block jump_label_module_nb = { | 455 | static struct notifier_block jump_label_module_nb = { |
| 456 | .notifier_call = jump_label_module_notify, | 456 | .notifier_call = jump_label_module_notify, |
| 457 | .priority = 1, /* higher than tracepoints */ | 457 | .priority = 1, /* higher than tracepoints */ |
| 458 | }; | 458 | }; |
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 81f1a7107c0e..589d763a49b3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
| @@ -46,6 +46,7 @@ | |||
| 46 | #include <linux/gfp.h> | 46 | #include <linux/gfp.h> |
| 47 | #include <linux/kmemcheck.h> | 47 | #include <linux/kmemcheck.h> |
| 48 | #include <linux/random.h> | 48 | #include <linux/random.h> |
| 49 | #include <linux/jhash.h> | ||
| 49 | 50 | ||
| 50 | #include <asm/sections.h> | 51 | #include <asm/sections.h> |
| 51 | 52 | ||
| @@ -309,10 +310,14 @@ static struct hlist_head chainhash_table[CHAINHASH_SIZE]; | |||
| 309 | * It's a 64-bit hash, because it's important for the keys to be | 310 | * It's a 64-bit hash, because it's important for the keys to be |
| 310 | * unique. | 311 | * unique. |
| 311 | */ | 312 | */ |
| 312 | #define iterate_chain_key(key1, key2) \ | 313 | static inline u64 iterate_chain_key(u64 key, u32 idx) |
| 313 | (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ | 314 | { |
| 314 | ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ | 315 | u32 k0 = key, k1 = key >> 32; |
| 315 | (key2)) | 316 | |
| 317 | __jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */ | ||
| 318 | |||
| 319 | return k0 | (u64)k1 << 32; | ||
| 320 | } | ||
| 316 | 321 | ||
| 317 | void lockdep_off(void) | 322 | void lockdep_off(void) |
| 318 | { | 323 | { |
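The new folding mixes the class index into both 32-bit halves of the chain key instead of the old shift/xor, giving much better dispersion. A standalone user-space sketch of the idea; the rotation constants below follow Jenkins' lookup3 mix as used by <linux/jhash.h>, but treat the exact values as illustrative:

#include <stdint.h>
#include <stdio.h>

static inline uint32_t rol32(uint32_t w, unsigned int s)
{
        return (w << s) | (w >> (32 - s));
}

/* Simplified stand-in for __jhash_mix(): reversibly mixes three 32-bit words. */
#define jmix(a, b, c) do {                              \
        a -= c; a ^= rol32(c, 4);  c += b;              \
        b -= a; b ^= rol32(a, 6);  a += c;              \
        c -= b; c ^= rol32(b, 8);  b += a;              \
        a -= c; a ^= rol32(c, 16); c += b;              \
        b -= a; b ^= rol32(a, 19); a += c;              \
        c -= b; c ^= rol32(b, 4);  b += a;              \
} while (0)

static uint64_t iterate_chain_key(uint64_t key, uint32_t idx)
{
        uint32_t k0 = key, k1 = key >> 32;

        jmix(idx, k0, k1);              /* folds idx into both halves */
        return k0 | (uint64_t)k1 << 32;
}

int main(void)
{
        uint64_t key = 0;

        /* Fold a small sequence of lock-class indices into the chain key. */
        for (uint32_t idx = 1; idx <= 4; idx++) {
                key = iterate_chain_key(key, idx);
                printf("after class %u: %016llx\n", idx,
                       (unsigned long long)key);
        }
        return 0;
}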
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index d06ae3bb46c5..57a871ae3c81 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h | |||
| @@ -29,12 +29,12 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, | |||
| 29 | 29 | ||
| 30 | static inline void mutex_set_owner(struct mutex *lock) | 30 | static inline void mutex_set_owner(struct mutex *lock) |
| 31 | { | 31 | { |
| 32 | lock->owner = current; | 32 | WRITE_ONCE(lock->owner, current); |
| 33 | } | 33 | } |
| 34 | 34 | ||
| 35 | static inline void mutex_clear_owner(struct mutex *lock) | 35 | static inline void mutex_clear_owner(struct mutex *lock) |
| 36 | { | 36 | { |
| 37 | lock->owner = NULL; | 37 | WRITE_ONCE(lock->owner, NULL); |
| 38 | } | 38 | } |
| 39 | 39 | ||
| 40 | #define spin_lock_mutex(lock, flags) \ | 40 | #define spin_lock_mutex(lock, flags) \ |
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index a68bae5e852a..6cd6b8e9efd7 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h | |||
| @@ -17,14 +17,20 @@ | |||
| 17 | __list_del((waiter)->list.prev, (waiter)->list.next) | 17 | __list_del((waiter)->list.prev, (waiter)->list.next) |
| 18 | 18 | ||
| 19 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 19 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
| 20 | /* | ||
| 21 | * The mutex owner can get read and written to locklessly. | ||
| 22 | * We should use WRITE_ONCE when writing the owner value to | ||
| 23 | * avoid store tearing, otherwise, a thread could potentially | ||
| 24 | * read a partially written and incomplete owner value. | ||
| 25 | */ | ||
| 20 | static inline void mutex_set_owner(struct mutex *lock) | 26 | static inline void mutex_set_owner(struct mutex *lock) |
| 21 | { | 27 | { |
| 22 | lock->owner = current; | 28 | WRITE_ONCE(lock->owner, current); |
| 23 | } | 29 | } |
| 24 | 30 | ||
| 25 | static inline void mutex_clear_owner(struct mutex *lock) | 31 | static inline void mutex_clear_owner(struct mutex *lock) |
| 26 | { | 32 | { |
| 27 | lock->owner = NULL; | 33 | WRITE_ONCE(lock->owner, NULL); |
| 28 | } | 34 | } |
| 29 | #else | 35 | #else |
| 30 | static inline void mutex_set_owner(struct mutex *lock) | 36 | static inline void mutex_set_owner(struct mutex *lock) |
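The field is read locklessly by the optimistic-spinning code, which is why the plain assignments are replaced by WRITE_ONCE(). A simplified sketch of the reader it pairs with, assuming CONFIG_MUTEX_SPIN_ON_OWNER and CONFIG_SMP:

#include <linux/compiler.h>
#include <linux/mutex.h>
#include <linux/sched.h>

/* Simplified from the optimistic-spinning path: ->owner is read without
 * any lock held, so the writer must publish it with a single, untorn
 * store -- hence WRITE_ONCE() in mutex_set_owner()/mutex_clear_owner(). */
static bool mutex_owner_running(struct mutex *lock)
{
        struct task_struct *owner = READ_ONCE(lock->owner);

        return owner && READ_ONCE(owner->on_cpu);
}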
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fec082338668..19248ddf37ce 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c | |||
| @@ -93,7 +93,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) | |||
| 93 | * that accesses can't leak upwards out of our subsequent critical | 93 | * that accesses can't leak upwards out of our subsequent critical |
| 94 | * section in the case that the lock is currently held for write. | 94 | * section in the case that the lock is currently held for write. |
| 95 | */ | 95 | */ |
| 96 | cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; | 96 | cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts); |
| 97 | rspin_until_writer_unlock(lock, cnts); | 97 | rspin_until_writer_unlock(lock, cnts); |
| 98 | 98 | ||
| 99 | /* | 99 | /* |
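The substitution works because atomic_fetch_add_acquire() returns the counter value from before the addition, which is exactly what the old add_return-minus-bias expression reconstructed. A small user-space illustration of that identity using C11 atomics rather than the kernel's atomic_t API:

#include <assert.h>
#include <stdatomic.h>

int main(void)
{
        atomic_int cnts = ATOMIC_VAR_INIT(7);
        const int QR_BIAS = 256;        /* illustrative bias value */

        /* fetch_add returns the value *before* the addition ... */
        int old = atomic_fetch_add_explicit(&cnts, QR_BIAS,
                                            memory_order_acquire);

        /* ... which is what "add_return_acquire(...) - bias" reconstructed. */
        assert(old == 7);
        assert(atomic_load(&cnts) == 7 + QR_BIAS);
        return 0;
}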
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 5fc8c311b8fe..b2caec7315af 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c | |||
| @@ -90,7 +90,7 @@ static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]); | |||
| 90 | * therefore increment the cpu number by one. | 90 | * therefore increment the cpu number by one. |
| 91 | */ | 91 | */ |
| 92 | 92 | ||
| 93 | static inline u32 encode_tail(int cpu, int idx) | 93 | static inline __pure u32 encode_tail(int cpu, int idx) |
| 94 | { | 94 | { |
| 95 | u32 tail; | 95 | u32 tail; |
| 96 | 96 | ||
| @@ -103,7 +103,7 @@ static inline u32 encode_tail(int cpu, int idx) | |||
| 103 | return tail; | 103 | return tail; |
| 104 | } | 104 | } |
| 105 | 105 | ||
| 106 | static inline struct mcs_spinlock *decode_tail(u32 tail) | 106 | static inline __pure struct mcs_spinlock *decode_tail(u32 tail) |
| 107 | { | 107 | { |
| 108 | int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; | 108 | int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; |
| 109 | int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; | 109 | int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; |
| @@ -268,6 +268,63 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, | |||
| 268 | #endif | 268 | #endif |
| 269 | 269 | ||
| 270 | /* | 270 | /* |
| 271 | * Various notes on spin_is_locked() and spin_unlock_wait(), which are | ||
| 272 | * 'interesting' functions: | ||
| 273 | * | ||
| 274 | * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE | ||
| 275 | * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, | ||
| 276 | * PPC). Also qspinlock has a similar issue per construction, the setting of | ||
| 277 | * the locked byte can be unordered with respect to acquiring the lock proper. | ||
| 278 | * | ||
| 279 | * This gets to be 'interesting' in the following cases, where the /should/s | ||
| 280 | * end up false because of this issue. | ||
| 281 | * | ||
| 282 | * | ||
| 283 | * CASE 1: | ||
| 284 | * | ||
| 285 | * So the spin_is_locked() correctness issue comes from something like: | ||
| 286 | * | ||
| 287 | * CPU0 CPU1 | ||
| 288 | * | ||
| 289 | * global_lock(); local_lock(i) | ||
| 290 | * spin_lock(&G) spin_lock(&L[i]) | ||
| 291 | * for (i) if (!spin_is_locked(&G)) { | ||
| 292 | * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep(); | ||
| 293 | * return; | ||
| 294 | * } | ||
| 295 | * // deal with fail | ||
| 296 | * | ||
| 297 | * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such | ||
| 298 | * that there is exclusion between the two critical sections. | ||
| 299 | * | ||
| 300 | * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from | ||
| 301 | * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) | ||
| 302 | * /should/ be constrained by the ACQUIRE from spin_lock(&G). | ||
| 303 | * | ||
| 304 | * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. | ||
| 305 | * | ||
| 306 | * | ||
| 307 | * CASE 2: | ||
| 308 | * | ||
| 309 | * For spin_unlock_wait() there is a second correctness issue, namely: | ||
| 310 | * | ||
| 311 | * CPU0 CPU1 | ||
| 312 | * | ||
| 313 | * flag = set; | ||
| 314 | * smp_mb(); spin_lock(&l) | ||
| 315 | * spin_unlock_wait(&l); if (!flag) | ||
| 316 | * // add to lockless list | ||
| 317 | * spin_unlock(&l); | ||
| 318 | * // iterate lockless list | ||
| 319 | * | ||
| 320 | * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 | ||
| 321 | * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE | ||
| 322 | * semantics etc..) | ||
| 323 | * | ||
| 324 | * Where flag /should/ be ordered against the locked store of l. | ||
| 325 | */ | ||
| 326 | |||
| 327 | /* | ||
| 271 | * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before | 328 | * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before |
| 272 | * issuing an _unordered_ store to set _Q_LOCKED_VAL. | 329 | * issuing an _unordered_ store to set _Q_LOCKED_VAL. |
| 273 | * | 330 | * |
| @@ -322,7 +379,7 @@ void queued_spin_unlock_wait(struct qspinlock *lock) | |||
| 322 | cpu_relax(); | 379 | cpu_relax(); |
| 323 | 380 | ||
| 324 | done: | 381 | done: |
| 325 | smp_rmb(); /* CTRL + RMB -> ACQUIRE */ | 382 | smp_acquire__after_ctrl_dep(); |
| 326 | } | 383 | } |
| 327 | EXPORT_SYMBOL(queued_spin_unlock_wait); | 384 | EXPORT_SYMBOL(queued_spin_unlock_wait); |
| 328 | #endif | 385 | #endif |
| @@ -418,7 +475,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) | |||
| 418 | * sequentiality; this is because not all clear_pending_set_locked() | 475 | * sequentiality; this is because not all clear_pending_set_locked() |
| 419 | * implementations imply full barriers. | 476 | * implementations imply full barriers. |
| 420 | */ | 477 | */ |
| 421 | smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); | 478 | smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK)); |
| 422 | 479 | ||
| 423 | /* | 480 | /* |
| 424 | * take ownership and clear the pending bit. | 481 | * take ownership and clear the pending bit. |
| @@ -455,6 +512,8 @@ queue: | |||
| 455 | * pending stuff. | 512 | * pending stuff. |
| 456 | * | 513 | * |
| 457 | * p,*,* -> n,*,* | 514 | * p,*,* -> n,*,* |
| 515 | * | ||
| 516 | * RELEASE, such that the stores to @node must be complete. | ||
| 458 | */ | 517 | */ |
| 459 | old = xchg_tail(lock, tail); | 518 | old = xchg_tail(lock, tail); |
| 460 | next = NULL; | 519 | next = NULL; |
| @@ -465,6 +524,15 @@ queue: | |||
| 465 | */ | 524 | */ |
| 466 | if (old & _Q_TAIL_MASK) { | 525 | if (old & _Q_TAIL_MASK) { |
| 467 | prev = decode_tail(old); | 526 | prev = decode_tail(old); |
| 527 | /* | ||
| 528 | * The above xchg_tail() is also a load of @lock which generates, | ||
| 529 | * through decode_tail(), a pointer. | ||
| 530 | * | ||
| 531 | * The address dependency matches the RELEASE of xchg_tail() | ||
| 532 | * such that the access to @prev must happen after. | ||
| 533 | */ | ||
| 534 | smp_read_barrier_depends(); | ||
| 535 | |||
| 468 | WRITE_ONCE(prev->next, node); | 536 | WRITE_ONCE(prev->next, node); |
| 469 | 537 | ||
| 470 | pv_wait_node(node, prev); | 538 | pv_wait_node(node, prev); |
| @@ -494,7 +562,7 @@ queue: | |||
| 494 | * | 562 | * |
| 495 | * The PV pv_wait_head_or_lock function, if active, will acquire | 563 | * The PV pv_wait_head_or_lock function, if active, will acquire |
| 496 | * the lock and return a non-zero value. So we have to skip the | 564 | * the lock and return a non-zero value. So we have to skip the |
| 497 | * smp_cond_acquire() call. As the next PV queue head hasn't been | 565 | * smp_cond_load_acquire() call. As the next PV queue head hasn't been |
| 498 | * designated yet, there is no way for the locked value to become | 566 | * designated yet, there is no way for the locked value to become |
| 499 | * _Q_SLOW_VAL. So both the set_locked() and the | 567 | * _Q_SLOW_VAL. So both the set_locked() and the |
| 500 | * atomic_cmpxchg_relaxed() calls will be safe. | 568 | * atomic_cmpxchg_relaxed() calls will be safe. |
| @@ -505,7 +573,7 @@ queue: | |||
| 505 | if ((val = pv_wait_head_or_lock(lock, node))) | 573 | if ((val = pv_wait_head_or_lock(lock, node))) |
| 506 | goto locked; | 574 | goto locked; |
| 507 | 575 | ||
| 508 | smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); | 576 | val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK)); |
| 509 | 577 | ||
| 510 | locked: | 578 | locked: |
| 511 | /* | 579 | /* |
| @@ -525,9 +593,9 @@ locked: | |||
| 525 | break; | 593 | break; |
| 526 | } | 594 | } |
| 527 | /* | 595 | /* |
| 528 | * The smp_cond_acquire() call above has provided the necessary | 596 | * The smp_cond_load_acquire() call above has provided the |
| 529 | * acquire semantics required for locking. At most two | 597 | * necessary acquire semantics required for locking. At most |
| 530 | * iterations of this loop may be ran. | 598 | * two iterations of this loop may be run. |
| 531 | */ | 599 | */ |
| 532 | old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); | 600 | old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); |
| 533 | if (old == val) | 601 | if (old == val) |
| @@ -551,7 +619,7 @@ release: | |||
| 551 | /* | 619 | /* |
| 552 | * release the node | 620 | * release the node |
| 553 | */ | 621 | */ |
| 554 | this_cpu_dec(mcs_nodes[0].count); | 622 | __this_cpu_dec(mcs_nodes[0].count); |
| 555 | } | 623 | } |
| 556 | EXPORT_SYMBOL(queued_spin_lock_slowpath); | 624 | EXPORT_SYMBOL(queued_spin_lock_slowpath); |
| 557 | 625 | ||
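smp_cond_load_acquire(ptr, cond) replaces the open-coded smp_cond_acquire() calls: it spins on plain loads of *ptr, exposes each loaded value as VAL inside the condition expression, and gives the final, successful load ACQUIRE semantics. A hedged sketch of the pattern outside qspinlock, with wait_for_ready() and its parameters purely illustrative:

#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/types.h>

/* Wait for another CPU to publish a non-zero state word, then read the
 * payload it protects. The ACQUIRE on the final load orders the payload
 * read after the observation of the state change. */
static int wait_for_ready(u32 *state, int *payload)
{
        u32 s = smp_cond_load_acquire(state, VAL != 0);

        return s == 1 ? READ_ONCE(*payload) : -EINVAL;
}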
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 21ede57f68b3..37649e69056c 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h | |||
| @@ -112,12 +112,12 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) | |||
| 112 | #else /* _Q_PENDING_BITS == 8 */ | 112 | #else /* _Q_PENDING_BITS == 8 */ |
| 113 | static __always_inline void set_pending(struct qspinlock *lock) | 113 | static __always_inline void set_pending(struct qspinlock *lock) |
| 114 | { | 114 | { |
| 115 | atomic_set_mask(_Q_PENDING_VAL, &lock->val); | 115 | atomic_or(_Q_PENDING_VAL, &lock->val); |
| 116 | } | 116 | } |
| 117 | 117 | ||
| 118 | static __always_inline void clear_pending(struct qspinlock *lock) | 118 | static __always_inline void clear_pending(struct qspinlock *lock) |
| 119 | { | 119 | { |
| 120 | atomic_clear_mask(_Q_PENDING_VAL, &lock->val); | 120 | atomic_andnot(_Q_PENDING_VAL, &lock->val); |
| 121 | } | 121 | } |
| 122 | 122 | ||
| 123 | static __always_inline int trylock_clear_pending(struct qspinlock *lock) | 123 | static __always_inline int trylock_clear_pending(struct qspinlock *lock) |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 3e746607abe5..1ec0f48962b3 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
| @@ -1478,7 +1478,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); | |||
| 1478 | */ | 1478 | */ |
| 1479 | int __sched rt_mutex_trylock(struct rt_mutex *lock) | 1479 | int __sched rt_mutex_trylock(struct rt_mutex *lock) |
| 1480 | { | 1480 | { |
| 1481 | if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq())) | 1481 | if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) |
| 1482 | return 0; | 1482 | return 0; |
| 1483 | 1483 | ||
| 1484 | return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); | 1484 | return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); |
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 09e30c6225e5..447e08de1fab 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
| @@ -80,7 +80,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, | |||
| 80 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); | 80 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); |
| 81 | lockdep_init_map(&sem->dep_map, name, key, 0); | 81 | lockdep_init_map(&sem->dep_map, name, key, 0); |
| 82 | #endif | 82 | #endif |
| 83 | sem->count = RWSEM_UNLOCKED_VALUE; | 83 | atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); |
| 84 | raw_spin_lock_init(&sem->wait_lock); | 84 | raw_spin_lock_init(&sem->wait_lock); |
| 85 | INIT_LIST_HEAD(&sem->wait_list); | 85 | INIT_LIST_HEAD(&sem->wait_list); |
| 86 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | 86 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
| @@ -114,12 +114,16 @@ enum rwsem_wake_type { | |||
| 114 | * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) | 114 | * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) |
| 115 | * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) | 115 | * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) |
| 116 | * - there must be someone on the queue | 116 | * - there must be someone on the queue |
| 117 | * - the spinlock must be held by the caller | 117 | * - the wait_lock must be held by the caller |
| 118 | * - tasks are marked for wakeup, the caller must later invoke wake_up_q() | ||
| 119 | * to actually wakeup the blocked task(s) and drop the reference count, | ||
| 120 | * preferably when the wait_lock is released | ||
| 118 | * - woken process blocks are discarded from the list after having task zeroed | 121 | * - woken process blocks are discarded from the list after having task zeroed |
| 119 | * - writers are only woken if downgrading is false | 122 | * - writers are only marked woken if downgrading is false |
| 120 | */ | 123 | */ |
| 121 | static struct rw_semaphore * | 124 | static struct rw_semaphore * |
| 122 | __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | 125 | __rwsem_mark_wake(struct rw_semaphore *sem, |
| 126 | enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) | ||
| 123 | { | 127 | { |
| 124 | struct rwsem_waiter *waiter; | 128 | struct rwsem_waiter *waiter; |
| 125 | struct task_struct *tsk; | 129 | struct task_struct *tsk; |
| @@ -128,13 +132,16 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | |||
| 128 | 132 | ||
| 129 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); | 133 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); |
| 130 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { | 134 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { |
| 131 | if (wake_type == RWSEM_WAKE_ANY) | 135 | if (wake_type == RWSEM_WAKE_ANY) { |
| 132 | /* Wake writer at the front of the queue, but do not | 136 | /* |
| 133 | * grant it the lock yet as we want other writers | 137 | * Mark writer at the front of the queue for wakeup. |
| 134 | * to be able to steal it. Readers, on the other hand, | 138 | * Until the task is actually later awoken later by |
| 135 | * will block as they will notice the queued writer. | 139 | * the caller, other writers are able to steal it. |
| 140 | * Readers, on the other hand, will block as they | ||
| 141 | * will notice the queued writer. | ||
| 136 | */ | 142 | */ |
| 137 | wake_up_process(waiter->task); | 143 | wake_q_add(wake_q, waiter->task); |
| 144 | } | ||
| 138 | goto out; | 145 | goto out; |
| 139 | } | 146 | } |
| 140 | 147 | ||
| @@ -146,15 +153,27 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | |||
| 146 | if (wake_type != RWSEM_WAKE_READ_OWNED) { | 153 | if (wake_type != RWSEM_WAKE_READ_OWNED) { |
| 147 | adjustment = RWSEM_ACTIVE_READ_BIAS; | 154 | adjustment = RWSEM_ACTIVE_READ_BIAS; |
| 148 | try_reader_grant: | 155 | try_reader_grant: |
| 149 | oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; | 156 | oldcount = atomic_long_fetch_add(adjustment, &sem->count); |
| 157 | |||
| 150 | if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { | 158 | if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { |
| 151 | /* A writer stole the lock. Undo our reader grant. */ | 159 | /* |
| 152 | if (rwsem_atomic_update(-adjustment, sem) & | 160 | * If the count is still less than RWSEM_WAITING_BIAS |
| 153 | RWSEM_ACTIVE_MASK) | 161 | * after removing the adjustment, it is assumed that |
| 162 | * a writer has stolen the lock. We have to undo our | ||
| 163 | * reader grant. | ||
| 164 | */ | ||
| 165 | if (atomic_long_add_return(-adjustment, &sem->count) < | ||
| 166 | RWSEM_WAITING_BIAS) | ||
| 154 | goto out; | 167 | goto out; |
| 155 | /* Last active locker left. Retry waking readers. */ | 168 | /* Last active locker left. Retry waking readers. */ |
| 156 | goto try_reader_grant; | 169 | goto try_reader_grant; |
| 157 | } | 170 | } |
| 171 | /* | ||
| 172 | * It is not really necessary to set it to reader-owned here, | ||
| 173 | * but it gives the spinners an early indication that the | ||
| 174 | * readers now have the lock. | ||
| 175 | */ | ||
| 176 | rwsem_set_reader_owned(sem); | ||
| 158 | } | 177 | } |
| 159 | 178 | ||
| 160 | /* Grant an infinite number of read locks to the readers at the front | 179 | /* Grant an infinite number of read locks to the readers at the front |
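One detail worth noting in the hunk above: rwsem_atomic_update() returned the new count (hence the old code subtracted the adjustment back out), while atomic_long_fetch_add() returns the value the counter held before the addition and atomic_long_add_return() the value after it. A small self-contained illustration of the difference; the numbers are arbitrary:

    atomic_long_t v = ATOMIC_LONG_INIT(10);
    long before, after;

    before = atomic_long_fetch_add(5, &v);   /* before == 10, v is now 15 */
    after  = atomic_long_add_return(-5, &v); /* after  == 10, v is back to 10 */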
| @@ -179,7 +198,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | |||
| 179 | adjustment -= RWSEM_WAITING_BIAS; | 198 | adjustment -= RWSEM_WAITING_BIAS; |
| 180 | 199 | ||
| 181 | if (adjustment) | 200 | if (adjustment) |
| 182 | rwsem_atomic_add(adjustment, sem); | 201 | atomic_long_add(adjustment, &sem->count); |
| 183 | 202 | ||
| 184 | next = sem->wait_list.next; | 203 | next = sem->wait_list.next; |
| 185 | loop = woken; | 204 | loop = woken; |
| @@ -187,17 +206,15 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | |||
| 187 | waiter = list_entry(next, struct rwsem_waiter, list); | 206 | waiter = list_entry(next, struct rwsem_waiter, list); |
| 188 | next = waiter->list.next; | 207 | next = waiter->list.next; |
| 189 | tsk = waiter->task; | 208 | tsk = waiter->task; |
| 209 | |||
| 210 | wake_q_add(wake_q, tsk); | ||
| 190 | /* | 211 | /* |
| 191 | * Make sure we do not wakeup the next reader before | 212 | * Ensure that the last operation is setting the reader |
| 192 | * setting the nil condition to grant the next reader; | 213 | * waiter to nil such that rwsem_down_read_failed() cannot |
| 193 | * otherwise we could miss the wakeup on the other | 214 | * race with do_exit() by always holding a reference count |
| 194 | * side and end up sleeping again. See the pairing | 215 | * to the task to wakeup. |
| 195 | * in rwsem_down_read_failed(). | ||
| 196 | */ | 216 | */ |
| 197 | smp_mb(); | 217 | smp_store_release(&waiter->task, NULL); |
| 198 | waiter->task = NULL; | ||
| 199 | wake_up_process(tsk); | ||
| 200 | put_task_struct(tsk); | ||
| 201 | } while (--loop); | 218 | } while (--loop); |
| 202 | 219 | ||
| 203 | sem->wait_list.next = next; | 220 | sem->wait_list.next = next; |
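smp_store_release() replaces the explicit smp_mb() plus plain store: every store before it, including the wake_q_add() above, is guaranteed to be visible before waiter->task can be observed as NULL. A generic sketch of release/acquire pairing, with hypothetical flag and data variables (not the rwsem code itself):

    static int data;
    static int done;

    /* producer: publish the data, then release-store the flag */
    data = 42;
    smp_store_release(&done, 1);

    /* consumer: the acquire load pairs with the release store above */
    while (!smp_load_acquire(&done))
            cpu_relax();
    /* everything stored before the release (here data == 42) is visible now */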
| @@ -216,11 +233,11 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
| 216 | long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; | 233 | long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; |
| 217 | struct rwsem_waiter waiter; | 234 | struct rwsem_waiter waiter; |
| 218 | struct task_struct *tsk = current; | 235 | struct task_struct *tsk = current; |
| 236 | WAKE_Q(wake_q); | ||
| 219 | 237 | ||
| 220 | /* set up my own style of waitqueue */ | 238 | /* set up my own style of waitqueue */ |
| 221 | waiter.task = tsk; | 239 | waiter.task = tsk; |
| 222 | waiter.type = RWSEM_WAITING_FOR_READ; | 240 | waiter.type = RWSEM_WAITING_FOR_READ; |
| 223 | get_task_struct(tsk); | ||
| 224 | 241 | ||
| 225 | raw_spin_lock_irq(&sem->wait_lock); | 242 | raw_spin_lock_irq(&sem->wait_lock); |
| 226 | if (list_empty(&sem->wait_list)) | 243 | if (list_empty(&sem->wait_list)) |
| @@ -228,7 +245,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
| 228 | list_add_tail(&waiter.list, &sem->wait_list); | 245 | list_add_tail(&waiter.list, &sem->wait_list); |
| 229 | 246 | ||
| 230 | /* we're now waiting on the lock, but no longer actively locking */ | 247 | /* we're now waiting on the lock, but no longer actively locking */ |
| 231 | count = rwsem_atomic_update(adjustment, sem); | 248 | count = atomic_long_add_return(adjustment, &sem->count); |
| 232 | 249 | ||
| 233 | /* If there are no active locks, wake the front queued process(es). | 250 | /* If there are no active locks, wake the front queued process(es). |
| 234 | * | 251 | * |
| @@ -238,9 +255,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
| 238 | if (count == RWSEM_WAITING_BIAS || | 255 | if (count == RWSEM_WAITING_BIAS || |
| 239 | (count > RWSEM_WAITING_BIAS && | 256 | (count > RWSEM_WAITING_BIAS && |
| 240 | adjustment != -RWSEM_ACTIVE_READ_BIAS)) | 257 | adjustment != -RWSEM_ACTIVE_READ_BIAS)) |
| 241 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); | 258 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
| 242 | 259 | ||
| 243 | raw_spin_unlock_irq(&sem->wait_lock); | 260 | raw_spin_unlock_irq(&sem->wait_lock); |
| 261 | wake_up_q(&wake_q); | ||
| 244 | 262 | ||
| 245 | /* wait to be given the lock */ | 263 | /* wait to be given the lock */ |
| 246 | while (true) { | 264 | while (true) { |
| @@ -255,17 +273,29 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
| 255 | } | 273 | } |
| 256 | EXPORT_SYMBOL(rwsem_down_read_failed); | 274 | EXPORT_SYMBOL(rwsem_down_read_failed); |
| 257 | 275 | ||
| 276 | /* | ||
| 277 | * This function must be called with the sem->wait_lock held to prevent | ||
| 278 | * race conditions between checking the rwsem wait list and setting the | ||
| 279 | * sem->count accordingly. | ||
| 280 | */ | ||
| 258 | static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | 281 | static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) |
| 259 | { | 282 | { |
| 260 | /* | 283 | /* |
| 261 | * Try acquiring the write lock. Check count first in order | 284 | * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS. |
| 262 | * to reduce unnecessary expensive cmpxchg() operations. | ||
| 263 | */ | 285 | */ |
| 264 | if (count == RWSEM_WAITING_BIAS && | 286 | if (count != RWSEM_WAITING_BIAS) |
| 265 | cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, | 287 | return false; |
| 266 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { | 288 | |
| 267 | if (!list_is_singular(&sem->wait_list)) | 289 | /* |
| 268 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); | 290 | * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there |
| 291 | * are other tasks on the wait list, we need to add on WAITING_BIAS. | ||
| 292 | */ | ||
| 293 | count = list_is_singular(&sem->wait_list) ? | ||
| 294 | RWSEM_ACTIVE_WRITE_BIAS : | ||
| 295 | RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; | ||
| 296 | |||
| 297 | if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) | ||
| 298 | == RWSEM_WAITING_BIAS) { | ||
| 269 | rwsem_set_owner(sem); | 299 | rwsem_set_owner(sem); |
| 270 | return true; | 300 | return true; |
| 271 | } | 301 | } |
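atomic_long_cmpxchg_acquire() returns the value it found in the counter; the swap only took place if that value equals the expected one. A minimal sketch of the check-and-swap idiom used above, reusing the RWSEM_* constants from this file purely for illustration (when other waiters remain queued, the hunk above also adds RWSEM_WAITING_BIAS to the new value, selected via list_is_singular()):

    long expected = RWSEM_WAITING_BIAS;        /* only waiters, no active lockers */
    long desired  = RWSEM_ACTIVE_WRITE_BIAS;   /* what a lone waiter installs */

    if (atomic_long_cmpxchg_acquire(&sem->count, expected, desired) == expected) {
            /* the swap happened: we observed 'expected' and now own the lock */
    } else {
            /* the count changed first; it was left untouched, so retry or queue */
    }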
| @@ -279,13 +309,13 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | |||
| 279 | */ | 309 | */ |
| 280 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | 310 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) |
| 281 | { | 311 | { |
| 282 | long old, count = READ_ONCE(sem->count); | 312 | long old, count = atomic_long_read(&sem->count); |
| 283 | 313 | ||
| 284 | while (true) { | 314 | while (true) { |
| 285 | if (!(count == 0 || count == RWSEM_WAITING_BIAS)) | 315 | if (!(count == 0 || count == RWSEM_WAITING_BIAS)) |
| 286 | return false; | 316 | return false; |
| 287 | 317 | ||
| 288 | old = cmpxchg_acquire(&sem->count, count, | 318 | old = atomic_long_cmpxchg_acquire(&sem->count, count, |
| 289 | count + RWSEM_ACTIVE_WRITE_BIAS); | 319 | count + RWSEM_ACTIVE_WRITE_BIAS); |
| 290 | if (old == count) { | 320 | if (old == count) { |
| 291 | rwsem_set_owner(sem); | 321 | rwsem_set_owner(sem); |
| @@ -306,16 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) | |||
| 306 | 336 | ||
| 307 | rcu_read_lock(); | 337 | rcu_read_lock(); |
| 308 | owner = READ_ONCE(sem->owner); | 338 | owner = READ_ONCE(sem->owner); |
| 309 | if (!owner) { | 339 | if (!rwsem_owner_is_writer(owner)) { |
| 310 | long count = READ_ONCE(sem->count); | ||
| 311 | /* | 340 | /* |
| 312 | * If sem->owner is not set, yet we have just recently entered the | 341 | * Don't spin if the rwsem is reader-owned. |
| 313 | * slowpath with the lock being active, then there is a possibility | ||
| 314 | * reader(s) may have the lock. To be safe, bail spinning in these | ||
| 315 | * situations. | ||
| 316 | */ | 342 | */ |
| 317 | if (count & RWSEM_ACTIVE_MASK) | 343 | ret = !rwsem_owner_is_reader(owner); |
| 318 | ret = false; | ||
| 319 | goto done; | 344 | goto done; |
| 320 | } | 345 | } |
| 321 | 346 | ||
| @@ -325,10 +350,15 @@ done: | |||
| 325 | return ret; | 350 | return ret; |
| 326 | } | 351 | } |
| 327 | 352 | ||
| 328 | static noinline | 353 | /* |
| 329 | bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) | 354 | * Return true only if we can still spin on the owner field of the rwsem. |
| 355 | */ | ||
| 356 | static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) | ||
| 330 | { | 357 | { |
| 331 | long count; | 358 | struct task_struct *owner = READ_ONCE(sem->owner); |
| 359 | |||
| 360 | if (!rwsem_owner_is_writer(owner)) | ||
| 361 | goto out; | ||
| 332 | 362 | ||
| 333 | rcu_read_lock(); | 363 | rcu_read_lock(); |
| 334 | while (sem->owner == owner) { | 364 | while (sem->owner == owner) { |
| @@ -349,22 +379,16 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) | |||
| 349 | cpu_relax_lowlatency(); | 379 | cpu_relax_lowlatency(); |
| 350 | } | 380 | } |
| 351 | rcu_read_unlock(); | 381 | rcu_read_unlock(); |
| 352 | 382 | out: | |
| 353 | if (READ_ONCE(sem->owner)) | ||
| 354 | return true; /* new owner, continue spinning */ | ||
| 355 | |||
| 356 | /* | 383 | /* |
| 357 | * When the owner is not set, the lock could be free or | 384 | * If there is a new owner or the owner is not set, we continue |
| 358 | * held by readers. Check the counter to verify the | 385 | * spinning. |
| 359 | * state. | ||
| 360 | */ | 386 | */ |
| 361 | count = READ_ONCE(sem->count); | 387 | return !rwsem_owner_is_reader(READ_ONCE(sem->owner)); |
| 362 | return (count == 0 || count == RWSEM_WAITING_BIAS); | ||
| 363 | } | 388 | } |
| 364 | 389 | ||
| 365 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | 390 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) |
| 366 | { | 391 | { |
| 367 | struct task_struct *owner; | ||
| 368 | bool taken = false; | 392 | bool taken = false; |
| 369 | 393 | ||
| 370 | preempt_disable(); | 394 | preempt_disable(); |
| @@ -376,12 +400,17 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | |||
| 376 | if (!osq_lock(&sem->osq)) | 400 | if (!osq_lock(&sem->osq)) |
| 377 | goto done; | 401 | goto done; |
| 378 | 402 | ||
| 379 | while (true) { | 403 | /* |
| 380 | owner = READ_ONCE(sem->owner); | 404 | * Optimistically spin on the owner field and attempt to acquire the |
| 381 | if (owner && !rwsem_spin_on_owner(sem, owner)) | 405 | * lock whenever the owner changes. Spinning will be stopped when: |
| 382 | break; | 406 | * 1) the owning writer isn't running; or |
| 383 | 407 | * 2) readers own the lock as we can't determine if they are | |
| 384 | /* wait_lock will be acquired if write_lock is obtained */ | 408 | * actively running or not. |
| 409 | */ | ||
| 410 | while (rwsem_spin_on_owner(sem)) { | ||
| 411 | /* | ||
| 412 | * Try to acquire the lock | ||
| 413 | */ | ||
| 385 | if (rwsem_try_write_lock_unqueued(sem)) { | 414 | if (rwsem_try_write_lock_unqueued(sem)) { |
| 386 | taken = true; | 415 | taken = true; |
| 387 | break; | 416 | break; |
| @@ -393,7 +422,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | |||
| 393 | * we're an RT task that will live-lock because we won't let | 422 | * we're an RT task that will live-lock because we won't let |
| 394 | * the owner complete. | 423 | * the owner complete. |
| 395 | */ | 424 | */ |
| 396 | if (!owner && (need_resched() || rt_task(current))) | 425 | if (!sem->owner && (need_resched() || rt_task(current))) |
| 397 | break; | 426 | break; |
| 398 | 427 | ||
| 399 | /* | 428 | /* |
| @@ -440,9 +469,10 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) | |||
| 440 | bool waiting = true; /* any queued threads before us */ | 469 | bool waiting = true; /* any queued threads before us */ |
| 441 | struct rwsem_waiter waiter; | 470 | struct rwsem_waiter waiter; |
| 442 | struct rw_semaphore *ret = sem; | 471 | struct rw_semaphore *ret = sem; |
| 472 | WAKE_Q(wake_q); | ||
| 443 | 473 | ||
| 444 | /* undo write bias from down_write operation, stop active locking */ | 474 | /* undo write bias from down_write operation, stop active locking */ |
| 445 | count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); | 475 | count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); |
| 446 | 476 | ||
| 447 | /* do optimistic spinning and steal lock if possible */ | 477 | /* do optimistic spinning and steal lock if possible */ |
| 448 | if (rwsem_optimistic_spin(sem)) | 478 | if (rwsem_optimistic_spin(sem)) |
| @@ -465,18 +495,29 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) | |||
| 465 | 495 | ||
| 466 | /* we're now waiting on the lock, but no longer actively locking */ | 496 | /* we're now waiting on the lock, but no longer actively locking */ |
| 467 | if (waiting) { | 497 | if (waiting) { |
| 468 | count = READ_ONCE(sem->count); | 498 | count = atomic_long_read(&sem->count); |
| 469 | 499 | ||
| 470 | /* | 500 | /* |
| 471 | * If there were already threads queued before us and there are | 501 | * If there were already threads queued before us and there are |
| 472 | * no active writers, the lock must be read owned; so we try to | 502 | * no active writers, the lock must be read owned; so we try to |
| 473 | * wake any read locks that were queued ahead of us. | 503 | * wake any read locks that were queued ahead of us. |
| 474 | */ | 504 | */ |
| 475 | if (count > RWSEM_WAITING_BIAS) | 505 | if (count > RWSEM_WAITING_BIAS) { |
| 476 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); | 506 | WAKE_Q(wake_q); |
| 507 | |||
| 508 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); | ||
| 509 | /* | ||
| 510 | * The wakeup is normally called _after_ the wait_lock | ||
| 511 | * is released, but given that we are proactively waking | ||
| 512 | * readers we can deal with the wake_q overhead as it is | ||
| 513 | * similar to releasing and taking the wait_lock again | ||
| 514 | * for attempting rwsem_try_write_lock(). | ||
| 515 | */ | ||
| 516 | wake_up_q(&wake_q); | ||
| 517 | } | ||
| 477 | 518 | ||
| 478 | } else | 519 | } else |
| 479 | count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); | 520 | count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count); |
| 480 | 521 | ||
| 481 | /* wait until we successfully acquire the lock */ | 522 | /* wait until we successfully acquire the lock */ |
| 482 | set_current_state(state); | 523 | set_current_state(state); |
| @@ -492,7 +533,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) | |||
| 492 | 533 | ||
| 493 | schedule(); | 534 | schedule(); |
| 494 | set_current_state(state); | 535 | set_current_state(state); |
| 495 | } while ((count = sem->count) & RWSEM_ACTIVE_MASK); | 536 | } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); |
| 496 | 537 | ||
| 497 | raw_spin_lock_irq(&sem->wait_lock); | 538 | raw_spin_lock_irq(&sem->wait_lock); |
| 498 | } | 539 | } |
| @@ -507,10 +548,11 @@ out_nolock: | |||
| 507 | raw_spin_lock_irq(&sem->wait_lock); | 548 | raw_spin_lock_irq(&sem->wait_lock); |
| 508 | list_del(&waiter.list); | 549 | list_del(&waiter.list); |
| 509 | if (list_empty(&sem->wait_list)) | 550 | if (list_empty(&sem->wait_list)) |
| 510 | rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); | 551 | atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); |
| 511 | else | 552 | else |
| 512 | __rwsem_do_wake(sem, RWSEM_WAKE_ANY); | 553 | __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
| 513 | raw_spin_unlock_irq(&sem->wait_lock); | 554 | raw_spin_unlock_irq(&sem->wait_lock); |
| 555 | wake_up_q(&wake_q); | ||
| 514 | 556 | ||
| 515 | return ERR_PTR(-EINTR); | 557 | return ERR_PTR(-EINTR); |
| 516 | } | 558 | } |
| @@ -537,6 +579,7 @@ __visible | |||
| 537 | struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) | 579 | struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) |
| 538 | { | 580 | { |
| 539 | unsigned long flags; | 581 | unsigned long flags; |
| 582 | WAKE_Q(wake_q); | ||
| 540 | 583 | ||
| 541 | /* | 584 | /* |
| 542 | * If a spinner is present, it is not necessary to do the wakeup. | 585 | * If a spinner is present, it is not necessary to do the wakeup. |
| @@ -573,9 +616,10 @@ locked: | |||
| 573 | 616 | ||
| 574 | /* do nothing if list empty */ | 617 | /* do nothing if list empty */ |
| 575 | if (!list_empty(&sem->wait_list)) | 618 | if (!list_empty(&sem->wait_list)) |
| 576 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); | 619 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
| 577 | 620 | ||
| 578 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 621 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
| 622 | wake_up_q(&wake_q); | ||
| 579 | 623 | ||
| 580 | return sem; | 624 | return sem; |
| 581 | } | 625 | } |
| @@ -590,14 +634,16 @@ __visible | |||
| 590 | struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) | 634 | struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) |
| 591 | { | 635 | { |
| 592 | unsigned long flags; | 636 | unsigned long flags; |
| 637 | WAKE_Q(wake_q); | ||
| 593 | 638 | ||
| 594 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 639 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
| 595 | 640 | ||
| 596 | /* do nothing if list empty */ | 641 | /* do nothing if list empty */ |
| 597 | if (!list_empty(&sem->wait_list)) | 642 | if (!list_empty(&sem->wait_list)) |
| 598 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); | 643 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); |
| 599 | 644 | ||
| 600 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 645 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
| 646 | wake_up_q(&wake_q); | ||
| 601 | 647 | ||
| 602 | return sem; | 648 | return sem; |
| 603 | } | 649 | } |
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2e853ad93a3a..45ba475d4be3 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c | |||
| @@ -22,6 +22,7 @@ void __sched down_read(struct rw_semaphore *sem) | |||
| 22 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); | 22 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); |
| 23 | 23 | ||
| 24 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); | 24 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
| 25 | rwsem_set_reader_owned(sem); | ||
| 25 | } | 26 | } |
| 26 | 27 | ||
| 27 | EXPORT_SYMBOL(down_read); | 28 | EXPORT_SYMBOL(down_read); |
| @@ -33,8 +34,10 @@ int down_read_trylock(struct rw_semaphore *sem) | |||
| 33 | { | 34 | { |
| 34 | int ret = __down_read_trylock(sem); | 35 | int ret = __down_read_trylock(sem); |
| 35 | 36 | ||
| 36 | if (ret == 1) | 37 | if (ret == 1) { |
| 37 | rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); | 38 | rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); |
| 39 | rwsem_set_reader_owned(sem); | ||
| 40 | } | ||
| 38 | return ret; | 41 | return ret; |
| 39 | } | 42 | } |
| 40 | 43 | ||
| @@ -124,7 +127,7 @@ void downgrade_write(struct rw_semaphore *sem) | |||
| 124 | * lockdep: a downgraded write will live on as a write | 127 | * lockdep: a downgraded write will live on as a write |
| 125 | * dependency. | 128 | * dependency. |
| 126 | */ | 129 | */ |
| 127 | rwsem_clear_owner(sem); | 130 | rwsem_set_reader_owned(sem); |
| 128 | __downgrade_write(sem); | 131 | __downgrade_write(sem); |
| 129 | } | 132 | } |
| 130 | 133 | ||
| @@ -138,6 +141,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) | |||
| 138 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); | 141 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); |
| 139 | 142 | ||
| 140 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); | 143 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
| 144 | rwsem_set_reader_owned(sem); | ||
| 141 | } | 145 | } |
| 142 | 146 | ||
| 143 | EXPORT_SYMBOL(down_read_nested); | 147 | EXPORT_SYMBOL(down_read_nested); |
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 870ed9a5b426..a699f4048ba1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h | |||
| @@ -1,14 +1,58 @@ | |||
| 1 | /* | ||
| 2 | * The owner field of the rw_semaphore structure will be set to | ||
| 3 | * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear | ||
| 4 | * the owner field when it unlocks. A reader, on the other hand, will | ||
| 5 | * not touch the owner field when it unlocks. | ||
| 6 | * | ||
| 7 | * In essence, the owner field now has the following 3 states: | ||
| 8 | * 1) 0 | ||
| 9 | * - lock is free or the owner hasn't set the field yet | ||
| 10 | * 2) RWSEM_READER_OWNED | ||
| 11 | * - lock is currently or previously owned by readers (lock is free | ||
| 12 | * or not set by owner yet) | ||
| 13 | * 3) Other non-zero value | ||
| 14 | * - a writer owns the lock | ||
| 15 | */ | ||
| 16 | #define RWSEM_READER_OWNED ((struct task_struct *)1UL) | ||
| 17 | |||
| 1 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | 18 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
| 19 | /* | ||
| 20 | * All writes to owner are protected by WRITE_ONCE() to make sure that | ||
| 21 | * store tearing can't happen as optimistic spinners may read and use | ||
| 22 | * the owner value concurrently without lock. Read from owner, however, | ||
| 23 | * may not need READ_ONCE() as long as the pointer value is only used | ||
| 24 | * for comparison and isn't being dereferenced. | ||
| 25 | */ | ||
| 2 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | 26 | static inline void rwsem_set_owner(struct rw_semaphore *sem) |
| 3 | { | 27 | { |
| 4 | sem->owner = current; | 28 | WRITE_ONCE(sem->owner, current); |
| 5 | } | 29 | } |
| 6 | 30 | ||
| 7 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | 31 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) |
| 8 | { | 32 | { |
| 9 | sem->owner = NULL; | 33 | WRITE_ONCE(sem->owner, NULL); |
| 34 | } | ||
| 35 | |||
| 36 | static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) | ||
| 37 | { | ||
| 38 | /* | ||
| 39 | * We check the owner value first to make sure that we will only | ||
| 40 | * do a write to the rwsem cacheline when it is really necessary | ||
| 41 | * to minimize cacheline contention. | ||
| 42 | */ | ||
| 43 | if (sem->owner != RWSEM_READER_OWNED) | ||
| 44 | WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); | ||
| 45 | } | ||
| 46 | |||
| 47 | static inline bool rwsem_owner_is_writer(struct task_struct *owner) | ||
| 48 | { | ||
| 49 | return owner && owner != RWSEM_READER_OWNED; | ||
| 10 | } | 50 | } |
| 11 | 51 | ||
| 52 | static inline bool rwsem_owner_is_reader(struct task_struct *owner) | ||
| 53 | { | ||
| 54 | return owner == RWSEM_READER_OWNED; | ||
| 55 | } | ||
| 12 | #else | 56 | #else |
| 13 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | 57 | static inline void rwsem_set_owner(struct rw_semaphore *sem) |
| 14 | { | 58 | { |
| @@ -17,4 +61,8 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem) | |||
| 17 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | 61 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) |
| 18 | { | 62 | { |
| 19 | } | 63 | } |
| 64 | |||
| 65 | static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) | ||
| 66 | { | ||
| 67 | } | ||
| 20 | #endif | 68 | #endif |
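Taken together, the new owner encoding gives the spinning code three cases to distinguish. A condensed sketch of how the helpers above are meant to be used (not the exact upstream code):

    struct task_struct *owner = READ_ONCE(sem->owner);

    if (rwsem_owner_is_writer(owner)) {
            /* a writer holds the lock: keep spinning while it stays on a CPU */
    } else if (rwsem_owner_is_reader(owner)) {
            /* readers hold (or recently held) the lock: give up spinning */
    } else {
            /* owner == NULL: lock is free or the owner has not recorded itself yet */
    }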
diff --git a/kernel/memremap.c b/kernel/memremap.c index 017532193fb1..251d16b4cb41 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
| @@ -169,12 +169,6 @@ void devm_memunmap(struct device *dev, void *addr) | |||
| 169 | } | 169 | } |
| 170 | EXPORT_SYMBOL(devm_memunmap); | 170 | EXPORT_SYMBOL(devm_memunmap); |
| 171 | 171 | ||
| 172 | pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) | ||
| 173 | { | ||
| 174 | return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); | ||
| 175 | } | ||
| 176 | EXPORT_SYMBOL(phys_to_pfn_t); | ||
| 177 | |||
| 178 | #ifdef CONFIG_ZONE_DEVICE | 172 | #ifdef CONFIG_ZONE_DEVICE |
| 179 | static DEFINE_MUTEX(pgmap_lock); | 173 | static DEFINE_MUTEX(pgmap_lock); |
| 180 | static RADIX_TREE(pgmap_radix, GFP_KERNEL); | 174 | static RADIX_TREE(pgmap_radix, GFP_KERNEL); |
| @@ -308,12 +302,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
| 308 | if (is_ram == REGION_INTERSECTS) | 302 | if (is_ram == REGION_INTERSECTS) |
| 309 | return __va(res->start); | 303 | return __va(res->start); |
| 310 | 304 | ||
| 311 | if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { | ||
| 312 | dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", | ||
| 313 | __func__); | ||
| 314 | return ERR_PTR(-ENXIO); | ||
| 315 | } | ||
| 316 | |||
| 317 | if (!ref) | 305 | if (!ref) |
| 318 | return ERR_PTR(-EINVAL); | 306 | return ERR_PTR(-EINVAL); |
| 319 | 307 | ||
| @@ -401,7 +389,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) | |||
| 401 | altmap->alloc -= nr_pfns; | 389 | altmap->alloc -= nr_pfns; |
| 402 | } | 390 | } |
| 403 | 391 | ||
| 404 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | ||
| 405 | struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) | 392 | struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) |
| 406 | { | 393 | { |
| 407 | /* | 394 | /* |
| @@ -427,5 +414,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) | |||
| 427 | 414 | ||
| 428 | return pgmap ? pgmap->altmap : NULL; | 415 | return pgmap ? pgmap->altmap : NULL; |
| 429 | } | 416 | } |
| 430 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | ||
| 431 | #endif /* CONFIG_ZONE_DEVICE */ | 417 | #endif /* CONFIG_ZONE_DEVICE */ |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index cb880a14cc39..eb4f717705ba 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
| @@ -1,6 +1,8 @@ | |||
| 1 | 1 | ||
| 2 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG | 2 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG |
| 3 | 3 | ||
| 4 | KASAN_SANITIZE_snapshot.o := n | ||
| 5 | |||
| 4 | obj-y += qos.o | 6 | obj-y += qos.o |
| 5 | obj-$(CONFIG_PM) += main.o | 7 | obj-$(CONFIG_PM) += main.o |
| 6 | obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o | 8 | obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o |
diff --git a/kernel/power/console.c b/kernel/power/console.c index aba9c545a0e3..0e781798b0b3 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c | |||
| @@ -126,17 +126,17 @@ out: | |||
| 126 | return ret; | 126 | return ret; |
| 127 | } | 127 | } |
| 128 | 128 | ||
| 129 | int pm_prepare_console(void) | 129 | void pm_prepare_console(void) |
| 130 | { | 130 | { |
| 131 | if (!pm_vt_switch()) | 131 | if (!pm_vt_switch()) |
| 132 | return 0; | 132 | return; |
| 133 | 133 | ||
| 134 | orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); | 134 | orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); |
| 135 | if (orig_fgconsole < 0) | 135 | if (orig_fgconsole < 0) |
| 136 | return 1; | 136 | return; |
| 137 | 137 | ||
| 138 | orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); | 138 | orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); |
| 139 | return 0; | 139 | return; |
| 140 | } | 140 | } |
| 141 | 141 | ||
| 142 | void pm_restore_console(void) | 142 | void pm_restore_console(void) |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index fca9254280ee..a881c6a7ba74 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -52,6 +52,7 @@ enum { | |||
| 52 | #ifdef CONFIG_SUSPEND | 52 | #ifdef CONFIG_SUSPEND |
| 53 | HIBERNATION_SUSPEND, | 53 | HIBERNATION_SUSPEND, |
| 54 | #endif | 54 | #endif |
| 55 | HIBERNATION_TEST_RESUME, | ||
| 55 | /* keep last */ | 56 | /* keep last */ |
| 56 | __HIBERNATION_AFTER_LAST | 57 | __HIBERNATION_AFTER_LAST |
| 57 | }; | 58 | }; |
| @@ -409,6 +410,11 @@ int hibernation_snapshot(int platform_mode) | |||
| 409 | goto Close; | 410 | goto Close; |
| 410 | } | 411 | } |
| 411 | 412 | ||
| 413 | int __weak hibernate_resume_nonboot_cpu_disable(void) | ||
| 414 | { | ||
| 415 | return disable_nonboot_cpus(); | ||
| 416 | } | ||
| 417 | |||
| 412 | /** | 418 | /** |
| 413 | * resume_target_kernel - Restore system state from a hibernation image. | 419 | * resume_target_kernel - Restore system state from a hibernation image. |
| 414 | * @platform_mode: Whether or not to use the platform driver. | 420 | * @platform_mode: Whether or not to use the platform driver. |
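The __weak default added above simply calls disable_nonboot_cpus(); an architecture that needs extra work before taking CPUs down during resume can supply a strong definition that overrides it at link time. A hypothetical arch-side override, shown only to illustrate the mechanism (not part of this patch):

    /* arch/foo/kernel/hibernate.c - hypothetical arch-specific override */
    int hibernate_resume_nonboot_cpu_disable(void)
    {
            /* arch-specific preparation would go here ... */
            return disable_nonboot_cpus();
    }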
| @@ -433,7 +439,7 @@ static int resume_target_kernel(bool platform_mode) | |||
| 433 | if (error) | 439 | if (error) |
| 434 | goto Cleanup; | 440 | goto Cleanup; |
| 435 | 441 | ||
| 436 | error = disable_nonboot_cpus(); | 442 | error = hibernate_resume_nonboot_cpu_disable(); |
| 437 | if (error) | 443 | if (error) |
| 438 | goto Enable_cpus; | 444 | goto Enable_cpus; |
| 439 | 445 | ||
| @@ -642,12 +648,39 @@ static void power_down(void) | |||
| 642 | cpu_relax(); | 648 | cpu_relax(); |
| 643 | } | 649 | } |
| 644 | 650 | ||
| 651 | static int load_image_and_restore(void) | ||
| 652 | { | ||
| 653 | int error; | ||
| 654 | unsigned int flags; | ||
| 655 | |||
| 656 | pr_debug("PM: Loading hibernation image.\n"); | ||
| 657 | |||
| 658 | lock_device_hotplug(); | ||
| 659 | error = create_basic_memory_bitmaps(); | ||
| 660 | if (error) | ||
| 661 | goto Unlock; | ||
| 662 | |||
| 663 | error = swsusp_read(&flags); | ||
| 664 | swsusp_close(FMODE_READ); | ||
| 665 | if (!error) | ||
| 666 | hibernation_restore(flags & SF_PLATFORM_MODE); | ||
| 667 | |||
| 668 | printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); | ||
| 669 | swsusp_free(); | ||
| 670 | free_basic_memory_bitmaps(); | ||
| 671 | Unlock: | ||
| 672 | unlock_device_hotplug(); | ||
| 673 | |||
| 674 | return error; | ||
| 675 | } | ||
| 676 | |||
| 645 | /** | 677 | /** |
| 646 | * hibernate - Carry out system hibernation, including saving the image. | 678 | * hibernate - Carry out system hibernation, including saving the image. |
| 647 | */ | 679 | */ |
| 648 | int hibernate(void) | 680 | int hibernate(void) |
| 649 | { | 681 | { |
| 650 | int error; | 682 | int error, nr_calls = 0; |
| 683 | bool snapshot_test = false; | ||
| 651 | 684 | ||
| 652 | if (!hibernation_available()) { | 685 | if (!hibernation_available()) { |
| 653 | pr_debug("PM: Hibernation not available.\n"); | 686 | pr_debug("PM: Hibernation not available.\n"); |
| @@ -662,9 +695,11 @@ int hibernate(void) | |||
| 662 | } | 695 | } |
| 663 | 696 | ||
| 664 | pm_prepare_console(); | 697 | pm_prepare_console(); |
| 665 | error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); | 698 | error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); |
| 666 | if (error) | 699 | if (error) { |
| 700 | nr_calls--; | ||
| 667 | goto Exit; | 701 | goto Exit; |
| 702 | } | ||
| 668 | 703 | ||
| 669 | printk(KERN_INFO "PM: Syncing filesystems ... "); | 704 | printk(KERN_INFO "PM: Syncing filesystems ... "); |
| 670 | sys_sync(); | 705 | sys_sync(); |
| @@ -697,8 +732,12 @@ int hibernate(void) | |||
| 697 | pr_debug("PM: writing image.\n"); | 732 | pr_debug("PM: writing image.\n"); |
| 698 | error = swsusp_write(flags); | 733 | error = swsusp_write(flags); |
| 699 | swsusp_free(); | 734 | swsusp_free(); |
| 700 | if (!error) | 735 | if (!error) { |
| 701 | power_down(); | 736 | if (hibernation_mode == HIBERNATION_TEST_RESUME) |
| 737 | snapshot_test = true; | ||
| 738 | else | ||
| 739 | power_down(); | ||
| 740 | } | ||
| 702 | in_suspend = 0; | 741 | in_suspend = 0; |
| 703 | pm_restore_gfp_mask(); | 742 | pm_restore_gfp_mask(); |
| 704 | } else { | 743 | } else { |
| @@ -709,12 +748,18 @@ int hibernate(void) | |||
| 709 | free_basic_memory_bitmaps(); | 748 | free_basic_memory_bitmaps(); |
| 710 | Thaw: | 749 | Thaw: |
| 711 | unlock_device_hotplug(); | 750 | unlock_device_hotplug(); |
| 751 | if (snapshot_test) { | ||
| 752 | pr_debug("PM: Checking hibernation image\n"); | ||
| 753 | error = swsusp_check(); | ||
| 754 | if (!error) | ||
| 755 | error = load_image_and_restore(); | ||
| 756 | } | ||
| 712 | thaw_processes(); | 757 | thaw_processes(); |
| 713 | 758 | ||
| 714 | /* Don't bother checking whether freezer_test_done is true */ | 759 | /* Don't bother checking whether freezer_test_done is true */ |
| 715 | freezer_test_done = false; | 760 | freezer_test_done = false; |
| 716 | Exit: | 761 | Exit: |
| 717 | pm_notifier_call_chain(PM_POST_HIBERNATION); | 762 | __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); |
| 718 | pm_restore_console(); | 763 | pm_restore_console(); |
| 719 | atomic_inc(&snapshot_device_available); | 764 | atomic_inc(&snapshot_device_available); |
| 720 | Unlock: | 765 | Unlock: |
| @@ -740,8 +785,7 @@ int hibernate(void) | |||
| 740 | */ | 785 | */ |
| 741 | static int software_resume(void) | 786 | static int software_resume(void) |
| 742 | { | 787 | { |
| 743 | int error; | 788 | int error, nr_calls = 0; |
| 744 | unsigned int flags; | ||
| 745 | 789 | ||
| 746 | /* | 790 | /* |
| 747 | * If the user said "noresume".. bail out early. | 791 | * If the user said "noresume".. bail out early. |
| @@ -827,35 +871,20 @@ static int software_resume(void) | |||
| 827 | } | 871 | } |
| 828 | 872 | ||
| 829 | pm_prepare_console(); | 873 | pm_prepare_console(); |
| 830 | error = pm_notifier_call_chain(PM_RESTORE_PREPARE); | 874 | error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); |
| 831 | if (error) | 875 | if (error) { |
| 876 | nr_calls--; | ||
| 832 | goto Close_Finish; | 877 | goto Close_Finish; |
| 878 | } | ||
| 833 | 879 | ||
| 834 | pr_debug("PM: Preparing processes for restore.\n"); | 880 | pr_debug("PM: Preparing processes for restore.\n"); |
| 835 | error = freeze_processes(); | 881 | error = freeze_processes(); |
| 836 | if (error) | 882 | if (error) |
| 837 | goto Close_Finish; | 883 | goto Close_Finish; |
| 838 | 884 | error = load_image_and_restore(); | |
| 839 | pr_debug("PM: Loading hibernation image.\n"); | ||
| 840 | |||
| 841 | lock_device_hotplug(); | ||
| 842 | error = create_basic_memory_bitmaps(); | ||
| 843 | if (error) | ||
| 844 | goto Thaw; | ||
| 845 | |||
| 846 | error = swsusp_read(&flags); | ||
| 847 | swsusp_close(FMODE_READ); | ||
| 848 | if (!error) | ||
| 849 | hibernation_restore(flags & SF_PLATFORM_MODE); | ||
| 850 | |||
| 851 | printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); | ||
| 852 | swsusp_free(); | ||
| 853 | free_basic_memory_bitmaps(); | ||
| 854 | Thaw: | ||
| 855 | unlock_device_hotplug(); | ||
| 856 | thaw_processes(); | 885 | thaw_processes(); |
| 857 | Finish: | 886 | Finish: |
| 858 | pm_notifier_call_chain(PM_POST_RESTORE); | 887 | __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); |
| 859 | pm_restore_console(); | 888 | pm_restore_console(); |
| 860 | atomic_inc(&snapshot_device_available); | 889 | atomic_inc(&snapshot_device_available); |
| 861 | /* For success case, the suspend path will release the lock */ | 890 | /* For success case, the suspend path will release the lock */ |
| @@ -878,6 +907,7 @@ static const char * const hibernation_modes[] = { | |||
| 878 | #ifdef CONFIG_SUSPEND | 907 | #ifdef CONFIG_SUSPEND |
| 879 | [HIBERNATION_SUSPEND] = "suspend", | 908 | [HIBERNATION_SUSPEND] = "suspend", |
| 880 | #endif | 909 | #endif |
| 910 | [HIBERNATION_TEST_RESUME] = "test_resume", | ||
| 881 | }; | 911 | }; |
| 882 | 912 | ||
| 883 | /* | 913 | /* |
| @@ -924,6 +954,7 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
| 924 | #ifdef CONFIG_SUSPEND | 954 | #ifdef CONFIG_SUSPEND |
| 925 | case HIBERNATION_SUSPEND: | 955 | case HIBERNATION_SUSPEND: |
| 926 | #endif | 956 | #endif |
| 957 | case HIBERNATION_TEST_RESUME: | ||
| 927 | break; | 958 | break; |
| 928 | case HIBERNATION_PLATFORM: | 959 | case HIBERNATION_PLATFORM: |
| 929 | if (hibernation_ops) | 960 | if (hibernation_ops) |
| @@ -970,6 +1001,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
| 970 | #ifdef CONFIG_SUSPEND | 1001 | #ifdef CONFIG_SUSPEND |
| 971 | case HIBERNATION_SUSPEND: | 1002 | case HIBERNATION_SUSPEND: |
| 972 | #endif | 1003 | #endif |
| 1004 | case HIBERNATION_TEST_RESUME: | ||
| 973 | hibernation_mode = mode; | 1005 | hibernation_mode = mode; |
| 974 | break; | 1006 | break; |
| 975 | case HIBERNATION_PLATFORM: | 1007 | case HIBERNATION_PLATFORM: |
| @@ -1115,13 +1147,16 @@ static int __init resume_offset_setup(char *str) | |||
| 1115 | 1147 | ||
| 1116 | static int __init hibernate_setup(char *str) | 1148 | static int __init hibernate_setup(char *str) |
| 1117 | { | 1149 | { |
| 1118 | if (!strncmp(str, "noresume", 8)) | 1150 | if (!strncmp(str, "noresume", 8)) { |
| 1119 | noresume = 1; | 1151 | noresume = 1; |
| 1120 | else if (!strncmp(str, "nocompress", 10)) | 1152 | } else if (!strncmp(str, "nocompress", 10)) { |
| 1121 | nocompress = 1; | 1153 | nocompress = 1; |
| 1122 | else if (!strncmp(str, "no", 2)) { | 1154 | } else if (!strncmp(str, "no", 2)) { |
| 1123 | noresume = 1; | 1155 | noresume = 1; |
| 1124 | nohibernate = 1; | 1156 | nohibernate = 1; |
| 1157 | } else if (IS_ENABLED(CONFIG_DEBUG_RODATA) | ||
| 1158 | && !strncmp(str, "protect_image", 13)) { | ||
| 1159 | enable_restore_image_protection(); | ||
| 1125 | } | 1160 | } |
| 1126 | return 1; | 1161 | return 1; |
| 1127 | } | 1162 | } |
| @@ -1154,11 +1189,6 @@ static int __init nohibernate_setup(char *str) | |||
| 1154 | return 1; | 1189 | return 1; |
| 1155 | } | 1190 | } |
| 1156 | 1191 | ||
| 1157 | static int __init kaslr_nohibernate_setup(char *str) | ||
| 1158 | { | ||
| 1159 | return nohibernate_setup(str); | ||
| 1160 | } | ||
| 1161 | |||
| 1162 | static int __init page_poison_nohibernate_setup(char *str) | 1192 | static int __init page_poison_nohibernate_setup(char *str) |
| 1163 | { | 1193 | { |
| 1164 | #ifdef CONFIG_PAGE_POISONING_ZERO | 1194 | #ifdef CONFIG_PAGE_POISONING_ZERO |
| @@ -1182,5 +1212,4 @@ __setup("hibernate=", hibernate_setup); | |||
| 1182 | __setup("resumewait", resumewait_setup); | 1212 | __setup("resumewait", resumewait_setup); |
| 1183 | __setup("resumedelay=", resumedelay_setup); | 1213 | __setup("resumedelay=", resumedelay_setup); |
| 1184 | __setup("nohibernate", nohibernate_setup); | 1214 | __setup("nohibernate", nohibernate_setup); |
| 1185 | __setup("kaslr", kaslr_nohibernate_setup); | ||
| 1186 | __setup("page_poison=", page_poison_nohibernate_setup); | 1215 | __setup("page_poison=", page_poison_nohibernate_setup); |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 27946975eff0..5ea50b1b7595 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb) | |||
| 38 | } | 38 | } |
| 39 | EXPORT_SYMBOL_GPL(unregister_pm_notifier); | 39 | EXPORT_SYMBOL_GPL(unregister_pm_notifier); |
| 40 | 40 | ||
| 41 | int pm_notifier_call_chain(unsigned long val) | 41 | int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls) |
| 42 | { | 42 | { |
| 43 | int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); | 43 | int ret; |
| 44 | |||
| 45 | ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL, | ||
| 46 | nr_to_call, nr_calls); | ||
| 44 | 47 | ||
| 45 | return notifier_to_errno(ret); | 48 | return notifier_to_errno(ret); |
| 46 | } | 49 | } |
| 50 | int pm_notifier_call_chain(unsigned long val) | ||
| 51 | { | ||
| 52 | return __pm_notifier_call_chain(val, -1, NULL); | ||
| 53 | } | ||
| 47 | 54 | ||
| 48 | /* If set, devices may be suspended and resumed asynchronously. */ | 55 | /* If set, devices may be suspended and resumed asynchronously. */ |
| 49 | int pm_async_enabled = 1; | 56 | int pm_async_enabled = 1; |
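The nr_to_call/nr_calls plumbing lets a failed *_PREPARE notification be unwound by delivering the matching POST event only to the callbacks that actually ran. A sketch of the calling convention, mirroring the error path in hibernate() shown earlier:

    int error, nr_calls = 0;

    error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
    if (error) {
            nr_calls--;     /* do not replay POST to the callback that failed */
            __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
            return error;
    }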
diff --git a/kernel/power/power.h b/kernel/power/power.h index efe1b3b17c88..242d8b827dd5 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -38,6 +38,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) | |||
| 38 | } | 38 | } |
| 39 | #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ | 39 | #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ |
| 40 | 40 | ||
| 41 | extern int hibernate_resume_nonboot_cpu_disable(void); | ||
| 42 | |||
| 41 | /* | 43 | /* |
| 42 | * Keep some memory free so that I/O operations can succeed without paging | 44 | * Keep some memory free so that I/O operations can succeed without paging |
| 43 | * [Might this be more than 4 MB?] | 45 | * [Might this be more than 4 MB?] |
| @@ -59,6 +61,13 @@ extern int hibernation_snapshot(int platform_mode); | |||
| 59 | extern int hibernation_restore(int platform_mode); | 61 | extern int hibernation_restore(int platform_mode); |
| 60 | extern int hibernation_platform_enter(void); | 62 | extern int hibernation_platform_enter(void); |
| 61 | 63 | ||
| 64 | #ifdef CONFIG_DEBUG_RODATA | ||
| 65 | /* kernel/power/snapshot.c */ | ||
| 66 | extern void enable_restore_image_protection(void); | ||
| 67 | #else | ||
| 68 | static inline void enable_restore_image_protection(void) {} | ||
| 69 | #endif /* CONFIG_DEBUG_RODATA */ | ||
| 70 | |||
| 62 | #else /* !CONFIG_HIBERNATION */ | 71 | #else /* !CONFIG_HIBERNATION */ |
| 63 | 72 | ||
| 64 | static inline void hibernate_reserved_size_init(void) {} | 73 | static inline void hibernate_reserved_size_init(void) {} |
| @@ -200,6 +209,8 @@ static inline void suspend_test_finish(const char *label) {} | |||
| 200 | 209 | ||
| 201 | #ifdef CONFIG_PM_SLEEP | 210 | #ifdef CONFIG_PM_SLEEP |
| 202 | /* kernel/power/main.c */ | 211 | /* kernel/power/main.c */ |
| 212 | extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call, | ||
| 213 | int *nr_calls); | ||
| 203 | extern int pm_notifier_call_chain(unsigned long val); | 214 | extern int pm_notifier_call_chain(unsigned long val); |
| 204 | #endif | 215 | #endif |
| 205 | 216 | ||
diff --git a/kernel/power/process.c b/kernel/power/process.c index 0c2ee9761d57..8f27d5a8adf6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -89,6 +89,9 @@ static int try_to_freeze_tasks(bool user_only) | |||
| 89 | elapsed_msecs / 1000, elapsed_msecs % 1000, | 89 | elapsed_msecs / 1000, elapsed_msecs % 1000, |
| 90 | todo - wq_busy, wq_busy); | 90 | todo - wq_busy, wq_busy); |
| 91 | 91 | ||
| 92 | if (wq_busy) | ||
| 93 | show_workqueue_state(); | ||
| 94 | |||
| 92 | if (!wakeup) { | 95 | if (!wakeup) { |
| 93 | read_lock(&tasklist_lock); | 96 | read_lock(&tasklist_lock); |
| 94 | for_each_process_thread(g, p) { | 97 | for_each_process_thread(g, p) { |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3a970604308f..9a0178c2ac1d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -38,6 +38,43 @@ | |||
| 38 | 38 | ||
| 39 | #include "power.h" | 39 | #include "power.h" |
| 40 | 40 | ||
| 41 | #ifdef CONFIG_DEBUG_RODATA | ||
| 42 | static bool hibernate_restore_protection; | ||
| 43 | static bool hibernate_restore_protection_active; | ||
| 44 | |||
| 45 | void enable_restore_image_protection(void) | ||
| 46 | { | ||
| 47 | hibernate_restore_protection = true; | ||
| 48 | } | ||
| 49 | |||
| 50 | static inline void hibernate_restore_protection_begin(void) | ||
| 51 | { | ||
| 52 | hibernate_restore_protection_active = hibernate_restore_protection; | ||
| 53 | } | ||
| 54 | |||
| 55 | static inline void hibernate_restore_protection_end(void) | ||
| 56 | { | ||
| 57 | hibernate_restore_protection_active = false; | ||
| 58 | } | ||
| 59 | |||
| 60 | static inline void hibernate_restore_protect_page(void *page_address) | ||
| 61 | { | ||
| 62 | if (hibernate_restore_protection_active) | ||
| 63 | set_memory_ro((unsigned long)page_address, 1); | ||
| 64 | } | ||
| 65 | |||
| 66 | static inline void hibernate_restore_unprotect_page(void *page_address) | ||
| 67 | { | ||
| 68 | if (hibernate_restore_protection_active) | ||
| 69 | set_memory_rw((unsigned long)page_address, 1); | ||
| 70 | } | ||
| 71 | #else | ||
| 72 | static inline void hibernate_restore_protection_begin(void) {} | ||
| 73 | static inline void hibernate_restore_protection_end(void) {} | ||
| 74 | static inline void hibernate_restore_protect_page(void *page_address) {} | ||
| 75 | static inline void hibernate_restore_unprotect_page(void *page_address) {} | ||
| 76 | #endif /* CONFIG_DEBUG_RODATA */ | ||
| 77 | |||
| 41 | static int swsusp_page_is_free(struct page *); | 78 | static int swsusp_page_is_free(struct page *); |
| 42 | static void swsusp_set_page_forbidden(struct page *); | 79 | static void swsusp_set_page_forbidden(struct page *); |
| 43 | static void swsusp_unset_page_forbidden(struct page *); | 80 | static void swsusp_unset_page_forbidden(struct page *); |
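The helpers above are meant to bracket writes into page frames that have already received their final image contents, so a restored page can stay read-only until the image kernel takes over. A simplified, hypothetical sketch of the intended use; the real call sites appear later in this file and are not part of this hunk, and dst/src are illustrative page addresses:

    hibernate_restore_protection_begin();

    copy_page(dst, src);                    /* a freshly restored page ... */
    hibernate_restore_protect_page(dst);    /* ... becomes read-only from here on */

    /* a page that must be written again is unprotected first */
    hibernate_restore_unprotect_page(dst);

    hibernate_restore_protection_end();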
| @@ -67,25 +104,32 @@ void __init hibernate_image_size_init(void) | |||
| 67 | image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; | 104 | image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; |
| 68 | } | 105 | } |
| 69 | 106 | ||
| 70 | /* List of PBEs needed for restoring the pages that were allocated before | 107 | /* |
| 108 | * List of PBEs needed for restoring the pages that were allocated before | ||
| 71 | * the suspend and included in the suspend image, but have also been | 109 | * the suspend and included in the suspend image, but have also been |
| 72 | * allocated by the "resume" kernel, so their contents cannot be written | 110 | * allocated by the "resume" kernel, so their contents cannot be written |
| 73 | * directly to their "original" page frames. | 111 | * directly to their "original" page frames. |
| 74 | */ | 112 | */ |
| 75 | struct pbe *restore_pblist; | 113 | struct pbe *restore_pblist; |
| 76 | 114 | ||
| 77 | /* Pointer to an auxiliary buffer (1 page) */ | 115 | /* struct linked_page is used to build chains of pages */ |
| 78 | static void *buffer; | ||
| 79 | 116 | ||
| 80 | /** | 117 | #define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) |
| 81 | * @safe_needed - on resume, for storing the PBE list and the image, | 118 | |
| 82 | * we can only use memory pages that do not conflict with the pages | 119 | struct linked_page { |
| 83 | * used before suspend. The unsafe pages have PageNosaveFree set | 120 | struct linked_page *next; |
| 84 | * and we count them using unsafe_pages. | 121 | char data[LINKED_PAGE_DATA_SIZE]; |
| 85 | * | 122 | } __packed; |
| 86 | * Each allocated image page is marked as PageNosave and PageNosaveFree | 123 | |
| 87 | * so that swsusp_free() can release it. | 124 | /* |
| 125 | * List of "safe" pages (ie. pages that were not used by the image kernel | ||
| 126 | * before hibernation) that may be used as temporary storage for image kernel | ||
| 127 | * memory contents. | ||
| 88 | */ | 128 | */ |
| 129 | static struct linked_page *safe_pages_list; | ||
| 130 | |||
| 131 | /* Pointer to an auxiliary buffer (1 page) */ | ||
| 132 | static void *buffer; | ||
| 89 | 133 | ||
| 90 | #define PG_ANY 0 | 134 | #define PG_ANY 0 |
| 91 | #define PG_SAFE 1 | 135 | #define PG_SAFE 1 |
| @@ -94,6 +138,19 @@ static void *buffer; | |||
| 94 | 138 | ||
| 95 | static unsigned int allocated_unsafe_pages; | 139 | static unsigned int allocated_unsafe_pages; |
| 96 | 140 | ||
| 141 | /** | ||
| 142 | * get_image_page - Allocate a page for a hibernation image. | ||
| 143 | * @gfp_mask: GFP mask for the allocation. | ||
| 144 | * @safe_needed: Get pages that were not used before hibernation (restore only) | ||
| 145 | * | ||
| 146 | * During image restoration, for storing the PBE list and the image data, we can | ||
| 147 | * only use memory pages that do not conflict with the pages used before | ||
| 148 | * hibernation. The "unsafe" pages have PageNosaveFree set and we count them | ||
| 149 | * using allocated_unsafe_pages. | ||
| 150 | * | ||
| 151 | * Each allocated image page is marked as PageNosave and PageNosaveFree so that | ||
| 152 | * swsusp_free() can release it. | ||
| 153 | */ | ||
| 97 | static void *get_image_page(gfp_t gfp_mask, int safe_needed) | 154 | static void *get_image_page(gfp_t gfp_mask, int safe_needed) |
| 98 | { | 155 | { |
| 99 | void *res; | 156 | void *res; |
| @@ -113,9 +170,21 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed) | |||
| 113 | return res; | 170 | return res; |
| 114 | } | 171 | } |
| 115 | 172 | ||
| 173 | static void *__get_safe_page(gfp_t gfp_mask) | ||
| 174 | { | ||
| 175 | if (safe_pages_list) { | ||
| 176 | void *ret = safe_pages_list; | ||
| 177 | |||
| 178 | safe_pages_list = safe_pages_list->next; | ||
| 179 | memset(ret, 0, PAGE_SIZE); | ||
| 180 | return ret; | ||
| 181 | } | ||
| 182 | return get_image_page(gfp_mask, PG_SAFE); | ||
| 183 | } | ||
| 184 | |||
| 116 | unsigned long get_safe_page(gfp_t gfp_mask) | 185 | unsigned long get_safe_page(gfp_t gfp_mask) |
| 117 | { | 186 | { |
| 118 | return (unsigned long)get_image_page(gfp_mask, PG_SAFE); | 187 | return (unsigned long)__get_safe_page(gfp_mask); |
| 119 | } | 188 | } |
| 120 | 189 | ||
| 121 | static struct page *alloc_image_page(gfp_t gfp_mask) | 190 | static struct page *alloc_image_page(gfp_t gfp_mask) |
| @@ -130,11 +199,22 @@ static struct page *alloc_image_page(gfp_t gfp_mask) | |||
| 130 | return page; | 199 | return page; |
| 131 | } | 200 | } |
| 132 | 201 | ||
| 202 | static void recycle_safe_page(void *page_address) | ||
| 203 | { | ||
| 204 | struct linked_page *lp = page_address; | ||
| 205 | |||
| 206 | lp->next = safe_pages_list; | ||
| 207 | safe_pages_list = lp; | ||
| 208 | } | ||
| 209 | |||
| 133 | /** | 210 | /** |
| 134 | * free_image_page - free page represented by @addr, allocated with | 211 | * free_image_page - Free a page allocated for hibernation image. |
| 135 | * get_image_page (page flags set by it must be cleared) | 212 | * @addr: Address of the page to free. |
| 213 | * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page. | ||
| 214 | * | ||
| 215 | * The page to free should have been allocated by get_image_page() (page flags | ||
| 216 | * set by it are affected). | ||
| 136 | */ | 217 | */ |
| 137 | |||
| 138 | static inline void free_image_page(void *addr, int clear_nosave_free) | 218 | static inline void free_image_page(void *addr, int clear_nosave_free) |
| 139 | { | 219 | { |
| 140 | struct page *page; | 220 | struct page *page; |
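recycle_safe_page() above is the push side of a simple LIFO threaded through the pages themselves; __get_safe_page() earlier in the file is the matching pop, which zeroes the page before reuse. Reduced to its core, with hypothetical helper names:

    static void push_page(struct linked_page **list, void *page_address)
    {
            struct linked_page *lp = page_address;

            lp->next = *list;               /* the page's own first word stores the link */
            *list = lp;
    }

    static void *pop_page(struct linked_page **list)
    {
            struct linked_page *lp = *list;

            if (!lp)
                    return NULL;
            *list = lp->next;
            memset(lp, 0, PAGE_SIZE);       /* hand out a clean page */
            return lp;
    }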
| @@ -150,17 +230,8 @@ static inline void free_image_page(void *addr, int clear_nosave_free) | |||
| 150 | __free_page(page); | 230 | __free_page(page); |
| 151 | } | 231 | } |
| 152 | 232 | ||
| 153 | /* struct linked_page is used to build chains of pages */ | 233 | static inline void free_list_of_pages(struct linked_page *list, |
| 154 | 234 | int clear_page_nosave) | |
| 155 | #define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) | ||
| 156 | |||
| 157 | struct linked_page { | ||
| 158 | struct linked_page *next; | ||
| 159 | char data[LINKED_PAGE_DATA_SIZE]; | ||
| 160 | } __packed; | ||
| 161 | |||
| 162 | static inline void | ||
| 163 | free_list_of_pages(struct linked_page *list, int clear_page_nosave) | ||
| 164 | { | 235 | { |
| 165 | while (list) { | 236 | while (list) { |
| 166 | struct linked_page *lp = list->next; | 237 | struct linked_page *lp = list->next; |
| @@ -170,30 +241,28 @@ free_list_of_pages(struct linked_page *list, int clear_page_nosave) | |||
| 170 | } | 241 | } |
| 171 | } | 242 | } |
| 172 | 243 | ||
| 173 | /** | 244 | /* |
| 174 | * struct chain_allocator is used for allocating small objects out of | 245 | * struct chain_allocator is used for allocating small objects out of |
| 175 | * a linked list of pages called 'the chain'. | 246 | * a linked list of pages called 'the chain'. |
| 176 | * | 247 | * |
| 177 | * The chain grows each time when there is no room for a new object in | 248 | * The chain grows each time when there is no room for a new object in |
| 178 | * the current page. The allocated objects cannot be freed individually. | 249 | * the current page. The allocated objects cannot be freed individually. |
| 179 | * It is only possible to free them all at once, by freeing the entire | 250 | * It is only possible to free them all at once, by freeing the entire |
| 180 | * chain. | 251 | * chain. |
| 181 | * | 252 | * |
| 182 | * NOTE: The chain allocator may be inefficient if the allocated objects | 253 | * NOTE: The chain allocator may be inefficient if the allocated objects |
| 183 | * are not much smaller than PAGE_SIZE. | 254 | * are not much smaller than PAGE_SIZE. |
| 184 | */ | 255 | */ |
| 185 | |||
| 186 | struct chain_allocator { | 256 | struct chain_allocator { |
| 187 | struct linked_page *chain; /* the chain */ | 257 | struct linked_page *chain; /* the chain */ |
| 188 | unsigned int used_space; /* total size of objects allocated out | 258 | unsigned int used_space; /* total size of objects allocated out |
| 189 | * of the current page | 259 | of the current page */ |
| 190 | */ | ||
| 191 | gfp_t gfp_mask; /* mask for allocating pages */ | 260 | gfp_t gfp_mask; /* mask for allocating pages */ |
| 192 | int safe_needed; /* if set, only "safe" pages are allocated */ | 261 | int safe_needed; /* if set, only "safe" pages are allocated */ |
| 193 | }; | 262 | }; |
| 194 | 263 | ||
| 195 | static void | 264 | static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask, |
| 196 | chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed) | 265 | int safe_needed) |
| 197 | { | 266 | { |
| 198 | ca->chain = NULL; | 267 | ca->chain = NULL; |
| 199 | ca->used_space = LINKED_PAGE_DATA_SIZE; | 268 | ca->used_space = LINKED_PAGE_DATA_SIZE; |
| @@ -208,7 +277,8 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) | |||
| 208 | if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { | 277 | if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { |
| 209 | struct linked_page *lp; | 278 | struct linked_page *lp; |
| 210 | 279 | ||
| 211 | lp = get_image_page(ca->gfp_mask, ca->safe_needed); | 280 | lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) : |
| 281 | get_image_page(ca->gfp_mask, PG_ANY); | ||
| 212 | if (!lp) | 282 | if (!lp) |
| 213 | return NULL; | 283 | return NULL; |
| 214 | 284 | ||
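As the struct chain_allocator comment above explains, the allocator carves small objects out of a chain of pages and never frees them individually. A hedged usage sketch; the rtree_node object, the error handling and the literal second argument to free_list_of_pages() are illustrative only:

    struct chain_allocator ca;
    struct rtree_node *node;

    chain_init(&ca, GFP_KERNEL, PG_ANY);    /* ordinary pages, no "safe" constraint */

    node = chain_alloc(&ca, sizeof(*node)); /* carved from the current page; a new
                                               page is chained in when it fills up */
    if (!node)
            return -ENOMEM;

    /* objects cannot be freed one by one; the whole chain is released at once */
    free_list_of_pages(ca.chain, 1);        /* 1: also clear the PageNosaveFree bits */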
| @@ -222,44 +292,44 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) | |||
| 222 | } | 292 | } |
| 223 | 293 | ||
| 224 | /** | 294 | /** |
| 225 | * Data types related to memory bitmaps. | 295 | * Data types related to memory bitmaps. |
| 226 | * | 296 | * |
| 227 | * Memory bitmap is a structure consisting of many linked lists of | 297 | * Memory bitmap is a structure consisting of many linked lists of |
| 228 | * objects. The main list's elements are of type struct zone_bitmap | 298 | * objects. The main list's elements are of type struct zone_bitmap |
| 229 | * and each of them corresponds to one zone. For each zone bitmap | 299 | * and each of them corresponds to one zone. For each zone bitmap |
| 230 | * object there is a list of objects of type struct bm_block that | 300 | * object there is a list of objects of type struct bm_block that |
| 231 | * represent blocks of the bitmap in which information is stored. | 301 | * represent blocks of the bitmap in which information is stored. |
| 232 | * | 302 | * |
| 233 | * struct memory_bitmap contains a pointer to the main list of zone | 303 | * struct memory_bitmap contains a pointer to the main list of zone |
| 234 | * bitmap objects, a struct bm_position used for browsing the bitmap, | 304 | * bitmap objects, a struct bm_position used for browsing the bitmap, |
| 235 | * and a pointer to the list of pages used for allocating all of the | 305 | * and a pointer to the list of pages used for allocating all of the |
| 236 | * zone bitmap objects and bitmap block objects. | 306 | * zone bitmap objects and bitmap block objects. |
| 237 | * | 307 | * |
| 238 | * NOTE: It has to be possible to lay out the bitmap in memory | 308 | * NOTE: It has to be possible to lay out the bitmap in memory |
| 239 | * using only allocations of order 0. Additionally, the bitmap is | 309 | * using only allocations of order 0. Additionally, the bitmap is |
| 240 | * designed to work with an arbitrary number of zones (this is over the | 310 | * designed to work with an arbitrary number of zones (this is over the |
| 241 | * top for now, but let's avoid making unnecessary assumptions ;-). | 311 | * top for now, but let's avoid making unnecessary assumptions ;-). |
| 242 | * | 312 | * |
| 243 | * struct zone_bitmap contains a pointer to a list of bitmap block | 313 | * struct zone_bitmap contains a pointer to a list of bitmap block |
| 244 | * objects and a pointer to the bitmap block object that has been | 314 | * objects and a pointer to the bitmap block object that has been |
| 245 | * most recently used for setting bits. Additionally, it contains the | 315 | * most recently used for setting bits. Additionally, it contains the |
| 246 | * pfns that correspond to the start and end of the represented zone. | 316 | * PFNs that correspond to the start and end of the represented zone. |
| 247 | * | 317 | * |
| 248 | * struct bm_block contains a pointer to the memory page in which | 318 | * struct bm_block contains a pointer to the memory page in which |
| 249 | * information is stored (in the form of a block of bitmap) | 319 | * information is stored (in the form of a block of bitmap) |
| 250 | * It also contains the PFNs that correspond to the start and end of | 320 | * It also contains the PFNs that correspond to the start and end of |
| 251 | * the represented memory area. | 321 | * the represented memory area. |
| 252 | * | 322 | * |
| 253 | * The memory bitmap is organized as a radix tree to guarantee fast random | 323 | * The memory bitmap is organized as a radix tree to guarantee fast random |
| 254 | * access to the bits. There is one radix tree for each zone (as returned | 324 | * access to the bits. There is one radix tree for each zone (as returned |
| 255 | * from create_mem_extents). | 325 | * from create_mem_extents). |
| 256 | * | 326 | * |
| 257 | * One radix tree is represented by one struct mem_zone_bm_rtree. There are | 327 | * One radix tree is represented by one struct mem_zone_bm_rtree. There are |
| 258 | * two linked lists for the nodes of the tree, one for the inner nodes and | 328 | * two linked lists for the nodes of the tree, one for the inner nodes and |
| 259 | * one for the leaf nodes. The linked leaf nodes are used for fast linear | 329 | * one for the leaf nodes. The linked leaf nodes are used for fast linear |
| 260 | * access to the memory bitmap. | 330 | * access to the memory bitmap. |
| 261 | * | 331 | * |
| 262 | * The struct rtree_node represents one node of the radix tree. | 332 | * The struct rtree_node represents one node of the radix tree. |
| 263 | */ | 333 | */ |
| 264 | 334 | ||
| 265 | #define BM_END_OF_MAP (~0UL) | 335 | #define BM_END_OF_MAP (~0UL) |
| @@ -305,9 +375,8 @@ struct bm_position { | |||
| 305 | struct memory_bitmap { | 375 | struct memory_bitmap { |
| 306 | struct list_head zones; | 376 | struct list_head zones; |
| 307 | struct linked_page *p_list; /* list of pages used to store zone | 377 | struct linked_page *p_list; /* list of pages used to store zone |
| 308 | * bitmap objects and bitmap block | 378 | bitmap objects and bitmap block |
| 309 | * objects | 379 | objects */ |
| 310 | */ | ||
| 311 | struct bm_position cur; /* most recently used bit position */ | 380 | struct bm_position cur; /* most recently used bit position */ |
| 312 | }; | 381 | }; |
| 313 | 382 | ||
| @@ -321,12 +390,12 @@ struct memory_bitmap { | |||
| 321 | #endif | 390 | #endif |
| 322 | #define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) | 391 | #define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) |
| 323 | 392 | ||
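As a rough illustration of the indexing scheme described above (the constants below are stand-ins chosen for a 64-bit kernel with 4 KiB pages; the real values are the BM_BLOCK_* and BM_RTREE_LEVEL_* definitions in this file): the low bits of the zone-relative PFN select a bit inside a leaf page, and each group of BM_RTREE_LEVEL_SHIFT bits above that selects a child at one level of the radix tree.

#define MODEL_BLOCK_SHIFT   15                               /* assumed: PAGE_SHIFT == 12, 8 bits per byte */
#define MODEL_BLOCK_MASK    ((1UL << MODEL_BLOCK_SHIFT) - 1)
#define MODEL_LEVEL_SHIFT   9                                /* assumed: 512 pointers per inner node */
#define MODEL_LEVEL_MASK    ((1UL << MODEL_LEVEL_SHIFT) - 1)

/* Child index to follow at inner level 'level' (0 is the level directly above the leaves). */
static unsigned long model_rtree_index(unsigned long pfn, unsigned long zone_start_pfn,
                                       unsigned int level)
{
        unsigned long block_nr = (pfn - zone_start_pfn) >> MODEL_BLOCK_SHIFT;

        return (block_nr >> (level * MODEL_LEVEL_SHIFT)) & MODEL_LEVEL_MASK;
}

/* Bit number of 'pfn' inside its leaf page. */
static unsigned int model_rtree_bit(unsigned long pfn, unsigned long zone_start_pfn)
{
        return (pfn - zone_start_pfn) & MODEL_BLOCK_MASK;
}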
| 324 | /* | 393 | /** |
| 325 | * alloc_rtree_node - Allocate a new node and add it to the radix tree. | 394 | * alloc_rtree_node - Allocate a new node and add it to the radix tree. |
| 326 | * | 395 | * |
| 327 | * This function is used to allocate inner nodes as well as the | 396 | * This function is used to allocate inner nodes as well as the |
| 328 | * leaf nodes of the radix tree. It also adds the node to the | 397 | * leaf nodes of the radix tree. It also adds the node to the |
| 329 | * corresponding linked list passed in by the *list parameter. | 398 | * corresponding linked list passed in by the *list parameter. |
| 330 | */ | 399 | */ |
| 331 | static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, | 400 | static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, |
| 332 | struct chain_allocator *ca, | 401 | struct chain_allocator *ca, |
| @@ -347,12 +416,12 @@ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, | |||
| 347 | return node; | 416 | return node; |
| 348 | } | 417 | } |
| 349 | 418 | ||
| 350 | /* | 419 | /** |
| 351 | * add_rtree_block - Add a new leave node to the radix tree | 420 | * add_rtree_block - Add a new leaf node to the radix tree. |
| 352 | * | 421 | * |
| 353 | * The leaf nodes need to be allocated in order to keep the linked | 422 | * The leaf nodes need to be allocated in order to keep the linked |
| 354 | * list of leaves in order. This is guaranteed by the zone->blocks | 423 | * list of leaves in order. This is guaranteed by the zone->blocks |
| 355 | * counter. | 424 | * counter. |
| 356 | */ | 425 | */ |
| 357 | static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, | 426 | static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, |
| 358 | int safe_needed, struct chain_allocator *ca) | 427 | int safe_needed, struct chain_allocator *ca) |
| @@ -417,17 +486,18 @@ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, | |||
| 417 | static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, | 486 | static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, |
| 418 | int clear_nosave_free); | 487 | int clear_nosave_free); |
| 419 | 488 | ||
| 420 | /* | 489 | /** |
| 421 | * create_zone_bm_rtree - create a radix tree for one zone | 490 | * create_zone_bm_rtree - Create a radix tree for one zone. |
| 422 | * | 491 | * |
| 423 | * Allocates the mem_zone_bm_rtree structure and initializes it. | 492 | * Allocates the mem_zone_bm_rtree structure and initializes it. |
| 424 | * This function also allocates and builds the radix tree for the | 493 | * This function also allocates and builds the radix tree for the |
| 425 | * zone. | 494 | * zone. |
| 426 | */ | 495 | */ |
| 427 | static struct mem_zone_bm_rtree * | 496 | static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask, |
| 428 | create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, | 497 | int safe_needed, |
| 429 | struct chain_allocator *ca, | 498 | struct chain_allocator *ca, |
| 430 | unsigned long start, unsigned long end) | 499 | unsigned long start, |
| 500 | unsigned long end) | ||
| 431 | { | 501 | { |
| 432 | struct mem_zone_bm_rtree *zone; | 502 | struct mem_zone_bm_rtree *zone; |
| 433 | unsigned int i, nr_blocks; | 503 | unsigned int i, nr_blocks; |
| @@ -454,12 +524,12 @@ create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, | |||
| 454 | return zone; | 524 | return zone; |
| 455 | } | 525 | } |
| 456 | 526 | ||
| 457 | /* | 527 | /** |
| 458 | * free_zone_bm_rtree - Free the memory of the radix tree | 528 | * free_zone_bm_rtree - Free the memory of the radix tree. |
| 459 | * | 529 | * |
| 460 | * Free all node pages of the radix tree. The mem_zone_bm_rtree | 530 | * Free all node pages of the radix tree. The mem_zone_bm_rtree |
| 461 | * structure itself is not freed here nor are the rtree_node | 531 | * structure itself is not freed here nor are the rtree_node |
| 462 | * structs. | 532 | * structs. |
| 463 | */ | 533 | */ |
| 464 | static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, | 534 | static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, |
| 465 | int clear_nosave_free) | 535 | int clear_nosave_free) |
| @@ -492,8 +562,8 @@ struct mem_extent { | |||
| 492 | }; | 562 | }; |
| 493 | 563 | ||
| 494 | /** | 564 | /** |
| 495 | * free_mem_extents - free a list of memory extents | 565 | * free_mem_extents - Free a list of memory extents. |
| 496 | * @list - list of extents to empty | 566 | * @list: List of extents to free. |
| 497 | */ | 567 | */ |
| 498 | static void free_mem_extents(struct list_head *list) | 568 | static void free_mem_extents(struct list_head *list) |
| 499 | { | 569 | { |
| @@ -506,10 +576,11 @@ static void free_mem_extents(struct list_head *list) | |||
| 506 | } | 576 | } |
| 507 | 577 | ||
| 508 | /** | 578 | /** |
| 509 | * create_mem_extents - create a list of memory extents representing | 579 | * create_mem_extents - Create a list of memory extents. |
| 510 | * contiguous ranges of PFNs | 580 | * @list: List to put the extents into. |
| 511 | * @list - list to put the extents into | 581 | * @gfp_mask: Mask to use for memory allocations. |
| 512 | * @gfp_mask - mask to use for memory allocations | 582 | * |
| 583 | * The extents represent contiguous ranges of PFNs. | ||
| 513 | */ | 584 | */ |
| 514 | static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) | 585 | static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) |
| 515 | { | 586 | { |
| @@ -565,10 +636,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) | |||
| 565 | } | 636 | } |
| 566 | 637 | ||
| 567 | /** | 638 | /** |
| 568 | * memory_bm_create - allocate memory for a memory bitmap | 639 | * memory_bm_create - Allocate memory for a memory bitmap. |
| 569 | */ | 640 | */ |
| 570 | static int | 641 | static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, |
| 571 | memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | 642 | int safe_needed) |
| 572 | { | 643 | { |
| 573 | struct chain_allocator ca; | 644 | struct chain_allocator ca; |
| 574 | struct list_head mem_extents; | 645 | struct list_head mem_extents; |
| @@ -607,8 +678,9 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | |||
| 607 | } | 678 | } |
| 608 | 679 | ||
| 609 | /** | 680 | /** |
| 610 | * memory_bm_free - free memory occupied by the memory bitmap @bm | 681 | * memory_bm_free - Free memory occupied by the memory bitmap. |
| 611 | */ | 682 | * @bm: Memory bitmap. |
| 683 | */ | ||
| 612 | static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) | 684 | static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) |
| 613 | { | 685 | { |
| 614 | struct mem_zone_bm_rtree *zone; | 686 | struct mem_zone_bm_rtree *zone; |
| @@ -622,14 +694,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) | |||
| 622 | } | 694 | } |
| 623 | 695 | ||
| 624 | /** | 696 | /** |
| 625 | * memory_bm_find_bit - Find the bit for pfn in the memory | 697 | * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap. |
| 626 | * bitmap | ||
| 627 | * | 698 | * |
| 628 | * Find the bit in the bitmap @bm that corresponds to given pfn. | 699 | * Find the bit in memory bitmap @bm that corresponds to the given PFN. |
| 629 | * The cur.zone, cur.block and cur.node_pfn member of @bm are | 700 | * The cur.zone, cur.block and cur.node_pfn members of @bm are updated. |
| 630 | * updated. | 701 | * |
| 631 | * It walks the radix tree to find the page which contains the bit for | 702 | * Walk the radix tree to find the page containing the bit that represents @pfn |
| 632 | * pfn and returns the bit position in **addr and *bit_nr. | 703 | * and return the position of the bit in @addr and @bit_nr. |
| 633 | */ | 704 | */ |
| 634 | static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, | 705 | static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, |
| 635 | void **addr, unsigned int *bit_nr) | 706 | void **addr, unsigned int *bit_nr) |
| @@ -658,10 +729,9 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, | |||
| 658 | 729 | ||
| 659 | zone_found: | 730 | zone_found: |
| 660 | /* | 731 | /* |
| 661 | * We have a zone. Now walk the radix tree to find the leave | 732 | * We have found the zone. Now walk the radix tree to find the leaf node |
| 662 | * node for our pfn. | 733 | * for our PFN. |
| 663 | */ | 734 | */ |
| 664 | |||
| 665 | node = bm->cur.node; | 735 | node = bm->cur.node; |
| 666 | if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) | 736 | if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) |
| 667 | goto node_found; | 737 | goto node_found; |
| @@ -754,14 +824,14 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) | |||
| 754 | } | 824 | } |
| 755 | 825 | ||
| 756 | /* | 826 | /* |
| 757 | * rtree_next_node - Jumps to the next leave node | 827 | * rtree_next_node - Jump to the next leaf node. |
| 758 | * | 828 | * |
| 759 | * Sets the position to the beginning of the next node in the | 829 | * Set the position to the beginning of the next node in the |
| 760 | * memory bitmap. This is either the next node in the current | 830 | * memory bitmap. This is either the next node in the current |
| 761 | * zone's radix tree or the first node in the radix tree of the | 831 | * zone's radix tree or the first node in the radix tree of the |
| 762 | * next zone. | 832 | * next zone. |
| 763 | * | 833 | * |
| 764 | * Returns true if there is a next node, false otherwise. | 834 | * Return true if there is a next node, false otherwise. |
| 765 | */ | 835 | */ |
| 766 | static bool rtree_next_node(struct memory_bitmap *bm) | 836 | static bool rtree_next_node(struct memory_bitmap *bm) |
| 767 | { | 837 | { |
| @@ -790,14 +860,15 @@ static bool rtree_next_node(struct memory_bitmap *bm) | |||
| 790 | } | 860 | } |
| 791 | 861 | ||
| 792 | /** | 862 | /** |
| 793 | * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm | 863 | * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap. |
| 864 | * @bm: Memory bitmap. | ||
| 794 | * | 865 | * |
| 795 | * Starting from the last returned position this function searches | 866 | * Starting from the last returned position this function searches for the next |
| 796 | * for the next set bit in the memory bitmap and returns its | 867 | * set bit in @bm and returns the PFN represented by it. If no more bits are |
| 797 | * number. If no more bit is set BM_END_OF_MAP is returned. | 868 | * set, BM_END_OF_MAP is returned. |
| 798 | * | 869 | * |
| 799 | * It is required to run memory_bm_position_reset() before the | 870 | * It is required to run memory_bm_position_reset() before the first call to |
| 800 | * first call to this function. | 871 | * this function for the given memory bitmap. |
| 801 | */ | 872 | */ |
| 802 | static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) | 873 | static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) |
| 803 | { | 874 | { |
| @@ -819,11 +890,10 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) | |||
| 819 | return BM_END_OF_MAP; | 890 | return BM_END_OF_MAP; |
| 820 | } | 891 | } |
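The intended usage pattern for the two functions above is the one followed elsewhere in this patch (duplicate_memory_bitmap() and mark_unsafe_pages() below, for example): reset the position once, then keep asking for the next set bit until BM_END_OF_MAP comes back.

        memory_bm_position_reset(bm);
        for (pfn = memory_bm_next_pfn(bm); pfn != BM_END_OF_MAP;
             pfn = memory_bm_next_pfn(bm)) {
                /* pfn is the next page frame whose bit is set in bm */
        }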
| 821 | 892 | ||
| 822 | /** | 893 | /* |
| 823 | * This structure represents a range of page frames the contents of which | 894 | * This structure represents a range of page frames the contents of which |
| 824 | * should not be saved during the suspend. | 895 | * should not be saved during hibernation. |
| 825 | */ | 896 | */ |
| 826 | |||
| 827 | struct nosave_region { | 897 | struct nosave_region { |
| 828 | struct list_head list; | 898 | struct list_head list; |
| 829 | unsigned long start_pfn; | 899 | unsigned long start_pfn; |
| @@ -832,15 +902,42 @@ struct nosave_region { | |||
| 832 | 902 | ||
| 833 | static LIST_HEAD(nosave_regions); | 903 | static LIST_HEAD(nosave_regions); |
| 834 | 904 | ||
| 905 | static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone) | ||
| 906 | { | ||
| 907 | struct rtree_node *node; | ||
| 908 | |||
| 909 | list_for_each_entry(node, &zone->nodes, list) | ||
| 910 | recycle_safe_page(node->data); | ||
| 911 | |||
| 912 | list_for_each_entry(node, &zone->leaves, list) | ||
| 913 | recycle_safe_page(node->data); | ||
| 914 | } | ||
| 915 | |||
| 916 | static void memory_bm_recycle(struct memory_bitmap *bm) | ||
| 917 | { | ||
| 918 | struct mem_zone_bm_rtree *zone; | ||
| 919 | struct linked_page *p_list; | ||
| 920 | |||
| 921 | list_for_each_entry(zone, &bm->zones, list) | ||
| 922 | recycle_zone_bm_rtree(zone); | ||
| 923 | |||
| 924 | p_list = bm->p_list; | ||
| 925 | while (p_list) { | ||
| 926 | struct linked_page *lp = p_list; | ||
| 927 | |||
| 928 | p_list = lp->next; | ||
| 929 | recycle_safe_page(lp); | ||
| 930 | } | ||
| 931 | } | ||
| 932 | |||
| 835 | /** | 933 | /** |
| 836 | * register_nosave_region - register a range of page frames the contents | 934 | * register_nosave_region - Register a region of unsaveable memory. |
| 837 | * of which should not be saved during the suspend (to be used in the early | 935 | * |
| 838 | * initialization code) | 936 | * Register a range of page frames the contents of which should not be saved |
| 937 | * during hibernation (to be used in the early initialization code). | ||
| 839 | */ | 938 | */ |
| 840 | 939 | void __init __register_nosave_region(unsigned long start_pfn, | |
| 841 | void __init | 940 | unsigned long end_pfn, int use_kmalloc) |
| 842 | __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, | ||
| 843 | int use_kmalloc) | ||
| 844 | { | 941 | { |
| 845 | struct nosave_region *region; | 942 | struct nosave_region *region; |
| 846 | 943 | ||
| @@ -857,12 +954,13 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, | |||
| 857 | } | 954 | } |
| 858 | } | 955 | } |
| 859 | if (use_kmalloc) { | 956 | if (use_kmalloc) { |
| 860 | /* during init, this shouldn't fail */ | 957 | /* During init, this shouldn't fail */ |
| 861 | region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); | 958 | region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); |
| 862 | BUG_ON(!region); | 959 | BUG_ON(!region); |
| 863 | } else | 960 | } else { |
| 864 | /* This allocation cannot fail */ | 961 | /* This allocation cannot fail */ |
| 865 | region = memblock_virt_alloc(sizeof(struct nosave_region), 0); | 962 | region = memblock_virt_alloc(sizeof(struct nosave_region), 0); |
| 963 | } | ||
| 866 | region->start_pfn = start_pfn; | 964 | region->start_pfn = start_pfn; |
| 867 | region->end_pfn = end_pfn; | 965 | region->end_pfn = end_pfn; |
| 868 | list_add_tail(&region->list, &nosave_regions); | 966 | list_add_tail(&region->list, &nosave_regions); |
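For context, a hedged example of how early platform setup code typically ends up here; the address range is made up, and the call is assumed to go through the register_nosave_region() wrapper declared in include/linux/suspend.h, which passes use_kmalloc == 0 for early boot:

        /* Hypothetical firmware-owned range that must not be saved or restored. */
        unsigned long fw_start = 0x000a0000, fw_end = 0x00100000;

        register_nosave_region(PFN_DOWN(fw_start), PFN_UP(fw_end));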
| @@ -923,10 +1021,12 @@ static void swsusp_unset_page_forbidden(struct page *page) | |||
| 923 | } | 1021 | } |
| 924 | 1022 | ||
| 925 | /** | 1023 | /** |
| 926 | * mark_nosave_pages - set bits corresponding to the page frames the | 1024 | * mark_nosave_pages - Mark pages that should not be saved. |
| 927 | * contents of which should not be saved in a given bitmap. | 1025 | * @bm: Memory bitmap. |
| 1026 | * | ||
| 1027 | * Set the bits in @bm that correspond to the page frames the contents of which | ||
| 1028 | * should not be saved. | ||
| 928 | */ | 1029 | */ |
| 929 | |||
| 930 | static void mark_nosave_pages(struct memory_bitmap *bm) | 1030 | static void mark_nosave_pages(struct memory_bitmap *bm) |
| 931 | { | 1031 | { |
| 932 | struct nosave_region *region; | 1032 | struct nosave_region *region; |
| @@ -956,13 +1056,13 @@ static void mark_nosave_pages(struct memory_bitmap *bm) | |||
| 956 | } | 1056 | } |
| 957 | 1057 | ||
| 958 | /** | 1058 | /** |
| 959 | * create_basic_memory_bitmaps - create bitmaps needed for marking page | 1059 | * create_basic_memory_bitmaps - Create bitmaps to hold basic page information. |
| 960 | * frames that should not be saved and free page frames. The pointers | 1060 | * |
| 961 | * forbidden_pages_map and free_pages_map are only modified if everything | 1061 | * Create bitmaps needed for marking page frames that should not be saved and |
| 962 | * goes well, because we don't want the bits to be used before both bitmaps | 1062 | * free page frames. The forbidden_pages_map and free_pages_map pointers are |
| 963 | * are set up. | 1063 | * only modified if everything goes well, because we don't want the bits to be |
| 1064 | * touched before both bitmaps are set up. | ||
| 964 | */ | 1065 | */ |
| 965 | |||
| 966 | int create_basic_memory_bitmaps(void) | 1066 | int create_basic_memory_bitmaps(void) |
| 967 | { | 1067 | { |
| 968 | struct memory_bitmap *bm1, *bm2; | 1068 | struct memory_bitmap *bm1, *bm2; |
| @@ -1007,12 +1107,12 @@ int create_basic_memory_bitmaps(void) | |||
| 1007 | } | 1107 | } |
| 1008 | 1108 | ||
| 1009 | /** | 1109 | /** |
| 1010 | * free_basic_memory_bitmaps - free memory bitmaps allocated by | 1110 | * free_basic_memory_bitmaps - Free memory bitmaps holding basic information. |
| 1011 | * create_basic_memory_bitmaps(). The auxiliary pointers are necessary | 1111 | * |
| 1012 | * so that the bitmaps themselves are not referred to while they are being | 1112 | * Free memory bitmaps allocated by create_basic_memory_bitmaps(). The |
| 1013 | * freed. | 1113 | * auxiliary pointers are necessary so that the bitmaps themselves are not |
| 1114 | * referred to while they are being freed. | ||
| 1014 | */ | 1115 | */ |
| 1015 | |||
| 1016 | void free_basic_memory_bitmaps(void) | 1116 | void free_basic_memory_bitmaps(void) |
| 1017 | { | 1117 | { |
| 1018 | struct memory_bitmap *bm1, *bm2; | 1118 | struct memory_bitmap *bm1, *bm2; |
| @@ -1033,11 +1133,13 @@ void free_basic_memory_bitmaps(void) | |||
| 1033 | } | 1133 | } |
| 1034 | 1134 | ||
| 1035 | /** | 1135 | /** |
| 1036 | * snapshot_additional_pages - estimate the number of additional pages | 1136 | * snapshot_additional_pages - Estimate the number of extra pages needed. |
| 1037 | * be needed for setting up the suspend image data structures for given | 1137 | * @zone: Memory zone to carry out the computation for. |
| 1038 | * zone (usually the returned value is greater than the exact number) | 1138 | * |
| 1139 | * Estimate the number of additional pages needed for setting up hibernation | ||
| 1140 | * image data structures for @zone (usually, the returned value is greater than | ||
| 1141 | * the exact number). | ||
| 1039 | */ | 1142 | */ |
| 1040 | |||
| 1041 | unsigned int snapshot_additional_pages(struct zone *zone) | 1143 | unsigned int snapshot_additional_pages(struct zone *zone) |
| 1042 | { | 1144 | { |
| 1043 | unsigned int rtree, nodes; | 1145 | unsigned int rtree, nodes; |
| @@ -1055,10 +1157,10 @@ unsigned int snapshot_additional_pages(struct zone *zone) | |||
| 1055 | 1157 | ||
| 1056 | #ifdef CONFIG_HIGHMEM | 1158 | #ifdef CONFIG_HIGHMEM |
| 1057 | /** | 1159 | /** |
| 1058 | * count_free_highmem_pages - compute the total number of free highmem | 1160 | * count_free_highmem_pages - Compute the total number of free highmem pages. |
| 1059 | * pages, system-wide. | 1161 | * |
| 1162 | * The returned number is system-wide. | ||
| 1060 | */ | 1163 | */ |
| 1061 | |||
| 1062 | static unsigned int count_free_highmem_pages(void) | 1164 | static unsigned int count_free_highmem_pages(void) |
| 1063 | { | 1165 | { |
| 1064 | struct zone *zone; | 1166 | struct zone *zone; |
| @@ -1072,11 +1174,12 @@ static unsigned int count_free_highmem_pages(void) | |||
| 1072 | } | 1174 | } |
| 1073 | 1175 | ||
| 1074 | /** | 1176 | /** |
| 1075 | * saveable_highmem_page - Determine whether a highmem page should be | 1177 | * saveable_highmem_page - Check if a highmem page is saveable. |
| 1076 | * included in the suspend image. | ||
| 1077 | * | 1178 | * |
| 1078 | * We should save the page if it isn't Nosave or NosaveFree, or Reserved, | 1179 | * Determine whether a highmem page should be included in a hibernation image. |
| 1079 | * and it isn't a part of a free chunk of pages. | 1180 | * |
| 1181 | * We should save the page if it isn't Nosave or NosaveFree, or Reserved, | ||
| 1182 | * and it isn't part of a free chunk of pages. | ||
| 1080 | */ | 1183 | */ |
| 1081 | static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) | 1184 | static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) |
| 1082 | { | 1185 | { |
| @@ -1102,10 +1205,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) | |||
| 1102 | } | 1205 | } |
| 1103 | 1206 | ||
| 1104 | /** | 1207 | /** |
| 1105 | * count_highmem_pages - compute the total number of saveable highmem | 1208 | * count_highmem_pages - Compute the total number of saveable highmem pages. |
| 1106 | * pages. | ||
| 1107 | */ | 1209 | */ |
| 1108 | |||
| 1109 | static unsigned int count_highmem_pages(void) | 1210 | static unsigned int count_highmem_pages(void) |
| 1110 | { | 1211 | { |
| 1111 | struct zone *zone; | 1212 | struct zone *zone; |
| @@ -1133,12 +1234,14 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p) | |||
| 1133 | #endif /* CONFIG_HIGHMEM */ | 1234 | #endif /* CONFIG_HIGHMEM */ |
| 1134 | 1235 | ||
| 1135 | /** | 1236 | /** |
| 1136 | * saveable_page - Determine whether a non-highmem page should be included | 1237 | * saveable_page - Check if the given page is saveable. |
| 1137 | * in the suspend image. | ||
| 1138 | * | 1238 | * |
| 1139 | * We should save the page if it isn't Nosave, and is not in the range | 1239 | * Determine whether a non-highmem page should be included in a hibernation |
| 1140 | * of pages statically defined as 'unsaveable', and it isn't a part of | 1240 | * image. |
| 1141 | * a free chunk of pages. | 1241 | * |
| 1242 | * We should save the page if it isn't Nosave, and is not in the range | ||
| 1243 | * of pages statically defined as 'unsaveable', and it isn't part of | ||
| 1244 | * a free chunk of pages. | ||
| 1142 | */ | 1245 | */ |
| 1143 | static struct page *saveable_page(struct zone *zone, unsigned long pfn) | 1246 | static struct page *saveable_page(struct zone *zone, unsigned long pfn) |
| 1144 | { | 1247 | { |
| @@ -1167,10 +1270,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) | |||
| 1167 | } | 1270 | } |
| 1168 | 1271 | ||
| 1169 | /** | 1272 | /** |
| 1170 | * count_data_pages - compute the total number of saveable non-highmem | 1273 | * count_data_pages - Compute the total number of saveable non-highmem pages. |
| 1171 | * pages. | ||
| 1172 | */ | 1274 | */ |
| 1173 | |||
| 1174 | static unsigned int count_data_pages(void) | 1275 | static unsigned int count_data_pages(void) |
| 1175 | { | 1276 | { |
| 1176 | struct zone *zone; | 1277 | struct zone *zone; |
| @@ -1190,7 +1291,8 @@ static unsigned int count_data_pages(void) | |||
| 1190 | return n; | 1291 | return n; |
| 1191 | } | 1292 | } |
| 1192 | 1293 | ||
| 1193 | /* This is needed, because copy_page and memcpy are not usable for copying | 1294 | /* |
| 1295 | * This is needed, because copy_page and memcpy are not usable for copying | ||
| 1194 | * task structs. | 1296 | * task structs. |
| 1195 | */ | 1297 | */ |
| 1196 | static inline void do_copy_page(long *dst, long *src) | 1298 | static inline void do_copy_page(long *dst, long *src) |
| @@ -1201,12 +1303,12 @@ static inline void do_copy_page(long *dst, long *src) | |||
| 1201 | *dst++ = *src++; | 1303 | *dst++ = *src++; |
| 1202 | } | 1304 | } |
| 1203 | 1305 | ||
| 1204 | |||
| 1205 | /** | 1306 | /** |
| 1206 | * safe_copy_page - check if the page we are going to copy is marked as | 1307 | * safe_copy_page - Copy a page in a safe way. |
| 1207 | * present in the kernel page tables (this always is the case if | 1308 | * |
| 1208 | * CONFIG_DEBUG_PAGEALLOC is not set and in that case | 1309 | * Check if the page we are going to copy is marked as present in the kernel |
| 1209 | * kernel_page_present() always returns 'true'). | 1310 | * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set |
| 1311 | * and in that case kernel_page_present() always returns 'true'). | ||
| 1210 | */ | 1312 | */ |
| 1211 | static void safe_copy_page(void *dst, struct page *s_page) | 1313 | static void safe_copy_page(void *dst, struct page *s_page) |
| 1212 | { | 1314 | { |
| @@ -1219,10 +1321,8 @@ static void safe_copy_page(void *dst, struct page *s_page) | |||
| 1219 | } | 1321 | } |
| 1220 | } | 1322 | } |
| 1221 | 1323 | ||
| 1222 | |||
| 1223 | #ifdef CONFIG_HIGHMEM | 1324 | #ifdef CONFIG_HIGHMEM |
| 1224 | static inline struct page * | 1325 | static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn) |
| 1225 | page_is_saveable(struct zone *zone, unsigned long pfn) | ||
| 1226 | { | 1326 | { |
| 1227 | return is_highmem(zone) ? | 1327 | return is_highmem(zone) ? |
| 1228 | saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); | 1328 | saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); |
| @@ -1243,7 +1343,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) | |||
| 1243 | kunmap_atomic(src); | 1343 | kunmap_atomic(src); |
| 1244 | } else { | 1344 | } else { |
| 1245 | if (PageHighMem(d_page)) { | 1345 | if (PageHighMem(d_page)) { |
| 1246 | /* Page pointed to by src may contain some kernel | 1346 | /* |
| 1347 | * The page pointed to by src may contain some kernel | ||
| 1247 | * data modified by kmap_atomic() | 1348 | * data modified by kmap_atomic() |
| 1248 | */ | 1349 | */ |
| 1249 | safe_copy_page(buffer, s_page); | 1350 | safe_copy_page(buffer, s_page); |
| @@ -1265,8 +1366,8 @@ static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) | |||
| 1265 | } | 1366 | } |
| 1266 | #endif /* CONFIG_HIGHMEM */ | 1367 | #endif /* CONFIG_HIGHMEM */ |
| 1267 | 1368 | ||
| 1268 | static void | 1369 | static void copy_data_pages(struct memory_bitmap *copy_bm, |
| 1269 | copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) | 1370 | struct memory_bitmap *orig_bm) |
| 1270 | { | 1371 | { |
| 1271 | struct zone *zone; | 1372 | struct zone *zone; |
| 1272 | unsigned long pfn; | 1373 | unsigned long pfn; |
| @@ -1315,12 +1416,11 @@ static struct memory_bitmap orig_bm; | |||
| 1315 | static struct memory_bitmap copy_bm; | 1416 | static struct memory_bitmap copy_bm; |
| 1316 | 1417 | ||
| 1317 | /** | 1418 | /** |
| 1318 | * swsusp_free - free pages allocated for the suspend. | 1419 | * swsusp_free - Free pages allocated for hibernation image. |
| 1319 | * | 1420 | * |
| 1320 | * Suspend pages are alocated before the atomic copy is made, so we | 1421 | * Image pages are allocated before snapshot creation, so they need to be |
| 1321 | * need to release them after the resume. | 1422 | * released after resume. |
| 1322 | */ | 1423 | */ |
| 1323 | |||
| 1324 | void swsusp_free(void) | 1424 | void swsusp_free(void) |
| 1325 | { | 1425 | { |
| 1326 | unsigned long fb_pfn, fr_pfn; | 1426 | unsigned long fb_pfn, fr_pfn; |
| @@ -1351,6 +1451,7 @@ loop: | |||
| 1351 | 1451 | ||
| 1352 | memory_bm_clear_current(forbidden_pages_map); | 1452 | memory_bm_clear_current(forbidden_pages_map); |
| 1353 | memory_bm_clear_current(free_pages_map); | 1453 | memory_bm_clear_current(free_pages_map); |
| 1454 | hibernate_restore_unprotect_page(page_address(page)); | ||
| 1354 | __free_page(page); | 1455 | __free_page(page); |
| 1355 | goto loop; | 1456 | goto loop; |
| 1356 | } | 1457 | } |
| @@ -1362,6 +1463,7 @@ out: | |||
| 1362 | buffer = NULL; | 1463 | buffer = NULL; |
| 1363 | alloc_normal = 0; | 1464 | alloc_normal = 0; |
| 1364 | alloc_highmem = 0; | 1465 | alloc_highmem = 0; |
| 1466 | hibernate_restore_protection_end(); | ||
| 1365 | } | 1467 | } |
| 1366 | 1468 | ||
| 1367 | /* Helper functions used for the shrinking of memory. */ | 1469 | /* Helper functions used for the shrinking of memory. */ |
| @@ -1369,7 +1471,7 @@ out: | |||
| 1369 | #define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) | 1471 | #define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) |
| 1370 | 1472 | ||
| 1371 | /** | 1473 | /** |
| 1372 | * preallocate_image_pages - Allocate a number of pages for hibernation image | 1474 | * preallocate_image_pages - Allocate a number of pages for hibernation image. |
| 1373 | * @nr_pages: Number of page frames to allocate. | 1475 | * @nr_pages: Number of page frames to allocate. |
| 1374 | * @mask: GFP flags to use for the allocation. | 1476 | * @mask: GFP flags to use for the allocation. |
| 1375 | * | 1477 | * |
| @@ -1419,7 +1521,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages) | |||
| 1419 | } | 1521 | } |
| 1420 | 1522 | ||
| 1421 | /** | 1523 | /** |
| 1422 | * __fraction - Compute (an approximation of) x * (multiplier / base) | 1524 | * __fraction - Compute (an approximation of) x * (multiplier / base). |
| 1423 | */ | 1525 | */ |
| 1424 | static unsigned long __fraction(u64 x, u64 multiplier, u64 base) | 1526 | static unsigned long __fraction(u64 x, u64 multiplier, u64 base) |
| 1425 | { | 1527 | { |
| @@ -1429,8 +1531,8 @@ static unsigned long __fraction(u64 x, u64 multiplier, u64 base) | |||
| 1429 | } | 1531 | } |
| 1430 | 1532 | ||
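As a quick sanity check with made-up numbers: __fraction(10000, 3000, 12000) evaluates to roughly 10000 * 3000 / 12000 = 2500, so a caller asking how much of a 10000-page preallocation should go to highmem when highmem makes up a quarter of total RAM gets 2500 pages back.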
| 1431 | static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, | 1533 | static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, |
| 1432 | unsigned long highmem, | 1534 | unsigned long highmem, |
| 1433 | unsigned long total) | 1535 | unsigned long total) |
| 1434 | { | 1536 | { |
| 1435 | unsigned long alloc = __fraction(nr_pages, highmem, total); | 1537 | unsigned long alloc = __fraction(nr_pages, highmem, total); |
| 1436 | 1538 | ||
| @@ -1443,15 +1545,15 @@ static inline unsigned long preallocate_image_highmem(unsigned long nr_pages) | |||
| 1443 | } | 1545 | } |
| 1444 | 1546 | ||
| 1445 | static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, | 1547 | static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, |
| 1446 | unsigned long highmem, | 1548 | unsigned long highmem, |
| 1447 | unsigned long total) | 1549 | unsigned long total) |
| 1448 | { | 1550 | { |
| 1449 | return 0; | 1551 | return 0; |
| 1450 | } | 1552 | } |
| 1451 | #endif /* CONFIG_HIGHMEM */ | 1553 | #endif /* CONFIG_HIGHMEM */ |
| 1452 | 1554 | ||
| 1453 | /** | 1555 | /** |
| 1454 | * free_unnecessary_pages - Release preallocated pages not needed for the image | 1556 | * free_unnecessary_pages - Release preallocated pages not needed for the image. |
| 1455 | */ | 1557 | */ |
| 1456 | static unsigned long free_unnecessary_pages(void) | 1558 | static unsigned long free_unnecessary_pages(void) |
| 1457 | { | 1559 | { |
| @@ -1505,7 +1607,7 @@ static unsigned long free_unnecessary_pages(void) | |||
| 1505 | } | 1607 | } |
| 1506 | 1608 | ||
| 1507 | /** | 1609 | /** |
| 1508 | * minimum_image_size - Estimate the minimum acceptable size of an image | 1610 | * minimum_image_size - Estimate the minimum acceptable size of an image. |
| 1509 | * @saveable: Number of saveable pages in the system. | 1611 | * @saveable: Number of saveable pages in the system. |
| 1510 | * | 1612 | * |
| 1511 | * We want to avoid attempting to free too much memory too hard, so estimate the | 1613 | * We want to avoid attempting to free too much memory too hard, so estimate the |
| @@ -1525,17 +1627,17 @@ static unsigned long minimum_image_size(unsigned long saveable) | |||
| 1525 | unsigned long size; | 1627 | unsigned long size; |
| 1526 | 1628 | ||
| 1527 | size = global_page_state(NR_SLAB_RECLAIMABLE) | 1629 | size = global_page_state(NR_SLAB_RECLAIMABLE) |
| 1528 | + global_page_state(NR_ACTIVE_ANON) | 1630 | + global_node_page_state(NR_ACTIVE_ANON) |
| 1529 | + global_page_state(NR_INACTIVE_ANON) | 1631 | + global_node_page_state(NR_INACTIVE_ANON) |
| 1530 | + global_page_state(NR_ACTIVE_FILE) | 1632 | + global_node_page_state(NR_ACTIVE_FILE) |
| 1531 | + global_page_state(NR_INACTIVE_FILE) | 1633 | + global_node_page_state(NR_INACTIVE_FILE) |
| 1532 | - global_page_state(NR_FILE_MAPPED); | 1634 | - global_node_page_state(NR_FILE_MAPPED); |
| 1533 | 1635 | ||
| 1534 | return saveable <= size ? 0 : saveable - size; | 1636 | return saveable <= size ? 0 : saveable - size; |
| 1535 | } | 1637 | } |
| 1536 | 1638 | ||
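Worked through with illustrative numbers: if there are 500000 saveable pages and the sum above (reclaimable slab plus the anonymous and file LRU lists, minus mapped file pages) comes to 350000 pages, the estimated minimum image size is 500000 - 350000 = 150000 pages; if the reclaimable sum reached or exceeded the number of saveable pages, the function would return 0.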
| 1537 | /** | 1639 | /** |
| 1538 | * hibernate_preallocate_memory - Preallocate memory for hibernation image | 1640 | * hibernate_preallocate_memory - Preallocate memory for hibernation image. |
| 1539 | * | 1641 | * |
| 1540 | * To create a hibernation image it is necessary to make a copy of every page | 1642 | * To create a hibernation image it is necessary to make a copy of every page |
| 1541 | * frame in use. We also need a number of page frames to be free during | 1643 | * frame in use. We also need a number of page frames to be free during |
| @@ -1708,10 +1810,11 @@ int hibernate_preallocate_memory(void) | |||
| 1708 | 1810 | ||
| 1709 | #ifdef CONFIG_HIGHMEM | 1811 | #ifdef CONFIG_HIGHMEM |
| 1710 | /** | 1812 | /** |
| 1711 | * count_pages_for_highmem - compute the number of non-highmem pages | 1813 | * count_pages_for_highmem - Count non-highmem pages needed for copying highmem. |
| 1712 | * that will be necessary for creating copies of highmem pages. | 1814 | * |
| 1713 | */ | 1815 | * Compute the number of non-highmem pages that will be necessary for creating |
| 1714 | 1816 | * copies of highmem pages. | |
| 1817 | */ | ||
| 1715 | static unsigned int count_pages_for_highmem(unsigned int nr_highmem) | 1818 | static unsigned int count_pages_for_highmem(unsigned int nr_highmem) |
| 1716 | { | 1819 | { |
| 1717 | unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; | 1820 | unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; |
| @@ -1724,15 +1827,12 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem) | |||
| 1724 | return nr_highmem; | 1827 | return nr_highmem; |
| 1725 | } | 1828 | } |
| 1726 | #else | 1829 | #else |
| 1727 | static unsigned int | 1830 | static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; } |
| 1728 | count_pages_for_highmem(unsigned int nr_highmem) { return 0; } | ||
| 1729 | #endif /* CONFIG_HIGHMEM */ | 1831 | #endif /* CONFIG_HIGHMEM */ |
| 1730 | 1832 | ||
| 1731 | /** | 1833 | /** |
| 1732 | * enough_free_mem - Make sure we have enough free memory for the | 1834 | * enough_free_mem - Check if there is enough free memory for the image. |
| 1733 | * snapshot image. | ||
| 1734 | */ | 1835 | */ |
| 1735 | |||
| 1736 | static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) | 1836 | static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) |
| 1737 | { | 1837 | { |
| 1738 | struct zone *zone; | 1838 | struct zone *zone; |
| @@ -1751,10 +1851,11 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) | |||
| 1751 | 1851 | ||
| 1752 | #ifdef CONFIG_HIGHMEM | 1852 | #ifdef CONFIG_HIGHMEM |
| 1753 | /** | 1853 | /** |
| 1754 | * get_highmem_buffer - if there are some highmem pages in the suspend | 1854 | * get_highmem_buffer - Allocate a buffer for highmem pages. |
| 1755 | * image, we may need the buffer to copy them and/or load their data. | 1855 | * |
| 1856 | * If there are some highmem pages in the hibernation image, we may need a | ||
| 1857 | * buffer to copy them and/or load their data. | ||
| 1756 | */ | 1858 | */ |
| 1757 | |||
| 1758 | static inline int get_highmem_buffer(int safe_needed) | 1859 | static inline int get_highmem_buffer(int safe_needed) |
| 1759 | { | 1860 | { |
| 1760 | buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); | 1861 | buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); |
| @@ -1762,13 +1863,13 @@ static inline int get_highmem_buffer(int safe_needed) | |||
| 1762 | } | 1863 | } |
| 1763 | 1864 | ||
| 1764 | /** | 1865 | /** |
| 1765 | * alloc_highmem_image_pages - allocate some highmem pages for the image. | 1866 | * alloc_highmem_image_pages - Allocate some highmem pages for the image. |
| 1766 | * Try to allocate as many pages as needed, but if the number of free | 1867 | * |
| 1767 | * highmem pages is lesser than that, allocate them all. | 1868 | * Try to allocate as many pages as needed, but if the number of free highmem |
| 1869 | * pages is less than that, allocate them all. | ||
| 1768 | */ | 1870 | */ |
| 1769 | 1871 | static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, | |
| 1770 | static inline unsigned int | 1872 | unsigned int nr_highmem) |
| 1771 | alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) | ||
| 1772 | { | 1873 | { |
| 1773 | unsigned int to_alloc = count_free_highmem_pages(); | 1874 | unsigned int to_alloc = count_free_highmem_pages(); |
| 1774 | 1875 | ||
| @@ -1787,25 +1888,24 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) | |||
| 1787 | #else | 1888 | #else |
| 1788 | static inline int get_highmem_buffer(int safe_needed) { return 0; } | 1889 | static inline int get_highmem_buffer(int safe_needed) { return 0; } |
| 1789 | 1890 | ||
| 1790 | static inline unsigned int | 1891 | static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, |
| 1791 | alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } | 1892 | unsigned int n) { return 0; } |
| 1792 | #endif /* CONFIG_HIGHMEM */ | 1893 | #endif /* CONFIG_HIGHMEM */ |
| 1793 | 1894 | ||
| 1794 | /** | 1895 | /** |
| 1795 | * swsusp_alloc - allocate memory for the suspend image | 1896 | * swsusp_alloc - Allocate memory for hibernation image. |
| 1796 | * | 1897 | * |
| 1797 | * We first try to allocate as many highmem pages as there are | 1898 | * We first try to allocate as many highmem pages as there are |
| 1798 | * saveable highmem pages in the system. If that fails, we allocate | 1899 | * saveable highmem pages in the system. If that fails, we allocate |
| 1799 | * non-highmem pages for the copies of the remaining highmem ones. | 1900 | * non-highmem pages for the copies of the remaining highmem ones. |
| 1800 | * | 1901 | * |
| 1801 | * In this approach it is likely that the copies of highmem pages will | 1902 | * In this approach it is likely that the copies of highmem pages will |
| 1802 | * also be located in the high memory, because of the way in which | 1903 | * also be located in the high memory, because of the way in which |
| 1803 | * copy_data_pages() works. | 1904 | * copy_data_pages() works. |
| 1804 | */ | 1905 | */ |
| 1805 | 1906 | static int swsusp_alloc(struct memory_bitmap *orig_bm, | |
| 1806 | static int | 1907 | struct memory_bitmap *copy_bm, |
| 1807 | swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | 1908 | unsigned int nr_pages, unsigned int nr_highmem) |
| 1808 | unsigned int nr_pages, unsigned int nr_highmem) | ||
| 1809 | { | 1909 | { |
| 1810 | if (nr_highmem > 0) { | 1910 | if (nr_highmem > 0) { |
| 1811 | if (get_highmem_buffer(PG_ANY)) | 1911 | if (get_highmem_buffer(PG_ANY)) |
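A pseudocode-level sketch of the order of operations the comment above describes; the helpers are the ones defined in this file, but error handling and the exact page accounting are omitted, so treat this as an outline rather than the function body:

        if (nr_highmem > 0) {
                get_highmem_buffer(PG_ANY);     /* buffer used while copying highmem */
                nr_highmem -= alloc_highmem_pages(copy_bm, nr_highmem);
                nr_pages += nr_highmem;         /* shortfall is copied into normal pages */
        }
        /* then allocate nr_pages non-highmem pages and set their bits in copy_bm */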
| @@ -1855,7 +1955,8 @@ asmlinkage __visible int swsusp_save(void) | |||
| 1855 | return -ENOMEM; | 1955 | return -ENOMEM; |
| 1856 | } | 1956 | } |
| 1857 | 1957 | ||
| 1858 | /* During allocating of suspend pagedir, new cold pages may appear. | 1958 | /* |
| 1959 | * While allocating the suspend pagedir, new cold pages may appear. | ||
| 1859 | * Kill them. | 1960 | * Kill them. |
| 1860 | */ | 1961 | */ |
| 1861 | drain_local_pages(NULL); | 1962 | drain_local_pages(NULL); |
| @@ -1918,12 +2019,14 @@ static int init_header(struct swsusp_info *info) | |||
| 1918 | } | 2019 | } |
| 1919 | 2020 | ||
| 1920 | /** | 2021 | /** |
| 1921 | * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm | 2022 | * pack_pfns - Prepare PFNs for saving. |
| 1922 | * are stored in the array @buf[] (1 page at a time) | 2023 | * @bm: Memory bitmap. |
| 2024 | * @buf: Memory buffer to store the PFNs in. | ||
| 2025 | * | ||
| 2026 | * PFNs corresponding to set bits in @bm are stored in the area of memory | ||
| 2027 | * pointed to by @buf (1 page at a time). | ||
| 1923 | */ | 2028 | */ |
| 1924 | 2029 | static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm) | |
| 1925 | static inline void | ||
| 1926 | pack_pfns(unsigned long *buf, struct memory_bitmap *bm) | ||
| 1927 | { | 2030 | { |
| 1928 | int j; | 2031 | int j; |
| 1929 | 2032 | ||
| @@ -1937,22 +2040,21 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) | |||
| 1937 | } | 2040 | } |
| 1938 | 2041 | ||
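A minimal sketch of the packing loop implied by the comment above (the real function additionally saves the s390 page key for each data page, the counterpart of the page_key_memorize() call visible in unpack_orig_pfns() further down):

        for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
                buf[j] = memory_bm_next_pfn(bm);
                if (buf[j] == BM_END_OF_MAP)
                        break;
        }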
| 1939 | /** | 2042 | /** |
| 1940 | * snapshot_read_next - used for reading the system memory snapshot. | 2043 | * snapshot_read_next - Get the address to read the next image page from. |
| 2044 | * @handle: Snapshot handle to be used for the reading. | ||
| 1941 | * | 2045 | * |
| 1942 | * On the first call to it @handle should point to a zeroed | 2046 | * On the first call, @handle should point to a zeroed snapshot_handle |
| 1943 | * snapshot_handle structure. The structure gets updated and a pointer | 2047 | * structure. The structure is then populated and a pointer to it should be |
| 1944 | * to it should be passed to this function every next time. | 2048 | * passed to this function on every subsequent call. |
| 1945 | * | 2049 | * |
| 1946 | * On success the function returns a positive number. Then, the caller | 2050 | * On success, the function returns a positive number. Then, the caller |
| 1947 | * is allowed to read up to the returned number of bytes from the memory | 2051 | * is allowed to read up to the returned number of bytes from the memory |
| 1948 | * location computed by the data_of() macro. | 2052 | * location computed by the data_of() macro. |
| 1949 | * | 2053 | * |
| 1950 | * The function returns 0 to indicate the end of data stream condition, | 2054 | * The function returns 0 to indicate the end of the data stream condition, |
| 1951 | * and a negative number is returned on error. In such cases the | 2055 | * and negative numbers are returned on errors. If that happens, the structure |
| 1952 | * structure pointed to by @handle is not updated and should not be used | 2056 | * pointed to by @handle is not updated and should not be used any more. |
| 1953 | * any more. | ||
| 1954 | */ | 2057 | */ |
| 1955 | |||
| 1956 | int snapshot_read_next(struct snapshot_handle *handle) | 2058 | int snapshot_read_next(struct snapshot_handle *handle) |
| 1957 | { | 2059 | { |
| 1958 | if (handle->cur > nr_meta_pages + nr_copy_pages) | 2060 | if (handle->cur > nr_meta_pages + nr_copy_pages) |
| @@ -1981,7 +2083,8 @@ int snapshot_read_next(struct snapshot_handle *handle) | |||
| 1981 | 2083 | ||
| 1982 | page = pfn_to_page(memory_bm_next_pfn(&copy_bm)); | 2084 | page = pfn_to_page(memory_bm_next_pfn(&copy_bm)); |
| 1983 | if (PageHighMem(page)) { | 2085 | if (PageHighMem(page)) { |
| 1984 | /* Highmem pages are copied to the buffer, | 2086 | /* |
| 2087 | * Highmem pages are copied to the buffer, | ||
| 1985 | * because we can't return with a kmapped | 2088 | * because we can't return with a kmapped |
| 1986 | * highmem page (we may not be called again). | 2089 | * highmem page (we may not be called again). |
| 1987 | */ | 2090 | */ |
| @@ -1999,53 +2102,41 @@ int snapshot_read_next(struct snapshot_handle *handle) | |||
| 1999 | return PAGE_SIZE; | 2102 | return PAGE_SIZE; |
| 2000 | } | 2103 | } |
| 2001 | 2104 | ||
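Following the contract spelled out above, a consumer of the snapshot (the swap or user-space writer, for instance) is expected to drive it roughly like this; the write step is a placeholder and error handling is trimmed:

        struct snapshot_handle handle;
        int ret;

        memset(&handle, 0, sizeof(handle));     /* the first call requires a zeroed handle */
        while ((ret = snapshot_read_next(&handle)) > 0) {
                /* write ret bytes starting at data_of(handle) to the image */
        }
        /* ret == 0: end of the data stream; ret < 0: error, handle must not be used again */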
| 2002 | /** | 2105 | static void duplicate_memory_bitmap(struct memory_bitmap *dst, |
| 2003 | * mark_unsafe_pages - mark the pages that cannot be used for storing | 2106 | struct memory_bitmap *src) |
| 2004 | * the image during resume, because they conflict with the pages that | ||
| 2005 | * had been used before suspend | ||
| 2006 | */ | ||
| 2007 | |||
| 2008 | static int mark_unsafe_pages(struct memory_bitmap *bm) | ||
| 2009 | { | 2107 | { |
| 2010 | struct zone *zone; | 2108 | unsigned long pfn; |
| 2011 | unsigned long pfn, max_zone_pfn; | ||
| 2012 | 2109 | ||
| 2013 | /* Clear page flags */ | 2110 | memory_bm_position_reset(src); |
| 2014 | for_each_populated_zone(zone) { | 2111 | pfn = memory_bm_next_pfn(src); |
| 2015 | max_zone_pfn = zone_end_pfn(zone); | 2112 | while (pfn != BM_END_OF_MAP) { |
| 2016 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 2113 | memory_bm_set_bit(dst, pfn); |
| 2017 | if (pfn_valid(pfn)) | 2114 | pfn = memory_bm_next_pfn(src); |
| 2018 | swsusp_unset_page_free(pfn_to_page(pfn)); | ||
| 2019 | } | 2115 | } |
| 2020 | |||
| 2021 | /* Mark pages that correspond to the "original" pfns as "unsafe" */ | ||
| 2022 | memory_bm_position_reset(bm); | ||
| 2023 | do { | ||
| 2024 | pfn = memory_bm_next_pfn(bm); | ||
| 2025 | if (likely(pfn != BM_END_OF_MAP)) { | ||
| 2026 | if (likely(pfn_valid(pfn))) | ||
| 2027 | swsusp_set_page_free(pfn_to_page(pfn)); | ||
| 2028 | else | ||
| 2029 | return -EFAULT; | ||
| 2030 | } | ||
| 2031 | } while (pfn != BM_END_OF_MAP); | ||
| 2032 | |||
| 2033 | allocated_unsafe_pages = 0; | ||
| 2034 | |||
| 2035 | return 0; | ||
| 2036 | } | 2116 | } |
| 2037 | 2117 | ||
| 2038 | static void | 2118 | /** |
| 2039 | duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) | 2119 | * mark_unsafe_pages - Mark pages that were used before hibernation. |
| 2120 | * | ||
| 2121 | * Mark the pages that cannot be used for storing the image during restoration, | ||
| 2122 | * because they conflict with the pages that had been used before hibernation. | ||
| 2123 | */ | ||
| 2124 | static void mark_unsafe_pages(struct memory_bitmap *bm) | ||
| 2040 | { | 2125 | { |
| 2041 | unsigned long pfn; | 2126 | unsigned long pfn; |
| 2042 | 2127 | ||
| 2043 | memory_bm_position_reset(src); | 2128 | /* Clear the "free"/"unsafe" bit for all PFNs */ |
| 2044 | pfn = memory_bm_next_pfn(src); | 2129 | memory_bm_position_reset(free_pages_map); |
| 2130 | pfn = memory_bm_next_pfn(free_pages_map); | ||
| 2045 | while (pfn != BM_END_OF_MAP) { | 2131 | while (pfn != BM_END_OF_MAP) { |
| 2046 | memory_bm_set_bit(dst, pfn); | 2132 | memory_bm_clear_current(free_pages_map); |
| 2047 | pfn = memory_bm_next_pfn(src); | 2133 | pfn = memory_bm_next_pfn(free_pages_map); |
| 2048 | } | 2134 | } |
| 2135 | |||
| 2136 | /* Mark pages that correspond to the "original" PFNs as "unsafe" */ | ||
| 2137 | duplicate_memory_bitmap(free_pages_map, bm); | ||
| 2138 | |||
| 2139 | allocated_unsafe_pages = 0; | ||
| 2049 | } | 2140 | } |
| 2050 | 2141 | ||
| 2051 | static int check_header(struct swsusp_info *info) | 2142 | static int check_header(struct swsusp_info *info) |
| @@ -2063,11 +2154,9 @@ static int check_header(struct swsusp_info *info) | |||
| 2063 | } | 2154 | } |
| 2064 | 2155 | ||
| 2065 | /** | 2156 | /** |
| 2066 | * load header - check the image header and copy data from it | 2157 | * load_header - Check the image header and copy the data from it. |
| 2067 | */ | 2158 | */ |
| 2068 | 2159 | static int load_header(struct swsusp_info *info) | |
| 2069 | static int | ||
| 2070 | load_header(struct swsusp_info *info) | ||
| 2071 | { | 2160 | { |
| 2072 | int error; | 2161 | int error; |
| 2073 | 2162 | ||
| @@ -2081,8 +2170,12 @@ load_header(struct swsusp_info *info) | |||
| 2081 | } | 2170 | } |
| 2082 | 2171 | ||
| 2083 | /** | 2172 | /** |
| 2084 | * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set | 2173 | * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap. |
| 2085 | * the corresponding bit in the memory bitmap @bm | 2174 | * @bm: Memory bitmap. |
| 2175 | * @buf: Area of memory containing the PFNs. | ||
| 2176 | * | ||
| 2177 | * For each element of the array pointed to by @buf (1 page at a time), set the | ||
| 2178 | * corresponding bit in @bm. | ||
| 2086 | */ | 2179 | */ |
| 2087 | static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) | 2180 | static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) |
| 2088 | { | 2181 | { |
| @@ -2095,7 +2188,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) | |||
| 2095 | /* Extract and buffer page key for data page (s390 only). */ | 2188 | /* Extract and buffer page key for data page (s390 only). */ |
| 2096 | page_key_memorize(buf + j); | 2189 | page_key_memorize(buf + j); |
| 2097 | 2190 | ||
| 2098 | if (memory_bm_pfn_present(bm, buf[j])) | 2191 | if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) |
| 2099 | memory_bm_set_bit(bm, buf[j]); | 2192 | memory_bm_set_bit(bm, buf[j]); |
| 2100 | else | 2193 | else |
| 2101 | return -EFAULT; | 2194 | return -EFAULT; |
| @@ -2104,13 +2197,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) | |||
| 2104 | return 0; | 2197 | return 0; |
| 2105 | } | 2198 | } |
| 2106 | 2199 | ||
| 2107 | /* List of "safe" pages that may be used to store data loaded from the suspend | ||
| 2108 | * image | ||
| 2109 | */ | ||
| 2110 | static struct linked_page *safe_pages_list; | ||
| 2111 | |||
| 2112 | #ifdef CONFIG_HIGHMEM | 2200 | #ifdef CONFIG_HIGHMEM |
| 2113 | /* struct highmem_pbe is used for creating the list of highmem pages that | 2201 | /* |
| 2202 | * struct highmem_pbe is used for creating the list of highmem pages that | ||
| 2114 | * should be restored atomically during the resume from disk, because the page | 2203 | * should be restored atomically during the resume from disk, because the page |
| 2115 | * frames they have occupied before the suspend are in use. | 2204 | * frames they have occupied before the suspend are in use. |
| 2116 | */ | 2205 | */ |
| @@ -2120,7 +2209,8 @@ struct highmem_pbe { | |||
| 2120 | struct highmem_pbe *next; | 2209 | struct highmem_pbe *next; |
| 2121 | }; | 2210 | }; |
| 2122 | 2211 | ||
| 2123 | /* List of highmem PBEs needed for restoring the highmem pages that were | 2212 | /* |
| 2213 | * List of highmem PBEs needed for restoring the highmem pages that were | ||
| 2124 | * allocated before the suspend and included in the suspend image, but have | 2214 | * allocated before the suspend and included in the suspend image, but have |
| 2125 | * also been allocated by the "resume" kernel, so their contents cannot be | 2215 | * also been allocated by the "resume" kernel, so their contents cannot be |
| 2126 | * written directly to their "original" page frames. | 2216 | * written directly to their "original" page frames. |
| @@ -2128,11 +2218,11 @@ struct highmem_pbe { | |||
| 2128 | static struct highmem_pbe *highmem_pblist; | 2218 | static struct highmem_pbe *highmem_pblist; |
| 2129 | 2219 | ||
| 2130 | /** | 2220 | /** |
| 2131 | * count_highmem_image_pages - compute the number of highmem pages in the | 2221 | * count_highmem_image_pages - Compute the number of highmem pages in the image. |
| 2132 | * suspend image. The bits in the memory bitmap @bm that correspond to the | 2222 | * @bm: Memory bitmap. |
| 2133 | * image pages are assumed to be set. | 2223 | * |
| 2224 | * The bits in @bm that correspond to image pages are assumed to be set. | ||
| 2134 | */ | 2225 | */ |
| 2135 | |||
| 2136 | static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) | 2226 | static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) |
| 2137 | { | 2227 | { |
| 2138 | unsigned long pfn; | 2228 | unsigned long pfn; |
| @@ -2149,24 +2239,25 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) | |||
| 2149 | return cnt; | 2239 | return cnt; |
| 2150 | } | 2240 | } |
| 2151 | 2241 | ||
| 2152 | /** | ||
| 2153 | * prepare_highmem_image - try to allocate as many highmem pages as | ||
| 2154 | * there are highmem image pages (@nr_highmem_p points to the variable | ||
| 2155 | * containing the number of highmem image pages). The pages that are | ||
| 2156 | * "safe" (ie. will not be overwritten when the suspend image is | ||
| 2157 | * restored) have the corresponding bits set in @bm (it must be | ||
| 2158 | * unitialized). | ||
| 2159 | * | ||
| 2160 | * NOTE: This function should not be called if there are no highmem | ||
| 2161 | * image pages. | ||
| 2162 | */ | ||
| 2163 | |||
| 2164 | static unsigned int safe_highmem_pages; | 2242 | static unsigned int safe_highmem_pages; |
| 2165 | 2243 | ||
| 2166 | static struct memory_bitmap *safe_highmem_bm; | 2244 | static struct memory_bitmap *safe_highmem_bm; |
| 2167 | 2245 | ||
| 2168 | static int | 2246 | /** |
| 2169 | prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) | 2247 | * prepare_highmem_image - Allocate memory for loading highmem data from image. |
| 2248 | * @bm: Pointer to an uninitialized memory bitmap structure. | ||
| 2249 | * @nr_highmem_p: Pointer to the number of highmem image pages. | ||
| 2250 | * | ||
| 2251 | * Try to allocate as many highmem pages as there are highmem image pages | ||
| 2252 | * (@nr_highmem_p points to the variable containing the number of highmem image | ||
| 2253 | * pages). The pages that are "safe" (ie. will not be overwritten when the | ||
| 2254 | * hibernation image is restored entirely) have the corresponding bits set in | ||
| 2255 | * @bm (it must be uninitialized). | ||
| 2256 | * | ||
| 2257 | * NOTE: This function should not be called if there are no highmem image pages. | ||
| 2258 | */ | ||
| 2259 | static int prepare_highmem_image(struct memory_bitmap *bm, | ||
| 2260 | unsigned int *nr_highmem_p) | ||
| 2170 | { | 2261 | { |
| 2171 | unsigned int to_alloc; | 2262 | unsigned int to_alloc; |
| 2172 | 2263 | ||
| @@ -2201,39 +2292,42 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) | |||
| 2201 | return 0; | 2292 | return 0; |
| 2202 | } | 2293 | } |
| 2203 | 2294 | ||
| 2295 | static struct page *last_highmem_page; | ||
| 2296 | |||
| 2204 | /** | 2297 | /** |
| 2205 | * get_highmem_page_buffer - for given highmem image page find the buffer | 2298 | * get_highmem_page_buffer - Prepare a buffer to store a highmem image page. |
| 2206 | * that suspend_write_next() should set for its caller to write to. | ||
| 2207 | * | 2299 | * |
| 2208 | * If the page is to be saved to its "original" page frame or a copy of | 2300 | * For a given highmem image page get a buffer that suspend_write_next() should |
| 2209 | * the page is to be made in the highmem, @buffer is returned. Otherwise, | 2301 | * return to its caller to write to. |
| 2210 | * the copy of the page is to be made in normal memory, so the address of | ||
| 2211 | * the copy is returned. | ||
| 2212 | * | 2302 | * |
| 2213 | * If @buffer is returned, the caller of suspend_write_next() will write | 2303 | * If the page is to be saved to its "original" page frame or a copy of |
| 2214 | * the page's contents to @buffer, so they will have to be copied to the | 2304 | * the page is to be made in the highmem, @buffer is returned. Otherwise, |
| 2215 | * right location on the next call to suspend_write_next() and it is done | 2305 | * the copy of the page is to be made in normal memory, so the address of |
| 2216 | * with the help of copy_last_highmem_page(). For this purpose, if | 2306 | * the copy is returned. |
| 2217 | * @buffer is returned, @last_highmem page is set to the page to which | 2307 | * |
| 2218 | * the data will have to be copied from @buffer. | 2308 | * If @buffer is returned, the caller of suspend_write_next() will write |
| 2309 | * the page's contents to @buffer, so they will have to be copied to the | ||
| 2310 | * right location on the next call to suspend_write_next() and it is done | ||
| 2311 | * with the help of copy_last_highmem_page(). For this purpose, if | ||
| 2312 | * @buffer is returned, @last_highmem_page is set to the page to which | ||
| 2313 | * the data will have to be copied from @buffer. | ||
| 2219 | */ | 2314 | */ |
| 2220 | 2315 | static void *get_highmem_page_buffer(struct page *page, | |
| 2221 | static struct page *last_highmem_page; | 2316 | struct chain_allocator *ca) |
| 2222 | |||
| 2223 | static void * | ||
| 2224 | get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) | ||
| 2225 | { | 2317 | { |
| 2226 | struct highmem_pbe *pbe; | 2318 | struct highmem_pbe *pbe; |
| 2227 | void *kaddr; | 2319 | void *kaddr; |
| 2228 | 2320 | ||
| 2229 | if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { | 2321 | if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { |
| 2230 | /* We have allocated the "original" page frame and we can | 2322 | /* |
| 2323 | * We have allocated the "original" page frame and we can | ||
| 2231 | * use it directly to store the loaded page. | 2324 | * use it directly to store the loaded page. |
| 2232 | */ | 2325 | */ |
| 2233 | last_highmem_page = page; | 2326 | last_highmem_page = page; |
| 2234 | return buffer; | 2327 | return buffer; |
| 2235 | } | 2328 | } |
| 2236 | /* The "original" page frame has not been allocated and we have to | 2329 | /* |
| 2330 | * The "original" page frame has not been allocated and we have to | ||
| 2237 | * use a "safe" page frame to store the loaded page. | 2331 | * use a "safe" page frame to store the loaded page. |
| 2238 | */ | 2332 | */ |
| 2239 | pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); | 2333 | pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); |
| @@ -2263,11 +2357,12 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) | |||
| 2263 | } | 2357 | } |
| 2264 | 2358 | ||
| 2265 | /** | 2359 | /** |
| 2266 | * copy_last_highmem_page - copy the contents of a highmem image from | 2360 | * copy_last_highmem_page - Copy the most recent highmem image page. |
| 2267 | * @buffer, where the caller of snapshot_write_next() has place them, | 2361 | * |
| 2268 | * to the right location represented by @last_highmem_page . | 2362 | * Copy the contents of a highmem image from @buffer, where the caller of |
| 2363 | * snapshot_write_next() has stored them, to the right location represented by | ||
| 2364 | * @last_highmem_page . | ||
| 2269 | */ | 2365 | */ |
| 2270 | |||
| 2271 | static void copy_last_highmem_page(void) | 2366 | static void copy_last_highmem_page(void) |
| 2272 | { | 2367 | { |
| 2273 | if (last_highmem_page) { | 2368 | if (last_highmem_page) { |
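The kerneldoc above describes a deferred copy: when a highmem page has to be staged through the low-memory bounce buffer, the data only reaches its final page frame on the *next* snapshot_write_next() call. A condensed sketch of that sequencing (illustrative only; locking, error handling and the s390 page-key hooks are omitted):

	/*
	 * call N:    handle->buffer = get_buffer(&orig_bm, &ca);
	 *                -> returns 'buffer' and records the destination
	 *                   page in last_highmem_page
	 *            the caller then writes the page contents into 'buffer'
	 *
	 * call N+1:  copy_last_highmem_page();
	 *                -> maps last_highmem_page and copies 'buffer' into it
	 *            handle->buffer = get_buffer(&orig_bm, &ca);  /* next page */
	 */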
| @@ -2294,17 +2389,13 @@ static inline void free_highmem_data(void) | |||
| 2294 | free_image_page(buffer, PG_UNSAFE_CLEAR); | 2389 | free_image_page(buffer, PG_UNSAFE_CLEAR); |
| 2295 | } | 2390 | } |
| 2296 | #else | 2391 | #else |
| 2297 | static unsigned int | 2392 | static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } |
| 2298 | count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } | ||
| 2299 | 2393 | ||
| 2300 | static inline int | 2394 | static inline int prepare_highmem_image(struct memory_bitmap *bm, |
| 2301 | prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) | 2395 | unsigned int *nr_highmem_p) { return 0; } |
| 2302 | { | ||
| 2303 | return 0; | ||
| 2304 | } | ||
| 2305 | 2396 | ||
| 2306 | static inline void * | 2397 | static inline void *get_highmem_page_buffer(struct page *page, |
| 2307 | get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) | 2398 | struct chain_allocator *ca) |
| 2308 | { | 2399 | { |
| 2309 | return ERR_PTR(-EINVAL); | 2400 | return ERR_PTR(-EINVAL); |
| 2310 | } | 2401 | } |
| @@ -2314,27 +2405,27 @@ static inline int last_highmem_page_copied(void) { return 1; } | |||
| 2314 | static inline void free_highmem_data(void) {} | 2405 | static inline void free_highmem_data(void) {} |
| 2315 | #endif /* CONFIG_HIGHMEM */ | 2406 | #endif /* CONFIG_HIGHMEM */ |
| 2316 | 2407 | ||
| 2408 | #define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) | ||
| 2409 | |||
| 2317 | /** | 2410 | /** |
| 2318 | * prepare_image - use the memory bitmap @bm to mark the pages that will | 2411 | * prepare_image - Make room for loading hibernation image. |
| 2319 | * be overwritten in the process of restoring the system memory state | 2412 | * @new_bm: Uninitialized memory bitmap structure. |
| 2320 | * from the suspend image ("unsafe" pages) and allocate memory for the | 2413 | * @bm: Memory bitmap with unsafe pages marked. |
| 2321 | * image. | 2414 | * |
| 2415 | * Use @bm to mark the pages that will be overwritten in the process of | ||
| 2416 | * restoring the system memory state from the suspend image ("unsafe" pages) | ||
| 2417 | * and allocate memory for the image. | ||
| 2322 | * | 2418 | * |
| 2323 | * The idea is to allocate a new memory bitmap first and then allocate | 2419 | * The idea is to allocate a new memory bitmap first and then allocate |
| 2324 | * as many pages as needed for the image data, but not to assign these | 2420 | * as many pages as needed for image data, but without specifying what those |
| 2325 | * pages to specific tasks initially. Instead, we just mark them as | 2421 | * pages will be used for just yet. Instead, we mark them all as allocated and |
| 2326 | * allocated and create a lists of "safe" pages that will be used | 2422 | * create a list of "safe" pages to be used later. On systems with high |
| 2327 | * later. On systems with high memory a list of "safe" highmem pages is | 2423 | * memory a list of "safe" highmem pages is created too. |
| 2328 | * also created. | ||
| 2329 | */ | 2424 | */ |
| 2330 | 2425 | static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |
| 2331 | #define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) | ||
| 2332 | |||
| 2333 | static int | ||
| 2334 | prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | ||
| 2335 | { | 2426 | { |
| 2336 | unsigned int nr_pages, nr_highmem; | 2427 | unsigned int nr_pages, nr_highmem; |
| 2337 | struct linked_page *sp_list, *lp; | 2428 | struct linked_page *lp; |
| 2338 | int error; | 2429 | int error; |
| 2339 | 2430 | ||
| 2340 | /* If there is no highmem, the buffer will not be necessary */ | 2431 | /* If there is no highmem, the buffer will not be necessary */ |
| @@ -2342,9 +2433,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
| 2342 | buffer = NULL; | 2433 | buffer = NULL; |
| 2343 | 2434 | ||
| 2344 | nr_highmem = count_highmem_image_pages(bm); | 2435 | nr_highmem = count_highmem_image_pages(bm); |
| 2345 | error = mark_unsafe_pages(bm); | 2436 | mark_unsafe_pages(bm); |
| 2346 | if (error) | ||
| 2347 | goto Free; | ||
| 2348 | 2437 | ||
| 2349 | error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE); | 2438 | error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE); |
| 2350 | if (error) | 2439 | if (error) |
| @@ -2357,14 +2446,15 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
| 2357 | if (error) | 2446 | if (error) |
| 2358 | goto Free; | 2447 | goto Free; |
| 2359 | } | 2448 | } |
| 2360 | /* Reserve some safe pages for potential later use. | 2449 | /* |
| 2450 | * Reserve some safe pages for potential later use. | ||
| 2361 | * | 2451 | * |
| 2362 | * NOTE: This way we make sure there will be enough safe pages for the | 2452 | * NOTE: This way we make sure there will be enough safe pages for the |
| 2363 | * chain_alloc() in get_buffer(). It is a bit wasteful, but | 2453 | * chain_alloc() in get_buffer(). It is a bit wasteful, but |
| 2364 | * nr_copy_pages cannot be greater than 50% of the memory anyway. | 2454 | * nr_copy_pages cannot be greater than 50% of the memory anyway. |
| 2455 | * | ||
| 2456 | * nr_copy_pages cannot be less than allocated_unsafe_pages too. | ||
| 2365 | */ | 2457 | */ |
| 2366 | sp_list = NULL; | ||
| 2367 | /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ | ||
| 2368 | nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; | 2458 | nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; |
| 2369 | nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); | 2459 | nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); |
| 2370 | while (nr_pages > 0) { | 2460 | while (nr_pages > 0) { |
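The reservation sizing described in the comment above can be made concrete with a small worked example (the page size, pointer size and struct pbe layout below are assumptions about a typical 64-bit configuration, not values taken from this patch):

	/*
	 *   LINKED_PAGE_DATA_SIZE = PAGE_SIZE - sizeof(void *) = 4096 - 8 = 4088
	 *   sizeof(struct pbe)    = 3 pointers                 = 24
	 *   PBES_PER_LINKED_PAGE  = 4088 / 24                  = 170
	 *
	 * so restoring e.g. 100000 image pages requires at most
	 * DIV_ROUND_UP(100000, 170) = 589 reserved "safe" pages for the PBE
	 * bookkeeping later handed out by chain_alloc() in get_buffer().
	 */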
| @@ -2373,12 +2463,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
| 2373 | error = -ENOMEM; | 2463 | error = -ENOMEM; |
| 2374 | goto Free; | 2464 | goto Free; |
| 2375 | } | 2465 | } |
| 2376 | lp->next = sp_list; | 2466 | lp->next = safe_pages_list; |
| 2377 | sp_list = lp; | 2467 | safe_pages_list = lp; |
| 2378 | nr_pages--; | 2468 | nr_pages--; |
| 2379 | } | 2469 | } |
| 2380 | /* Preallocate memory for the image */ | 2470 | /* Preallocate memory for the image */ |
| 2381 | safe_pages_list = NULL; | ||
| 2382 | nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; | 2471 | nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; |
| 2383 | while (nr_pages > 0) { | 2472 | while (nr_pages > 0) { |
| 2384 | lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); | 2473 | lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); |
| @@ -2396,12 +2485,6 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
| 2396 | swsusp_set_page_free(virt_to_page(lp)); | 2485 | swsusp_set_page_free(virt_to_page(lp)); |
| 2397 | nr_pages--; | 2486 | nr_pages--; |
| 2398 | } | 2487 | } |
| 2399 | /* Free the reserved safe pages so that chain_alloc() can use them */ | ||
| 2400 | while (sp_list) { | ||
| 2401 | lp = sp_list->next; | ||
| 2402 | free_image_page(sp_list, PG_UNSAFE_CLEAR); | ||
| 2403 | sp_list = lp; | ||
| 2404 | } | ||
| 2405 | return 0; | 2488 | return 0; |
| 2406 | 2489 | ||
| 2407 | Free: | 2490 | Free: |
| @@ -2410,10 +2493,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
| 2410 | } | 2493 | } |
| 2411 | 2494 | ||
| 2412 | /** | 2495 | /** |
| 2413 | * get_buffer - compute the address that snapshot_write_next() should | 2496 | * get_buffer - Get the address to store the next image data page. |
| 2414 | * set for its caller to write to. | 2497 | * |
| 2498 | * Get the address that snapshot_write_next() should return to its caller to | ||
| 2499 | * write to. | ||
| 2415 | */ | 2500 | */ |
| 2416 | |||
| 2417 | static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) | 2501 | static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) |
| 2418 | { | 2502 | { |
| 2419 | struct pbe *pbe; | 2503 | struct pbe *pbe; |
| @@ -2428,12 +2512,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) | |||
| 2428 | return get_highmem_page_buffer(page, ca); | 2512 | return get_highmem_page_buffer(page, ca); |
| 2429 | 2513 | ||
| 2430 | if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) | 2514 | if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) |
| 2431 | /* We have allocated the "original" page frame and we can | 2515 | /* |
| 2516 | * We have allocated the "original" page frame and we can | ||
| 2432 | * use it directly to store the loaded page. | 2517 | * use it directly to store the loaded page. |
| 2433 | */ | 2518 | */ |
| 2434 | return page_address(page); | 2519 | return page_address(page); |
| 2435 | 2520 | ||
| 2436 | /* The "original" page frame has not been allocated and we have to | 2521 | /* |
| 2522 | * The "original" page frame has not been allocated and we have to | ||
| 2437 | * use a "safe" page frame to store the loaded page. | 2523 | * use a "safe" page frame to store the loaded page. |
| 2438 | */ | 2524 | */ |
| 2439 | pbe = chain_alloc(ca, sizeof(struct pbe)); | 2525 | pbe = chain_alloc(ca, sizeof(struct pbe)); |
| @@ -2450,22 +2536,21 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) | |||
| 2450 | } | 2536 | } |
| 2451 | 2537 | ||
| 2452 | /** | 2538 | /** |
| 2453 | * snapshot_write_next - used for writing the system memory snapshot. | 2539 | * snapshot_write_next - Get the address to store the next image page. |
| 2540 | * @handle: Snapshot handle structure to guide the writing. | ||
| 2454 | * | 2541 | * |
| 2455 | * On the first call to it @handle should point to a zeroed | 2542 | * On the first call, @handle should point to a zeroed snapshot_handle |
| 2456 | * snapshot_handle structure. The structure gets updated and a pointer | 2543 | * structure. The structure gets populated then and a pointer to it should be |
| 2457 | * to it should be passed to this function every next time. | 2544 | * passed to this function on every subsequent call. |
| 2458 | * | 2545 | * |
| 2459 | * On success the function returns a positive number. Then, the caller | 2546 | * On success, the function returns a positive number. Then, the caller |
| 2460 | * is allowed to write up to the returned number of bytes to the memory | 2547 | * is allowed to write up to the returned number of bytes to the memory |
| 2461 | * location computed by the data_of() macro. | 2548 | * location computed by the data_of() macro. |
| 2462 | * | 2549 | * |
| 2463 | * The function returns 0 to indicate the "end of file" condition, | 2550 | * The function returns 0 to indicate the "end of file" condition. Negative |
| 2464 | * and a negative number is returned on error. In such cases the | 2551 | * numbers are returned on errors, in which cases the structure pointed to by |
| 2465 | * structure pointed to by @handle is not updated and should not be used | 2552 | * @handle is not updated and should not be used any more. |
| 2466 | * any more. | ||
| 2467 | */ | 2553 | */ |
| 2468 | |||
| 2469 | int snapshot_write_next(struct snapshot_handle *handle) | 2554 | int snapshot_write_next(struct snapshot_handle *handle) |
| 2470 | { | 2555 | { |
| 2471 | static struct chain_allocator ca; | 2556 | static struct chain_allocator ca; |
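The calling convention documented above is easiest to see from the consumer side. A minimal sketch of an image-loading loop built on this interface (read_from_image() is a stand-in for whatever actually supplies image data, e.g. the swap reader in kernel/power/swap.c; it is an assumption, and error handling is trimmed):

	static int load_image_sketch(struct snapshot_handle *handle)
	{
		int ret;

		memset(handle, 0, sizeof(*handle));	/* first call expects a zeroed handle */
		for (;;) {
			ret = snapshot_write_next(handle);
			if (ret <= 0)			/* 0: end of image, < 0: error */
				break;
			/* up to 'ret' bytes may be written at data_of(*handle) */
			ret = read_from_image(data_of(*handle), ret);
			if (ret)
				break;
		}
		if (!ret) {
			snapshot_write_finalize(handle);
			if (!snapshot_image_loaded(handle))
				ret = -ENODATA;
		}
		return ret;
	}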
| @@ -2491,6 +2576,8 @@ int snapshot_write_next(struct snapshot_handle *handle) | |||
| 2491 | if (error) | 2576 | if (error) |
| 2492 | return error; | 2577 | return error; |
| 2493 | 2578 | ||
| 2579 | safe_pages_list = NULL; | ||
| 2580 | |||
| 2494 | error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); | 2581 | error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); |
| 2495 | if (error) | 2582 | if (error) |
| 2496 | return error; | 2583 | return error; |
| @@ -2500,6 +2587,7 @@ int snapshot_write_next(struct snapshot_handle *handle) | |||
| 2500 | if (error) | 2587 | if (error) |
| 2501 | return error; | 2588 | return error; |
| 2502 | 2589 | ||
| 2590 | hibernate_restore_protection_begin(); | ||
| 2503 | } else if (handle->cur <= nr_meta_pages + 1) { | 2591 | } else if (handle->cur <= nr_meta_pages + 1) { |
| 2504 | error = unpack_orig_pfns(buffer, ©_bm); | 2592 | error = unpack_orig_pfns(buffer, ©_bm); |
| 2505 | if (error) | 2593 | if (error) |
| @@ -2522,6 +2610,7 @@ int snapshot_write_next(struct snapshot_handle *handle) | |||
| 2522 | copy_last_highmem_page(); | 2610 | copy_last_highmem_page(); |
| 2523 | /* Restore page key for data page (s390 only). */ | 2611 | /* Restore page key for data page (s390 only). */ |
| 2524 | page_key_write(handle->buffer); | 2612 | page_key_write(handle->buffer); |
| 2613 | hibernate_restore_protect_page(handle->buffer); | ||
| 2525 | handle->buffer = get_buffer(&orig_bm, &ca); | 2614 | handle->buffer = get_buffer(&orig_bm, &ca); |
| 2526 | if (IS_ERR(handle->buffer)) | 2615 | if (IS_ERR(handle->buffer)) |
| 2527 | return PTR_ERR(handle->buffer); | 2616 | return PTR_ERR(handle->buffer); |
| @@ -2533,22 +2622,23 @@ int snapshot_write_next(struct snapshot_handle *handle) | |||
| 2533 | } | 2622 | } |
| 2534 | 2623 | ||
| 2535 | /** | 2624 | /** |
| 2536 | * snapshot_write_finalize - must be called after the last call to | 2625 | * snapshot_write_finalize - Complete the loading of a hibernation image. |
| 2537 | * snapshot_write_next() in case the last page in the image happens | 2626 | * |
| 2538 | * to be a highmem page and its contents should be stored in the | 2627 | * Must be called after the last call to snapshot_write_next() in case the last |
| 2539 | * highmem. Additionally, it releases the memory that will not be | 2628 | * page in the image happens to be a highmem page and its contents should be |
| 2540 | * used any more. | 2629 | * stored in highmem. Additionally, it recycles bitmap memory that's not |
| 2630 | * necessary any more. | ||
| 2541 | */ | 2631 | */ |
| 2542 | |||
| 2543 | void snapshot_write_finalize(struct snapshot_handle *handle) | 2632 | void snapshot_write_finalize(struct snapshot_handle *handle) |
| 2544 | { | 2633 | { |
| 2545 | copy_last_highmem_page(); | 2634 | copy_last_highmem_page(); |
| 2546 | /* Restore page key for data page (s390 only). */ | 2635 | /* Restore page key for data page (s390 only). */ |
| 2547 | page_key_write(handle->buffer); | 2636 | page_key_write(handle->buffer); |
| 2548 | page_key_free(); | 2637 | page_key_free(); |
| 2549 | /* Free only if we have loaded the image entirely */ | 2638 | hibernate_restore_protect_page(handle->buffer); |
| 2639 | /* Do that only if we have loaded the image entirely */ | ||
| 2550 | if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { | 2640 | if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { |
| 2551 | memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); | 2641 | memory_bm_recycle(&orig_bm); |
| 2552 | free_highmem_data(); | 2642 | free_highmem_data(); |
| 2553 | } | 2643 | } |
| 2554 | } | 2644 | } |
| @@ -2561,8 +2651,8 @@ int snapshot_image_loaded(struct snapshot_handle *handle) | |||
| 2561 | 2651 | ||
| 2562 | #ifdef CONFIG_HIGHMEM | 2652 | #ifdef CONFIG_HIGHMEM |
| 2563 | /* Assumes that @buf is ready and points to a "safe" page */ | 2653 | /* Assumes that @buf is ready and points to a "safe" page */ |
| 2564 | static inline void | 2654 | static inline void swap_two_pages_data(struct page *p1, struct page *p2, |
| 2565 | swap_two_pages_data(struct page *p1, struct page *p2, void *buf) | 2655 | void *buf) |
| 2566 | { | 2656 | { |
| 2567 | void *kaddr1, *kaddr2; | 2657 | void *kaddr1, *kaddr2; |
| 2568 | 2658 | ||
| @@ -2576,15 +2666,15 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) | |||
| 2576 | } | 2666 | } |
| 2577 | 2667 | ||
| 2578 | /** | 2668 | /** |
| 2579 | * restore_highmem - for each highmem page that was allocated before | 2669 | * restore_highmem - Put highmem image pages into their original locations. |
| 2580 | * the suspend and included in the suspend image, and also has been | 2670 | * |
| 2581 | * allocated by the "resume" kernel swap its current (ie. "before | 2671 | * For each highmem page that was in use before hibernation and is included in |
| 2582 | * resume") contents with the previous (ie. "before suspend") one. | 2672 | * the image, and also has been allocated by the "restore" kernel, swap its |
| 2673 | * current contents with the previous (ie. "before hibernation") ones. | ||
| 2583 | * | 2674 | * |
| 2584 | * If the resume eventually fails, we can call this function once | 2675 | * If the restore eventually fails, we can call this function once again and |
| 2585 | * again and restore the "before resume" highmem state. | 2676 | * restore the highmem state as seen by the restore kernel. |
| 2586 | */ | 2677 | */ |
| 2587 | |||
| 2588 | int restore_highmem(void) | 2678 | int restore_highmem(void) |
| 2589 | { | 2679 | { |
| 2590 | struct highmem_pbe *pbe = highmem_pblist; | 2680 | struct highmem_pbe *pbe = highmem_pblist; |
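restore_highmem() relies on swap_two_pages_data() above, which exchanges the contents of two pages through a third, known-safe bounce page. The same idea in a plain, self-contained form (an illustration only; the kernel version additionally has to kmap_atomic() the two highmem pages before touching them):

	#include <string.h>

	/* Swap the contents of p1 and p2, using tmp as scratch space. */
	static void swap_buffers(void *p1, void *p2, void *tmp, size_t size)
	{
		memcpy(tmp, p1, size);	/* save p1           */
		memcpy(p1, p2, size);	/* p2 -> p1          */
		memcpy(p2, tmp, size);	/* saved p1 -> p2    */
	}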
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 5b70d64b871e..0acab9d7f96f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -266,16 +266,18 @@ static int suspend_test(int level) | |||
| 266 | */ | 266 | */ |
| 267 | static int suspend_prepare(suspend_state_t state) | 267 | static int suspend_prepare(suspend_state_t state) |
| 268 | { | 268 | { |
| 269 | int error; | 269 | int error, nr_calls = 0; |
| 270 | 270 | ||
| 271 | if (!sleep_state_supported(state)) | 271 | if (!sleep_state_supported(state)) |
| 272 | return -EPERM; | 272 | return -EPERM; |
| 273 | 273 | ||
| 274 | pm_prepare_console(); | 274 | pm_prepare_console(); |
| 275 | 275 | ||
| 276 | error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); | 276 | error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls); |
| 277 | if (error) | 277 | if (error) { |
| 278 | nr_calls--; | ||
| 278 | goto Finish; | 279 | goto Finish; |
| 280 | } | ||
| 279 | 281 | ||
| 280 | trace_suspend_resume(TPS("freeze_processes"), 0, true); | 282 | trace_suspend_resume(TPS("freeze_processes"), 0, true); |
| 281 | error = suspend_freeze_processes(); | 283 | error = suspend_freeze_processes(); |
| @@ -286,7 +288,7 @@ static int suspend_prepare(suspend_state_t state) | |||
| 286 | suspend_stats.failed_freeze++; | 288 | suspend_stats.failed_freeze++; |
| 287 | dpm_save_failed_step(SUSPEND_FREEZE); | 289 | dpm_save_failed_step(SUSPEND_FREEZE); |
| 288 | Finish: | 290 | Finish: |
| 289 | pm_notifier_call_chain(PM_POST_SUSPEND); | 291 | __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL); |
| 290 | pm_restore_console(); | 292 | pm_restore_console(); |
| 291 | return error; | 293 | return error; |
| 292 | } | 294 | } |
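The new nr_calls bookkeeping lets a failed PM_SUSPEND_PREPARE notification be unwound precisely: only the callbacks that actually saw the PREPARE event receive the matching POST event, and the callback that returned the error is excluded by the nr_calls-- above. The same pattern appears again in snapshot_open() in kernel/power/user.c further down. Schematically (a sketch of the caller-side contract assumed here; __pm_notifier_call_chain() itself is added elsewhere in this series and runs at most nr_to_call notifiers, reporting how many ran through *nr_calls):

	int nr_calls = 0;
	int error;

	error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
	if (error)
		nr_calls--;	/* exclude the notifier that failed */
	/* ... suspend work ... */
	/* notify only the callbacks that were actually called with PREPARE */
	__pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);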
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 160e1006640d..a3b1e617bcdc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
| @@ -261,7 +261,7 @@ static void hib_end_io(struct bio *bio) | |||
| 261 | bio_put(bio); | 261 | bio_put(bio); |
| 262 | } | 262 | } |
| 263 | 263 | ||
| 264 | static int hib_submit_io(int rw, pgoff_t page_off, void *addr, | 264 | static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, |
| 265 | struct hib_bio_batch *hb) | 265 | struct hib_bio_batch *hb) |
| 266 | { | 266 | { |
| 267 | struct page *page = virt_to_page(addr); | 267 | struct page *page = virt_to_page(addr); |
| @@ -271,6 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, | |||
| 271 | bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); | 271 | bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); |
| 272 | bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); | 272 | bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); |
| 273 | bio->bi_bdev = hib_resume_bdev; | 273 | bio->bi_bdev = hib_resume_bdev; |
| 274 | bio_set_op_attrs(bio, op, op_flags); | ||
| 274 | 275 | ||
| 275 | if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { | 276 | if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { |
| 276 | printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", | 277 | printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", |
| @@ -283,9 +284,9 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, | |||
| 283 | bio->bi_end_io = hib_end_io; | 284 | bio->bi_end_io = hib_end_io; |
| 284 | bio->bi_private = hb; | 285 | bio->bi_private = hb; |
| 285 | atomic_inc(&hb->count); | 286 | atomic_inc(&hb->count); |
| 286 | submit_bio(rw, bio); | 287 | submit_bio(bio); |
| 287 | } else { | 288 | } else { |
| 288 | error = submit_bio_wait(rw, bio); | 289 | error = submit_bio_wait(bio); |
| 289 | bio_put(bio); | 290 | bio_put(bio); |
| 290 | } | 291 | } |
| 291 | 292 | ||
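The hib_submit_io() changes above track the block-layer interface change in this kernel generation: the request operation now lives on the bio itself (bio_set_op_attrs()), and submit_bio()/submit_bio_wait() no longer take an rw argument. A minimal sketch of a synchronous single-page read in the new style (assuming the 4.8-era block APIs used in this patch; allocation failure and I/O error handling are trimmed):

	static int read_one_page(struct block_device *bdev, sector_t sector, void *addr)
	{
		struct bio *bio = bio_alloc(GFP_NOIO, 1);
		int ret;

		bio->bi_bdev = bdev;
		bio->bi_iter.bi_sector = sector;
		bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC);	/* op + flags set on the bio */
		bio_add_page(bio, virt_to_page(addr), PAGE_SIZE, 0);
		ret = submit_bio_wait(bio);			/* no rw argument any more */
		bio_put(bio);
		return ret;
	}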
| @@ -306,7 +307,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) | |||
| 306 | { | 307 | { |
| 307 | int error; | 308 | int error; |
| 308 | 309 | ||
| 309 | hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); | 310 | hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, |
| 311 | swsusp_header, NULL); | ||
| 310 | if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || | 312 | if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || |
| 311 | !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { | 313 | !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { |
| 312 | memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); | 314 | memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); |
| @@ -315,8 +317,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) | |||
| 315 | swsusp_header->flags = flags; | 317 | swsusp_header->flags = flags; |
| 316 | if (flags & SF_CRC32_MODE) | 318 | if (flags & SF_CRC32_MODE) |
| 317 | swsusp_header->crc32 = handle->crc32; | 319 | swsusp_header->crc32 = handle->crc32; |
| 318 | error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, | 320 | error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, |
| 319 | swsusp_header, NULL); | 321 | swsusp_resume_block, swsusp_header, NULL); |
| 320 | } else { | 322 | } else { |
| 321 | printk(KERN_ERR "PM: Swap header not found!\n"); | 323 | printk(KERN_ERR "PM: Swap header not found!\n"); |
| 322 | error = -ENODEV; | 324 | error = -ENODEV; |
| @@ -348,6 +350,12 @@ static int swsusp_swap_check(void) | |||
| 348 | if (res < 0) | 350 | if (res < 0) |
| 349 | blkdev_put(hib_resume_bdev, FMODE_WRITE); | 351 | blkdev_put(hib_resume_bdev, FMODE_WRITE); |
| 350 | 352 | ||
| 353 | /* | ||
| 354 | * Update the resume device to the one actually used, | ||
| 355 | * so the test_resume mode can use it in case it is | ||
| 356 | * invoked from hibernate() to test the snapshot. | ||
| 357 | */ | ||
| 358 | swsusp_resume_device = hib_resume_bdev->bd_dev; | ||
| 351 | return res; | 359 | return res; |
| 352 | } | 360 | } |
| 353 | 361 | ||
| @@ -389,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) | |||
| 389 | } else { | 397 | } else { |
| 390 | src = buf; | 398 | src = buf; |
| 391 | } | 399 | } |
| 392 | return hib_submit_io(WRITE_SYNC, offset, src, hb); | 400 | return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb); |
| 393 | } | 401 | } |
| 394 | 402 | ||
| 395 | static void release_swap_writer(struct swap_map_handle *handle) | 403 | static void release_swap_writer(struct swap_map_handle *handle) |
| @@ -992,7 +1000,8 @@ static int get_swap_reader(struct swap_map_handle *handle, | |||
| 992 | return -ENOMEM; | 1000 | return -ENOMEM; |
| 993 | } | 1001 | } |
| 994 | 1002 | ||
| 995 | error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL); | 1003 | error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, |
| 1004 | tmp->map, NULL); | ||
| 996 | if (error) { | 1005 | if (error) { |
| 997 | release_swap_reader(handle); | 1006 | release_swap_reader(handle); |
| 998 | return error; | 1007 | return error; |
| @@ -1016,7 +1025,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, | |||
| 1016 | offset = handle->cur->entries[handle->k]; | 1025 | offset = handle->cur->entries[handle->k]; |
| 1017 | if (!offset) | 1026 | if (!offset) |
| 1018 | return -EFAULT; | 1027 | return -EFAULT; |
| 1019 | error = hib_submit_io(READ_SYNC, offset, buf, hb); | 1028 | error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb); |
| 1020 | if (error) | 1029 | if (error) |
| 1021 | return error; | 1030 | return error; |
| 1022 | if (++handle->k >= MAP_PAGE_ENTRIES) { | 1031 | if (++handle->k >= MAP_PAGE_ENTRIES) { |
| @@ -1525,7 +1534,8 @@ int swsusp_check(void) | |||
| 1525 | if (!IS_ERR(hib_resume_bdev)) { | 1534 | if (!IS_ERR(hib_resume_bdev)) { |
| 1526 | set_blocksize(hib_resume_bdev, PAGE_SIZE); | 1535 | set_blocksize(hib_resume_bdev, PAGE_SIZE); |
| 1527 | clear_page(swsusp_header); | 1536 | clear_page(swsusp_header); |
| 1528 | error = hib_submit_io(READ_SYNC, swsusp_resume_block, | 1537 | error = hib_submit_io(REQ_OP_READ, READ_SYNC, |
| 1538 | swsusp_resume_block, | ||
| 1529 | swsusp_header, NULL); | 1539 | swsusp_header, NULL); |
| 1530 | if (error) | 1540 | if (error) |
| 1531 | goto put; | 1541 | goto put; |
| @@ -1533,7 +1543,8 @@ int swsusp_check(void) | |||
| 1533 | if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { | 1543 | if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { |
| 1534 | memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); | 1544 | memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); |
| 1535 | /* Reset swap signature now */ | 1545 | /* Reset swap signature now */ |
| 1536 | error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, | 1546 | error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, |
| 1547 | swsusp_resume_block, | ||
| 1537 | swsusp_header, NULL); | 1548 | swsusp_header, NULL); |
| 1538 | } else { | 1549 | } else { |
| 1539 | error = -EINVAL; | 1550 | error = -EINVAL; |
| @@ -1577,10 +1588,12 @@ int swsusp_unmark(void) | |||
| 1577 | { | 1588 | { |
| 1578 | int error; | 1589 | int error; |
| 1579 | 1590 | ||
| 1580 | hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); | 1591 | hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, |
| 1592 | swsusp_header, NULL); | ||
| 1581 | if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { | 1593 | if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { |
| 1582 | memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); | 1594 | memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); |
| 1583 | error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, | 1595 | error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, |
| 1596 | swsusp_resume_block, | ||
| 1584 | swsusp_header, NULL); | 1597 | swsusp_header, NULL); |
| 1585 | } else { | 1598 | } else { |
| 1586 | printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); | 1599 | printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); |
diff --git a/kernel/power/user.c b/kernel/power/user.c index 526e8911460a..35310b627388 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
| @@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1); | |||
| 47 | static int snapshot_open(struct inode *inode, struct file *filp) | 47 | static int snapshot_open(struct inode *inode, struct file *filp) |
| 48 | { | 48 | { |
| 49 | struct snapshot_data *data; | 49 | struct snapshot_data *data; |
| 50 | int error; | 50 | int error, nr_calls = 0; |
| 51 | 51 | ||
| 52 | if (!hibernation_available()) | 52 | if (!hibernation_available()) |
| 53 | return -EPERM; | 53 | return -EPERM; |
| @@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
| 74 | swap_type_of(swsusp_resume_device, 0, NULL) : -1; | 74 | swap_type_of(swsusp_resume_device, 0, NULL) : -1; |
| 75 | data->mode = O_RDONLY; | 75 | data->mode = O_RDONLY; |
| 76 | data->free_bitmaps = false; | 76 | data->free_bitmaps = false; |
| 77 | error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); | 77 | error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); |
| 78 | if (error) | 78 | if (error) |
| 79 | pm_notifier_call_chain(PM_POST_HIBERNATION); | 79 | __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL); |
| 80 | } else { | 80 | } else { |
| 81 | /* | 81 | /* |
| 82 | * Resuming. We may need to wait for the image device to | 82 | * Resuming. We may need to wait for the image device to |
| @@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
| 86 | 86 | ||
| 87 | data->swap = -1; | 87 | data->swap = -1; |
| 88 | data->mode = O_WRONLY; | 88 | data->mode = O_WRONLY; |
| 89 | error = pm_notifier_call_chain(PM_RESTORE_PREPARE); | 89 | error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); |
| 90 | if (!error) { | 90 | if (!error) { |
| 91 | error = create_basic_memory_bitmaps(); | 91 | error = create_basic_memory_bitmaps(); |
| 92 | data->free_bitmaps = !error; | 92 | data->free_bitmaps = !error; |
| 93 | } | 93 | } else |
| 94 | nr_calls--; | ||
| 95 | |||
| 94 | if (error) | 96 | if (error) |
| 95 | pm_notifier_call_chain(PM_POST_RESTORE); | 97 | __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); |
| 96 | } | 98 | } |
| 97 | if (error) | 99 | if (error) |
| 98 | atomic_inc(&snapshot_device_available); | 100 | atomic_inc(&snapshot_device_available); |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 60cdf6386763..d4de33934dac 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl) | |||
| 3177 | { | 3177 | { |
| 3178 | dump_stack_print_info(log_lvl); | 3178 | dump_stack_print_info(log_lvl); |
| 3179 | 3179 | ||
| 3180 | printk("%stask: %p ti: %p task.ti: %p\n", | 3180 | printk("%stask: %p task.stack: %p\n", |
| 3181 | log_lvl, current, current_thread_info(), | 3181 | log_lvl, current, task_stack_page(current)); |
| 3182 | task_thread_info(current)); | ||
| 3183 | } | 3182 | } |
| 3184 | 3183 | ||
| 3185 | #endif | 3184 | #endif |
diff --git a/kernel/profile.c b/kernel/profile.c index c2199e9901c9..2dbccf2d806c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
| @@ -328,68 +328,57 @@ out: | |||
| 328 | put_cpu(); | 328 | put_cpu(); |
| 329 | } | 329 | } |
| 330 | 330 | ||
| 331 | static int profile_cpu_callback(struct notifier_block *info, | 331 | static int profile_dead_cpu(unsigned int cpu) |
| 332 | unsigned long action, void *__cpu) | ||
| 333 | { | 332 | { |
| 334 | int node, cpu = (unsigned long)__cpu; | ||
| 335 | struct page *page; | 333 | struct page *page; |
| 334 | int i; | ||
| 336 | 335 | ||
| 337 | switch (action) { | 336 | if (prof_cpu_mask != NULL) |
| 338 | case CPU_UP_PREPARE: | 337 | cpumask_clear_cpu(cpu, prof_cpu_mask); |
| 339 | case CPU_UP_PREPARE_FROZEN: | 338 | |
| 340 | node = cpu_to_mem(cpu); | 339 | for (i = 0; i < 2; i++) { |
| 341 | per_cpu(cpu_profile_flip, cpu) = 0; | 340 | if (per_cpu(cpu_profile_hits, cpu)[i]) { |
| 342 | if (!per_cpu(cpu_profile_hits, cpu)[1]) { | 341 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); |
| 343 | page = __alloc_pages_node(node, | 342 | per_cpu(cpu_profile_hits, cpu)[i] = NULL; |
| 344 | GFP_KERNEL | __GFP_ZERO, | ||
| 345 | 0); | ||
| 346 | if (!page) | ||
| 347 | return notifier_from_errno(-ENOMEM); | ||
| 348 | per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); | ||
| 349 | } | ||
| 350 | if (!per_cpu(cpu_profile_hits, cpu)[0]) { | ||
| 351 | page = __alloc_pages_node(node, | ||
| 352 | GFP_KERNEL | __GFP_ZERO, | ||
| 353 | 0); | ||
| 354 | if (!page) | ||
| 355 | goto out_free; | ||
| 356 | per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); | ||
| 357 | } | ||
| 358 | break; | ||
| 359 | out_free: | ||
| 360 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); | ||
| 361 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; | ||
| 362 | __free_page(page); | ||
| 363 | return notifier_from_errno(-ENOMEM); | ||
| 364 | case CPU_ONLINE: | ||
| 365 | case CPU_ONLINE_FROZEN: | ||
| 366 | if (prof_cpu_mask != NULL) | ||
| 367 | cpumask_set_cpu(cpu, prof_cpu_mask); | ||
| 368 | break; | ||
| 369 | case CPU_UP_CANCELED: | ||
| 370 | case CPU_UP_CANCELED_FROZEN: | ||
| 371 | case CPU_DEAD: | ||
| 372 | case CPU_DEAD_FROZEN: | ||
| 373 | if (prof_cpu_mask != NULL) | ||
| 374 | cpumask_clear_cpu(cpu, prof_cpu_mask); | ||
| 375 | if (per_cpu(cpu_profile_hits, cpu)[0]) { | ||
| 376 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); | ||
| 377 | per_cpu(cpu_profile_hits, cpu)[0] = NULL; | ||
| 378 | __free_page(page); | 343 | __free_page(page); |
| 379 | } | 344 | } |
| 380 | if (per_cpu(cpu_profile_hits, cpu)[1]) { | 345 | } |
| 381 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); | 346 | return 0; |
| 382 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; | 347 | } |
| 383 | __free_page(page); | 348 | |
| 349 | static int profile_prepare_cpu(unsigned int cpu) | ||
| 350 | { | ||
| 351 | int i, node = cpu_to_mem(cpu); | ||
| 352 | struct page *page; | ||
| 353 | |||
| 354 | per_cpu(cpu_profile_flip, cpu) = 0; | ||
| 355 | |||
| 356 | for (i = 0; i < 2; i++) { | ||
| 357 | if (per_cpu(cpu_profile_hits, cpu)[i]) | ||
| 358 | continue; | ||
| 359 | |||
| 360 | page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | ||
| 361 | if (!page) { | ||
| 362 | profile_dead_cpu(cpu); | ||
| 363 | return -ENOMEM; | ||
| 384 | } | 364 | } |
| 385 | break; | 365 | per_cpu(cpu_profile_hits, cpu)[i] = page_address(page); |
| 366 | |||
| 386 | } | 367 | } |
| 387 | return NOTIFY_OK; | 368 | return 0; |
| 369 | } | ||
| 370 | |||
| 371 | static int profile_online_cpu(unsigned int cpu) | ||
| 372 | { | ||
| 373 | if (prof_cpu_mask != NULL) | ||
| 374 | cpumask_set_cpu(cpu, prof_cpu_mask); | ||
| 375 | |||
| 376 | return 0; | ||
| 388 | } | 377 | } |
| 378 | |||
| 389 | #else /* !CONFIG_SMP */ | 379 | #else /* !CONFIG_SMP */ |
| 390 | #define profile_flip_buffers() do { } while (0) | 380 | #define profile_flip_buffers() do { } while (0) |
| 391 | #define profile_discard_flip_buffers() do { } while (0) | 381 | #define profile_discard_flip_buffers() do { } while (0) |
| 392 | #define profile_cpu_callback NULL | ||
| 393 | 382 | ||
| 394 | static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) | 383 | static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) |
| 395 | { | 384 | { |
| @@ -531,83 +520,43 @@ static const struct file_operations proc_profile_operations = { | |||
| 531 | .llseek = default_llseek, | 520 | .llseek = default_llseek, |
| 532 | }; | 521 | }; |
| 533 | 522 | ||
| 534 | #ifdef CONFIG_SMP | 523 | int __ref create_proc_profile(void) |
| 535 | static void profile_nop(void *unused) | ||
| 536 | { | ||
| 537 | } | ||
| 538 | |||
| 539 | static int create_hash_tables(void) | ||
| 540 | { | 524 | { |
| 541 | int cpu; | 525 | struct proc_dir_entry *entry; |
| 542 | 526 | #ifdef CONFIG_SMP | |
| 543 | for_each_online_cpu(cpu) { | 527 | enum cpuhp_state online_state; |
| 544 | int node = cpu_to_mem(cpu); | ||
| 545 | struct page *page; | ||
| 546 | |||
| 547 | page = __alloc_pages_node(node, | ||
| 548 | GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, | ||
| 549 | 0); | ||
| 550 | if (!page) | ||
| 551 | goto out_cleanup; | ||
| 552 | per_cpu(cpu_profile_hits, cpu)[1] | ||
| 553 | = (struct profile_hit *)page_address(page); | ||
| 554 | page = __alloc_pages_node(node, | ||
| 555 | GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, | ||
| 556 | 0); | ||
| 557 | if (!page) | ||
| 558 | goto out_cleanup; | ||
| 559 | per_cpu(cpu_profile_hits, cpu)[0] | ||
| 560 | = (struct profile_hit *)page_address(page); | ||
| 561 | } | ||
| 562 | return 0; | ||
| 563 | out_cleanup: | ||
| 564 | prof_on = 0; | ||
| 565 | smp_mb(); | ||
| 566 | on_each_cpu(profile_nop, NULL, 1); | ||
| 567 | for_each_online_cpu(cpu) { | ||
| 568 | struct page *page; | ||
| 569 | |||
| 570 | if (per_cpu(cpu_profile_hits, cpu)[0]) { | ||
| 571 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); | ||
| 572 | per_cpu(cpu_profile_hits, cpu)[0] = NULL; | ||
| 573 | __free_page(page); | ||
| 574 | } | ||
| 575 | if (per_cpu(cpu_profile_hits, cpu)[1]) { | ||
| 576 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); | ||
| 577 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; | ||
| 578 | __free_page(page); | ||
| 579 | } | ||
| 580 | } | ||
| 581 | return -1; | ||
| 582 | } | ||
| 583 | #else | ||
| 584 | #define create_hash_tables() ({ 0; }) | ||
| 585 | #endif | 528 | #endif |
| 586 | 529 | ||
| 587 | int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ | ||
| 588 | { | ||
| 589 | struct proc_dir_entry *entry; | ||
| 590 | int err = 0; | 530 | int err = 0; |
| 591 | 531 | ||
| 592 | if (!prof_on) | 532 | if (!prof_on) |
| 593 | return 0; | 533 | return 0; |
| 594 | 534 | #ifdef CONFIG_SMP | |
| 595 | cpu_notifier_register_begin(); | 535 | err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE", |
| 596 | 536 | profile_prepare_cpu, profile_dead_cpu); | |
| 597 | if (create_hash_tables()) { | 537 | if (err) |
| 598 | err = -ENOMEM; | 538 | return err; |
| 599 | goto out; | 539 | |
| 600 | } | 540 | err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE", |
| 601 | 541 | profile_online_cpu, NULL); | |
| 542 | if (err < 0) | ||
| 543 | goto err_state_prep; | ||
| 544 | online_state = err; | ||
| 545 | err = 0; | ||
| 546 | #endif | ||
| 602 | entry = proc_create("profile", S_IWUSR | S_IRUGO, | 547 | entry = proc_create("profile", S_IWUSR | S_IRUGO, |
| 603 | NULL, &proc_profile_operations); | 548 | NULL, &proc_profile_operations); |
| 604 | if (!entry) | 549 | if (!entry) |
| 605 | goto out; | 550 | goto err_state_onl; |
| 606 | proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); | 551 | proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); |
| 607 | __hotcpu_notifier(profile_cpu_callback, 0); | ||
| 608 | 552 | ||
| 609 | out: | 553 | return err; |
| 610 | cpu_notifier_register_done(); | 554 | err_state_onl: |
| 555 | #ifdef CONFIG_SMP | ||
| 556 | cpuhp_remove_state(online_state); | ||
| 557 | err_state_prep: | ||
| 558 | cpuhp_remove_state(CPUHP_PROFILE_PREPARE); | ||
| 559 | #endif | ||
| 611 | return err; | 560 | return err; |
| 612 | } | 561 | } |
| 613 | subsys_initcall(create_proc_profile); | 562 | subsys_initcall(create_proc_profile); |
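The profile.c conversion above is an instance of the generic hotplug state machine pattern: a statically numbered PREPARE state with symmetric setup/teardown callbacks, plus a dynamically allocated AP-online state whose number is returned by cpuhp_setup_state(). A generic sketch of that pattern (the foo_* names and the CPUHP_FOO_PREPARE enum entry are placeholders, not part of this patch; a real static state also needs an entry in include/linux/cpuhotplug.h):

	static int foo_prepare_cpu(unsigned int cpu)	{ /* allocate per-CPU data */ return 0; }
	static int foo_dead_cpu(unsigned int cpu)	{ /* free per-CPU data */ return 0; }
	static int foo_online_cpu(unsigned int cpu)	{ /* enable on this CPU */ return 0; }

	static int foo_init(void)
	{
		int ret;

		ret = cpuhp_setup_state(CPUHP_FOO_PREPARE, "FOO_PREPARE",
					foo_prepare_cpu, foo_dead_cpu);
		if (ret)
			return ret;

		/* CPUHP_AP_ONLINE_DYN is dynamic: the allocated state number is returned */
		ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_FOO_ONLINE",
					foo_online_cpu, NULL);
		if (ret < 0) {
			cpuhp_remove_state(CPUHP_FOO_PREPARE);
			return ret;
		}
		return 0;
	}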
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 3cee0d8393ed..d38ab08a3fe7 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c | |||
| @@ -58,7 +58,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); | |||
| 58 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ | 58 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ |
| 59 | do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) | 59 | do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) |
| 60 | 60 | ||
| 61 | torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); | 61 | torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); |
| 62 | torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); | 62 | torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); |
| 63 | torture_param(int, nreaders, -1, "Number of RCU reader threads"); | 63 | torture_param(int, nreaders, -1, "Number of RCU reader threads"); |
| 64 | torture_param(int, nwriters, -1, "Number of RCU updater threads"); | 64 | torture_param(int, nwriters, -1, "Number of RCU updater threads"); |
| @@ -96,12 +96,7 @@ static int rcu_perf_writer_state; | |||
| 96 | #define MAX_MEAS 10000 | 96 | #define MAX_MEAS 10000 |
| 97 | #define MIN_MEAS 100 | 97 | #define MIN_MEAS 100 |
| 98 | 98 | ||
| 99 | #if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) | 99 | static int perf_runnable = IS_ENABLED(MODULE); |
| 100 | #define RCUPERF_RUNNABLE_INIT 1 | ||
| 101 | #else | ||
| 102 | #define RCUPERF_RUNNABLE_INIT 0 | ||
| 103 | #endif | ||
| 104 | static int perf_runnable = RCUPERF_RUNNABLE_INIT; | ||
| 105 | module_param(perf_runnable, int, 0444); | 100 | module_param(perf_runnable, int, 0444); |
| 106 | MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); | 101 | MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); |
| 107 | 102 | ||
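The perf_runnable change above works because modular objects are compiled with -DMODULE, and IS_ENABLED(x) evaluates to 1 when x is defined to 1, so the one-liner keeps the old default of starting the test at boot only when built as a module. The old #if also honoured CONFIG_RCU_PERF_TEST_RUNNABLE; dropping that check assumes the Kconfig knob is removed elsewhere in this series (the removal itself is not shown here). The rcutorture hunk further down makes the same change for torture_runnable. Schematically:

	/* built-in:  IS_ENABLED(MODULE) == 0  ->  perf_runnable defaults to 0 */
	/* module:    IS_ENABLED(MODULE) == 1  ->  perf_runnable defaults to 1 */
	static int perf_runnable = IS_ENABLED(MODULE);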
| @@ -363,8 +358,6 @@ rcu_perf_writer(void *arg) | |||
| 363 | u64 *wdpp = writer_durations[me]; | 358 | u64 *wdpp = writer_durations[me]; |
| 364 | 359 | ||
| 365 | VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); | 360 | VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); |
| 366 | WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); | ||
| 367 | WARN_ON(rcu_gp_is_normal() && gp_exp); | ||
| 368 | WARN_ON(!wdpp); | 361 | WARN_ON(!wdpp); |
| 369 | set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); | 362 | set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); |
| 370 | sp.sched_priority = 1; | 363 | sp.sched_priority = 1; |
| @@ -631,12 +624,24 @@ rcu_perf_init(void) | |||
| 631 | firsterr = -ENOMEM; | 624 | firsterr = -ENOMEM; |
| 632 | goto unwind; | 625 | goto unwind; |
| 633 | } | 626 | } |
| 627 | if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { | ||
| 628 | VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); | ||
| 629 | firsterr = -EINVAL; | ||
| 630 | goto unwind; | ||
| 631 | } | ||
| 632 | if (rcu_gp_is_normal() && gp_exp) { | ||
| 633 | VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); | ||
| 634 | firsterr = -EINVAL; | ||
| 635 | goto unwind; | ||
| 636 | } | ||
| 634 | for (i = 0; i < nrealwriters; i++) { | 637 | for (i = 0; i < nrealwriters; i++) { |
| 635 | writer_durations[i] = | 638 | writer_durations[i] = |
| 636 | kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), | 639 | kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), |
| 637 | GFP_KERNEL); | 640 | GFP_KERNEL); |
| 638 | if (!writer_durations[i]) | 641 | if (!writer_durations[i]) { |
| 642 | firsterr = -ENOMEM; | ||
| 639 | goto unwind; | 643 | goto unwind; |
| 644 | } | ||
| 640 | firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, | 645 | firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, |
| 641 | writer_tasks[i]); | 646 | writer_tasks[i]); |
| 642 | if (firsterr) | 647 | if (firsterr) |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 084a28a732eb..971e2b138063 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
| @@ -182,12 +182,7 @@ static const char *rcu_torture_writer_state_getname(void) | |||
| 182 | return rcu_torture_writer_state_names[i]; | 182 | return rcu_torture_writer_state_names[i]; |
| 183 | } | 183 | } |
| 184 | 184 | ||
| 185 | #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) | 185 | static int torture_runnable = IS_ENABLED(MODULE); |
| 186 | #define RCUTORTURE_RUNNABLE_INIT 1 | ||
| 187 | #else | ||
| 188 | #define RCUTORTURE_RUNNABLE_INIT 0 | ||
| 189 | #endif | ||
| 190 | static int torture_runnable = RCUTORTURE_RUNNABLE_INIT; | ||
| 191 | module_param(torture_runnable, int, 0444); | 186 | module_param(torture_runnable, int, 0444); |
| 192 | MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); | 187 | MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); |
| 193 | 188 | ||
| @@ -1476,7 +1471,7 @@ static int rcu_torture_barrier_cbs(void *arg) | |||
| 1476 | break; | 1471 | break; |
| 1477 | /* | 1472 | /* |
| 1478 | * The above smp_load_acquire() ensures barrier_phase load | 1473 | * The above smp_load_acquire() ensures barrier_phase load |
| 1479 | * is ordered before the folloiwng ->call(). | 1474 | * is ordered before the following ->call(). |
| 1480 | */ | 1475 | */ |
| 1481 | local_irq_disable(); /* Just to test no-irq call_rcu(). */ | 1476 | local_irq_disable(); /* Just to test no-irq call_rcu(). */ |
| 1482 | cur_ops->call(&rcu, rcu_torture_barrier_cbf); | 1477 | cur_ops->call(&rcu, rcu_torture_barrier_cbf); |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c7f1bc4f817c..5d80925e7fc8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -125,12 +125,14 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; | |||
| 125 | /* Number of rcu_nodes at specified level. */ | 125 | /* Number of rcu_nodes at specified level. */ |
| 126 | static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; | 126 | static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; |
| 127 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ | 127 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ |
| 128 | /* panic() on RCU Stall sysctl. */ | ||
| 129 | int sysctl_panic_on_rcu_stall __read_mostly; | ||
| 128 | 130 | ||
| 129 | /* | 131 | /* |
| 130 | * The rcu_scheduler_active variable transitions from zero to one just | 132 | * The rcu_scheduler_active variable transitions from zero to one just |
| 131 | * before the first task is spawned. So when this variable is zero, RCU | 133 | * before the first task is spawned. So when this variable is zero, RCU |
| 132 | * can assume that there is but one task, allowing RCU to (for example) | 134 | * can assume that there is but one task, allowing RCU to (for example) |
| 133 | * optimize synchronize_sched() to a simple barrier(). When this variable | 135 | * optimize synchronize_rcu() to a simple barrier(). When this variable |
| 134 | * is one, RCU must actually do all the hard work required to detect real | 136 | * is one, RCU must actually do all the hard work required to detect real |
| 135 | * grace periods. This variable is also used to suppress boot-time false | 137 | * grace periods. This variable is also used to suppress boot-time false |
| 136 | * positives from lockdep-RCU error checking. | 138 | * positives from lockdep-RCU error checking. |
| @@ -159,6 +161,7 @@ static void invoke_rcu_core(void); | |||
| 159 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 161 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); |
| 160 | static void rcu_report_exp_rdp(struct rcu_state *rsp, | 162 | static void rcu_report_exp_rdp(struct rcu_state *rsp, |
| 161 | struct rcu_data *rdp, bool wake); | 163 | struct rcu_data *rdp, bool wake); |
| 164 | static void sync_sched_exp_online_cleanup(int cpu); | ||
| 162 | 165 | ||
| 163 | /* rcuc/rcub kthread realtime priority */ | 166 | /* rcuc/rcub kthread realtime priority */ |
| 164 | #ifdef CONFIG_RCU_KTHREAD_PRIO | 167 | #ifdef CONFIG_RCU_KTHREAD_PRIO |
| @@ -1070,11 +1073,11 @@ EXPORT_SYMBOL_GPL(rcu_is_watching); | |||
| 1070 | * offline to continue to use RCU for one jiffy after marking itself | 1073 | * offline to continue to use RCU for one jiffy after marking itself |
| 1071 | * offline in the cpu_online_mask. This leniency is necessary given the | 1074 | * offline in the cpu_online_mask. This leniency is necessary given the |
| 1072 | * non-atomic nature of the online and offline processing, for example, | 1075 | * non-atomic nature of the online and offline processing, for example, |
| 1073 | * the fact that a CPU enters the scheduler after completing the CPU_DYING | 1076 | * the fact that a CPU enters the scheduler after completing the teardown |
| 1074 | * notifiers. | 1077 | * of the CPU. |
| 1075 | * | 1078 | * |
| 1076 | * This is also why RCU internally marks CPUs online during the | 1079 | * This is also why RCU internally marks CPUs online during the |
| 1077 | * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. | 1080 | * preparation phase and offline after the CPU has been taken down. |
| 1078 | * | 1081 | * |
| 1079 | * Disable checking if in an NMI handler because we cannot safely report | 1082 | * Disable checking if in an NMI handler because we cannot safely report |
| 1080 | * errors from NMI handlers anyway. | 1083 | * errors from NMI handlers anyway. |
| @@ -1284,9 +1287,9 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) | |||
| 1284 | rcu_for_each_leaf_node(rsp, rnp) { | 1287 | rcu_for_each_leaf_node(rsp, rnp) { |
| 1285 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | 1288 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 1286 | if (rnp->qsmask != 0) { | 1289 | if (rnp->qsmask != 0) { |
| 1287 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | 1290 | for_each_leaf_node_possible_cpu(rnp, cpu) |
| 1288 | if (rnp->qsmask & (1UL << cpu)) | 1291 | if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) |
| 1289 | dump_cpu_task(rnp->grplo + cpu); | 1292 | dump_cpu_task(cpu); |
| 1290 | } | 1293 | } |
| 1291 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 1294 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| 1292 | } | 1295 | } |
| @@ -1311,6 +1314,12 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) | |||
| 1311 | } | 1314 | } |
| 1312 | } | 1315 | } |
| 1313 | 1316 | ||
| 1317 | static inline void panic_on_rcu_stall(void) | ||
| 1318 | { | ||
| 1319 | if (sysctl_panic_on_rcu_stall) | ||
| 1320 | panic("RCU Stall\n"); | ||
| 1321 | } | ||
| 1322 | |||
| 1314 | static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) | 1323 | static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) |
| 1315 | { | 1324 | { |
| 1316 | int cpu; | 1325 | int cpu; |
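panic_on_rcu_stall() above only consults the flag; the flag itself is exposed to userspace by a companion change to kernel/sysctl.c that is not part of this excerpt. A sketch of what such an integer-knob ctl_table entry typically looks like (an assumption about the companion change, not code shown in this diff):

	{
		.procname	= "panic_on_rcu_stall",
		.data		= &sysctl_panic_on_rcu_stall,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,	/* clamp the value to 0..1 */
		.extra2		= &one,
	},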
| @@ -1351,10 +1360,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) | |||
| 1351 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | 1360 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 1352 | ndetected += rcu_print_task_stall(rnp); | 1361 | ndetected += rcu_print_task_stall(rnp); |
| 1353 | if (rnp->qsmask != 0) { | 1362 | if (rnp->qsmask != 0) { |
| 1354 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | 1363 | for_each_leaf_node_possible_cpu(rnp, cpu) |
| 1355 | if (rnp->qsmask & (1UL << cpu)) { | 1364 | if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { |
| 1356 | print_cpu_stall_info(rsp, | 1365 | print_cpu_stall_info(rsp, cpu); |
| 1357 | rnp->grplo + cpu); | ||
| 1358 | ndetected++; | 1366 | ndetected++; |
| 1359 | } | 1367 | } |
| 1360 | } | 1368 | } |
| @@ -1390,6 +1398,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) | |||
| 1390 | 1398 | ||
| 1391 | rcu_check_gp_kthread_starvation(rsp); | 1399 | rcu_check_gp_kthread_starvation(rsp); |
| 1392 | 1400 | ||
| 1401 | panic_on_rcu_stall(); | ||
| 1402 | |||
| 1393 | force_quiescent_state(rsp); /* Kick them all. */ | 1403 | force_quiescent_state(rsp); /* Kick them all. */ |
| 1394 | } | 1404 | } |
| 1395 | 1405 | ||
| @@ -1430,6 +1440,8 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
| 1430 | jiffies + 3 * rcu_jiffies_till_stall_check() + 3); | 1440 | jiffies + 3 * rcu_jiffies_till_stall_check() + 3); |
| 1431 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 1441 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| 1432 | 1442 | ||
| 1443 | panic_on_rcu_stall(); | ||
| 1444 | |||
| 1433 | /* | 1445 | /* |
| 1434 | * Attempt to revive the RCU machinery by forcing a context switch. | 1446 | * Attempt to revive the RCU machinery by forcing a context switch. |
| 1435 | * | 1447 | * |
| @@ -1989,8 +2001,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) | |||
| 1989 | * of the tree within the rsp->node[] array. Note that other CPUs | 2001 | * of the tree within the rsp->node[] array. Note that other CPUs |
| 1990 | * will access only the leaves of the hierarchy, thus seeing that no | 2002 | * will access only the leaves of the hierarchy, thus seeing that no |
| 1991 | * grace period is in progress, at least until the corresponding | 2003 | * grace period is in progress, at least until the corresponding |
| 1992 | * leaf node has been initialized. In addition, we have excluded | 2004 | * leaf node has been initialized. |
| 1993 | * CPU-hotplug operations. | ||
| 1994 | * | 2005 | * |
| 1995 | * The grace period cannot complete until the initialization | 2006 | * The grace period cannot complete until the initialization |
| 1996 | * process finishes, because this kthread handles both. | 2007 | * process finishes, because this kthread handles both. |
| @@ -2872,7 +2883,6 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
| 2872 | unsigned long *maxj), | 2883 | unsigned long *maxj), |
| 2873 | bool *isidle, unsigned long *maxj) | 2884 | bool *isidle, unsigned long *maxj) |
| 2874 | { | 2885 | { |
| 2875 | unsigned long bit; | ||
| 2876 | int cpu; | 2886 | int cpu; |
| 2877 | unsigned long flags; | 2887 | unsigned long flags; |
| 2878 | unsigned long mask; | 2888 | unsigned long mask; |
| @@ -2907,9 +2917,8 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
| 2907 | continue; | 2917 | continue; |
| 2908 | } | 2918 | } |
| 2909 | } | 2919 | } |
| 2910 | cpu = rnp->grplo; | 2920 | for_each_leaf_node_possible_cpu(rnp, cpu) { |
| 2911 | bit = 1; | 2921 | unsigned long bit = leaf_node_cpu_bit(rnp, cpu); |
| 2912 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | ||
| 2913 | if ((rnp->qsmask & bit) != 0) { | 2922 | if ((rnp->qsmask & bit) != 0) { |
| 2914 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) | 2923 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) |
| 2915 | mask |= bit; | 2924 | mask |= bit; |
| @@ -3448,549 +3457,6 @@ static bool rcu_seq_done(unsigned long *sp, unsigned long s) | |||
| 3448 | return ULONG_CMP_GE(READ_ONCE(*sp), s); | 3457 | return ULONG_CMP_GE(READ_ONCE(*sp), s); |
| 3449 | } | 3458 | } |
| 3450 | 3459 | ||
| 3451 | /* Wrapper functions for expedited grace periods. */ | ||
| 3452 | static void rcu_exp_gp_seq_start(struct rcu_state *rsp) | ||
| 3453 | { | ||
| 3454 | rcu_seq_start(&rsp->expedited_sequence); | ||
| 3455 | } | ||
| 3456 | static void rcu_exp_gp_seq_end(struct rcu_state *rsp) | ||
| 3457 | { | ||
| 3458 | rcu_seq_end(&rsp->expedited_sequence); | ||
| 3459 | smp_mb(); /* Ensure that consecutive grace periods serialize. */ | ||
| 3460 | } | ||
| 3461 | static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) | ||
| 3462 | { | ||
| 3463 | unsigned long s; | ||
| 3464 | |||
| 3465 | smp_mb(); /* Caller's modifications seen first by other CPUs. */ | ||
| 3466 | s = rcu_seq_snap(&rsp->expedited_sequence); | ||
| 3467 | trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); | ||
| 3468 | return s; | ||
| 3469 | } | ||
| 3470 | static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) | ||
| 3471 | { | ||
| 3472 | return rcu_seq_done(&rsp->expedited_sequence, s); | ||
| 3473 | } | ||
| 3474 | |||
| 3475 | /* | ||
| 3476 | * Reset the ->expmaskinit values in the rcu_node tree to reflect any | ||
| 3477 | * recent CPU-online activity. Note that these masks are not cleared | ||
| 3478 | * when CPUs go offline, so they reflect the union of all CPUs that have | ||
| 3479 | * ever been online. This means that this function normally takes its | ||
| 3480 | * no-work-to-do fastpath. | ||
| 3481 | */ | ||
| 3482 | static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) | ||
| 3483 | { | ||
| 3484 | bool done; | ||
| 3485 | unsigned long flags; | ||
| 3486 | unsigned long mask; | ||
| 3487 | unsigned long oldmask; | ||
| 3488 | int ncpus = READ_ONCE(rsp->ncpus); | ||
| 3489 | struct rcu_node *rnp; | ||
| 3490 | struct rcu_node *rnp_up; | ||
| 3491 | |||
| 3492 | /* If no new CPUs onlined since last time, nothing to do. */ | ||
| 3493 | if (likely(ncpus == rsp->ncpus_snap)) | ||
| 3494 | return; | ||
| 3495 | rsp->ncpus_snap = ncpus; | ||
| 3496 | |||
| 3497 | /* | ||
| 3498 | * Each pass through the following loop propagates newly onlined | ||
| 3499 | * CPUs for the current rcu_node structure up the rcu_node tree. | ||
| 3500 | */ | ||
| 3501 | rcu_for_each_leaf_node(rsp, rnp) { | ||
| 3502 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 3503 | if (rnp->expmaskinit == rnp->expmaskinitnext) { | ||
| 3504 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 3505 | continue; /* No new CPUs, nothing to do. */ | ||
| 3506 | } | ||
| 3507 | |||
| 3508 | /* Update this node's mask, track old value for propagation. */ | ||
| 3509 | oldmask = rnp->expmaskinit; | ||
| 3510 | rnp->expmaskinit = rnp->expmaskinitnext; | ||
| 3511 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 3512 | |||
| 3513 | /* If was already nonzero, nothing to propagate. */ | ||
| 3514 | if (oldmask) | ||
| 3515 | continue; | ||
| 3516 | |||
| 3517 | /* Propagate the new CPU up the tree. */ | ||
| 3518 | mask = rnp->grpmask; | ||
| 3519 | rnp_up = rnp->parent; | ||
| 3520 | done = false; | ||
| 3521 | while (rnp_up) { | ||
| 3522 | raw_spin_lock_irqsave_rcu_node(rnp_up, flags); | ||
| 3523 | if (rnp_up->expmaskinit) | ||
| 3524 | done = true; | ||
| 3525 | rnp_up->expmaskinit |= mask; | ||
| 3526 | raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); | ||
| 3527 | if (done) | ||
| 3528 | break; | ||
| 3529 | mask = rnp_up->grpmask; | ||
| 3530 | rnp_up = rnp_up->parent; | ||
| 3531 | } | ||
| 3532 | } | ||
| 3533 | } | ||
| 3534 | |||
| 3535 | /* | ||
| 3536 | * Reset the ->expmask values in the rcu_node tree in preparation for | ||
| 3537 | * a new expedited grace period. | ||
| 3538 | */ | ||
| 3539 | static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) | ||
| 3540 | { | ||
| 3541 | unsigned long flags; | ||
| 3542 | struct rcu_node *rnp; | ||
| 3543 | |||
| 3544 | sync_exp_reset_tree_hotplug(rsp); | ||
| 3545 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
| 3546 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 3547 | WARN_ON_ONCE(rnp->expmask); | ||
| 3548 | rnp->expmask = rnp->expmaskinit; | ||
| 3549 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 3550 | } | ||
| 3551 | } | ||
| 3552 | |||
| 3553 | /* | ||
| 3554 | * Return non-zero if there is no RCU expedited grace period in progress | ||
| 3555 | * for the specified rcu_node structure, in other words, if all CPUs and | ||
| 3556 | * tasks covered by the specified rcu_node structure have done their bit | ||
| 3557 | * for the current expedited grace period. Works only for preemptible | ||
| 3558 | * RCU -- other RCU implementation use other means. | ||
| 3559 | * | ||
| 3560 | * Caller must hold the rcu_state's exp_mutex. | ||
| 3561 | */ | ||
| 3562 | static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | ||
| 3563 | { | ||
| 3564 | return rnp->exp_tasks == NULL && | ||
| 3565 | READ_ONCE(rnp->expmask) == 0; | ||
| 3566 | } | ||
| 3567 | |||
| 3568 | /* | ||
| 3569 | * Report the exit from RCU read-side critical section for the last task | ||
| 3570 | * that queued itself during or before the current expedited preemptible-RCU | ||
| 3571 | * grace period. This event is reported either to the rcu_node structure on | ||
| 3572 | * which the task was queued or to one of that rcu_node structure's ancestors, | ||
| 3573 | * recursively up the tree. (Calm down, calm down, we do the recursion | ||
| 3574 | * iteratively!) | ||
| 3575 | * | ||
| 3576 | * Caller must hold the rcu_state's exp_mutex and the specified rcu_node | ||
| 3577 | * structure's ->lock. | ||
| 3578 | */ | ||
| 3579 | static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 3580 | bool wake, unsigned long flags) | ||
| 3581 | __releases(rnp->lock) | ||
| 3582 | { | ||
| 3583 | unsigned long mask; | ||
| 3584 | |||
| 3585 | for (;;) { | ||
| 3586 | if (!sync_rcu_preempt_exp_done(rnp)) { | ||
| 3587 | if (!rnp->expmask) | ||
| 3588 | rcu_initiate_boost(rnp, flags); | ||
| 3589 | else | ||
| 3590 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 3591 | break; | ||
| 3592 | } | ||
| 3593 | if (rnp->parent == NULL) { | ||
| 3594 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 3595 | if (wake) { | ||
| 3596 | smp_mb(); /* EGP done before wake_up(). */ | ||
| 3597 | swake_up(&rsp->expedited_wq); | ||
| 3598 | } | ||
| 3599 | break; | ||
| 3600 | } | ||
| 3601 | mask = rnp->grpmask; | ||
| 3602 | raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ | ||
| 3603 | rnp = rnp->parent; | ||
| 3604 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ | ||
| 3605 | WARN_ON_ONCE(!(rnp->expmask & mask)); | ||
| 3606 | rnp->expmask &= ~mask; | ||
| 3607 | } | ||
| 3608 | } | ||
| 3609 | |||
| 3610 | /* | ||
| 3611 | * Report expedited quiescent state for specified node. This is a | ||
| 3612 | * lock-acquisition wrapper function for __rcu_report_exp_rnp(). | ||
| 3613 | * | ||
| 3614 | * Caller must hold the rcu_state's exp_mutex. | ||
| 3615 | */ | ||
| 3616 | static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, | ||
| 3617 | struct rcu_node *rnp, bool wake) | ||
| 3618 | { | ||
| 3619 | unsigned long flags; | ||
| 3620 | |||
| 3621 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 3622 | __rcu_report_exp_rnp(rsp, rnp, wake, flags); | ||
| 3623 | } | ||
| 3624 | |||
| 3625 | /* | ||
| 3626 | * Report expedited quiescent state for multiple CPUs, all covered by the | ||
| 3627 | * specified leaf rcu_node structure. Caller must hold the rcu_state's | ||
| 3628 | * exp_mutex. | ||
| 3629 | */ | ||
| 3630 | static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 3631 | unsigned long mask, bool wake) | ||
| 3632 | { | ||
| 3633 | unsigned long flags; | ||
| 3634 | |||
| 3635 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 3636 | if (!(rnp->expmask & mask)) { | ||
| 3637 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 3638 | return; | ||
| 3639 | } | ||
| 3640 | rnp->expmask &= ~mask; | ||
| 3641 | __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ | ||
| 3642 | } | ||
| 3643 | |||
| 3644 | /* | ||
| 3645 | * Report expedited quiescent state for specified rcu_data (CPU). | ||
| 3646 | */ | ||
| 3647 | static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, | ||
| 3648 | bool wake) | ||
| 3649 | { | ||
| 3650 | rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); | ||
| 3651 | } | ||
| 3652 | |||
| 3653 | /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ | ||
| 3654 | static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, | ||
| 3655 | unsigned long s) | ||
| 3656 | { | ||
| 3657 | if (rcu_exp_gp_seq_done(rsp, s)) { | ||
| 3658 | trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); | ||
| 3659 | /* Ensure test happens before caller kfree(). */ | ||
| 3660 | smp_mb__before_atomic(); /* ^^^ */ | ||
| 3661 | atomic_long_inc(stat); | ||
| 3662 | return true; | ||
| 3663 | } | ||
| 3664 | return false; | ||
| 3665 | } | ||
| 3666 | |||
| 3667 | /* | ||
| 3668 | * Funnel-lock acquisition for expedited grace periods. Returns true | ||
| 3669 | * if some other task completed an expedited grace period that this task | ||
| 3670 | * can piggy-back on, and with no mutex held. Otherwise, returns false | ||
| 3671 | * with the mutex held, indicating that the caller must actually do the | ||
| 3672 | * expedited grace period. | ||
| 3673 | */ | ||
| 3674 | static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) | ||
| 3675 | { | ||
| 3676 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); | ||
| 3677 | struct rcu_node *rnp = rdp->mynode; | ||
| 3678 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
| 3679 | |||
| 3680 | /* Low-contention fastpath. */ | ||
| 3681 | if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && | ||
| 3682 | (rnp == rnp_root || | ||
| 3683 | ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && | ||
| 3684 | !mutex_is_locked(&rsp->exp_mutex) && | ||
| 3685 | mutex_trylock(&rsp->exp_mutex)) | ||
| 3686 | goto fastpath; | ||
| 3687 | |||
| 3688 | /* | ||
| 3689 | * Each pass through the following loop works its way up | ||
| 3690 | * the rcu_node tree, returning if others have done the work or | ||
| 3691 | * otherwise falls through to acquire rsp->exp_mutex. The mapping | ||
| 3692 | * from CPU to rcu_node structure can be inexact, as it is just | ||
| 3693 | * promoting locality and is not strictly needed for correctness. | ||
| 3694 | */ | ||
| 3695 | for (; rnp != NULL; rnp = rnp->parent) { | ||
| 3696 | if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) | ||
| 3697 | return true; | ||
| 3698 | |||
| 3699 | /* Work not done, either wait here or go up. */ | ||
| 3700 | spin_lock(&rnp->exp_lock); | ||
| 3701 | if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { | ||
| 3702 | |||
| 3703 | /* Someone else doing GP, so wait for them. */ | ||
| 3704 | spin_unlock(&rnp->exp_lock); | ||
| 3705 | trace_rcu_exp_funnel_lock(rsp->name, rnp->level, | ||
| 3706 | rnp->grplo, rnp->grphi, | ||
| 3707 | TPS("wait")); | ||
| 3708 | wait_event(rnp->exp_wq[(s >> 1) & 0x3], | ||
| 3709 | sync_exp_work_done(rsp, | ||
| 3710 | &rdp->exp_workdone2, s)); | ||
| 3711 | return true; | ||
| 3712 | } | ||
| 3713 | rnp->exp_seq_rq = s; /* Followers can wait on us. */ | ||
| 3714 | spin_unlock(&rnp->exp_lock); | ||
| 3715 | trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, | ||
| 3716 | rnp->grphi, TPS("nxtlvl")); | ||
| 3717 | } | ||
| 3718 | mutex_lock(&rsp->exp_mutex); | ||
| 3719 | fastpath: | ||
| 3720 | if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { | ||
| 3721 | mutex_unlock(&rsp->exp_mutex); | ||
| 3722 | return true; | ||
| 3723 | } | ||
| 3724 | rcu_exp_gp_seq_start(rsp); | ||
| 3725 | trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); | ||
| 3726 | return false; | ||
| 3727 | } | ||
| 3728 | |||
| 3729 | /* Invoked on each online non-idle CPU for expedited quiescent state. */ | ||
| 3730 | static void sync_sched_exp_handler(void *data) | ||
| 3731 | { | ||
| 3732 | struct rcu_data *rdp; | ||
| 3733 | struct rcu_node *rnp; | ||
| 3734 | struct rcu_state *rsp = data; | ||
| 3735 | |||
| 3736 | rdp = this_cpu_ptr(rsp->rda); | ||
| 3737 | rnp = rdp->mynode; | ||
| 3738 | if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || | ||
| 3739 | __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) | ||
| 3740 | return; | ||
| 3741 | if (rcu_is_cpu_rrupt_from_idle()) { | ||
| 3742 | rcu_report_exp_rdp(&rcu_sched_state, | ||
| 3743 | this_cpu_ptr(&rcu_sched_data), true); | ||
| 3744 | return; | ||
| 3745 | } | ||
| 3746 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); | ||
| 3747 | resched_cpu(smp_processor_id()); | ||
| 3748 | } | ||
| 3749 | |||
| 3750 | /* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ | ||
| 3751 | static void sync_sched_exp_online_cleanup(int cpu) | ||
| 3752 | { | ||
| 3753 | struct rcu_data *rdp; | ||
| 3754 | int ret; | ||
| 3755 | struct rcu_node *rnp; | ||
| 3756 | struct rcu_state *rsp = &rcu_sched_state; | ||
| 3757 | |||
| 3758 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 3759 | rnp = rdp->mynode; | ||
| 3760 | if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) | ||
| 3761 | return; | ||
| 3762 | ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); | ||
| 3763 | WARN_ON_ONCE(ret); | ||
| 3764 | } | ||
| 3765 | |||
| 3766 | /* | ||
| 3767 | * Select the nodes that the upcoming expedited grace period needs | ||
| 3768 | * to wait for. | ||
| 3769 | */ | ||
| 3770 | static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, | ||
| 3771 | smp_call_func_t func) | ||
| 3772 | { | ||
| 3773 | int cpu; | ||
| 3774 | unsigned long flags; | ||
| 3775 | unsigned long mask; | ||
| 3776 | unsigned long mask_ofl_test; | ||
| 3777 | unsigned long mask_ofl_ipi; | ||
| 3778 | int ret; | ||
| 3779 | struct rcu_node *rnp; | ||
| 3780 | |||
| 3781 | sync_exp_reset_tree(rsp); | ||
| 3782 | rcu_for_each_leaf_node(rsp, rnp) { | ||
| 3783 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 3784 | |||
| 3785 | /* Each pass checks a CPU for identity, offline, and idle. */ | ||
| 3786 | mask_ofl_test = 0; | ||
| 3787 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { | ||
| 3788 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 3789 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
| 3790 | |||
| 3791 | if (raw_smp_processor_id() == cpu || | ||
| 3792 | !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) | ||
| 3793 | mask_ofl_test |= rdp->grpmask; | ||
| 3794 | } | ||
| 3795 | mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; | ||
| 3796 | |||
| 3797 | /* | ||
| 3798 | * Need to wait for any blocked tasks as well. Note that | ||
| 3799 | * additional blocking tasks will also block the expedited | ||
| 3800 | * GP until such time as the ->expmask bits are cleared. | ||
| 3801 | */ | ||
| 3802 | if (rcu_preempt_has_tasks(rnp)) | ||
| 3803 | rnp->exp_tasks = rnp->blkd_tasks.next; | ||
| 3804 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 3805 | |||
| 3806 | /* IPI the remaining CPUs for expedited quiescent state. */ | ||
| 3807 | mask = 1; | ||
| 3808 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { | ||
| 3809 | if (!(mask_ofl_ipi & mask)) | ||
| 3810 | continue; | ||
| 3811 | retry_ipi: | ||
| 3812 | ret = smp_call_function_single(cpu, func, rsp, 0); | ||
| 3813 | if (!ret) { | ||
| 3814 | mask_ofl_ipi &= ~mask; | ||
| 3815 | continue; | ||
| 3816 | } | ||
| 3817 | /* Failed, raced with offline. */ | ||
| 3818 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 3819 | if (cpu_online(cpu) && | ||
| 3820 | (rnp->expmask & mask)) { | ||
| 3821 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 3822 | schedule_timeout_uninterruptible(1); | ||
| 3823 | if (cpu_online(cpu) && | ||
| 3824 | (rnp->expmask & mask)) | ||
| 3825 | goto retry_ipi; | ||
| 3826 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 3827 | } | ||
| 3828 | if (!(rnp->expmask & mask)) | ||
| 3829 | mask_ofl_ipi &= ~mask; | ||
| 3830 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 3831 | } | ||
| 3832 | /* Report quiescent states for those that went offline. */ | ||
| 3833 | mask_ofl_test |= mask_ofl_ipi; | ||
| 3834 | if (mask_ofl_test) | ||
| 3835 | rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); | ||
| 3836 | } | ||
| 3837 | } | ||
| 3838 | |||
| 3839 | static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | ||
| 3840 | { | ||
| 3841 | int cpu; | ||
| 3842 | unsigned long jiffies_stall; | ||
| 3843 | unsigned long jiffies_start; | ||
| 3844 | unsigned long mask; | ||
| 3845 | int ndetected; | ||
| 3846 | struct rcu_node *rnp; | ||
| 3847 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
| 3848 | int ret; | ||
| 3849 | |||
| 3850 | jiffies_stall = rcu_jiffies_till_stall_check(); | ||
| 3851 | jiffies_start = jiffies; | ||
| 3852 | |||
| 3853 | for (;;) { | ||
| 3854 | ret = swait_event_timeout( | ||
| 3855 | rsp->expedited_wq, | ||
| 3856 | sync_rcu_preempt_exp_done(rnp_root), | ||
| 3857 | jiffies_stall); | ||
| 3858 | if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) | ||
| 3859 | return; | ||
| 3860 | if (ret < 0) { | ||
| 3861 | /* Hit a signal, disable CPU stall warnings. */ | ||
| 3862 | swait_event(rsp->expedited_wq, | ||
| 3863 | sync_rcu_preempt_exp_done(rnp_root)); | ||
| 3864 | return; | ||
| 3865 | } | ||
| 3866 | pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", | ||
| 3867 | rsp->name); | ||
| 3868 | ndetected = 0; | ||
| 3869 | rcu_for_each_leaf_node(rsp, rnp) { | ||
| 3870 | ndetected += rcu_print_task_exp_stall(rnp); | ||
| 3871 | mask = 1; | ||
| 3872 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { | ||
| 3873 | struct rcu_data *rdp; | ||
| 3874 | |||
| 3875 | if (!(rnp->expmask & mask)) | ||
| 3876 | continue; | ||
| 3877 | ndetected++; | ||
| 3878 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 3879 | pr_cont(" %d-%c%c%c", cpu, | ||
| 3880 | "O."[!!cpu_online(cpu)], | ||
| 3881 | "o."[!!(rdp->grpmask & rnp->expmaskinit)], | ||
| 3882 | "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); | ||
| 3883 | } | ||
| 3884 | mask <<= 1; | ||
| 3885 | } | ||
| 3886 | pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", | ||
| 3887 | jiffies - jiffies_start, rsp->expedited_sequence, | ||
| 3888 | rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); | ||
| 3889 | if (ndetected) { | ||
| 3890 | pr_err("blocking rcu_node structures:"); | ||
| 3891 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
| 3892 | if (rnp == rnp_root) | ||
| 3893 | continue; /* printed unconditionally */ | ||
| 3894 | if (sync_rcu_preempt_exp_done(rnp)) | ||
| 3895 | continue; | ||
| 3896 | pr_cont(" l=%u:%d-%d:%#lx/%c", | ||
| 3897 | rnp->level, rnp->grplo, rnp->grphi, | ||
| 3898 | rnp->expmask, | ||
| 3899 | ".T"[!!rnp->exp_tasks]); | ||
| 3900 | } | ||
| 3901 | pr_cont("\n"); | ||
| 3902 | } | ||
| 3903 | rcu_for_each_leaf_node(rsp, rnp) { | ||
| 3904 | mask = 1; | ||
| 3905 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { | ||
| 3906 | if (!(rnp->expmask & mask)) | ||
| 3907 | continue; | ||
| 3908 | dump_cpu_task(cpu); | ||
| 3909 | } | ||
| 3910 | } | ||
| 3911 | jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; | ||
| 3912 | } | ||
| 3913 | } | ||
| 3914 | |||
| 3915 | /* | ||
| 3916 | * Wait for the current expedited grace period to complete, and then | ||
| 3917 | * wake up everyone who piggybacked on the just-completed expedited | ||
| 3918 | * grace period. Also update all the ->exp_seq_rq counters as needed | ||
| 3919 | * in order to avoid counter-wrap problems. | ||
| 3920 | */ | ||
| 3921 | static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | ||
| 3922 | { | ||
| 3923 | struct rcu_node *rnp; | ||
| 3924 | |||
| 3925 | synchronize_sched_expedited_wait(rsp); | ||
| 3926 | rcu_exp_gp_seq_end(rsp); | ||
| 3927 | trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); | ||
| 3928 | |||
| 3929 | /* | ||
| 3930 | * Switch over to wakeup mode, allowing the next GP, but -only- the | ||
| 3931 | * next GP, to proceed. | ||
| 3932 | */ | ||
| 3933 | mutex_lock(&rsp->exp_wake_mutex); | ||
| 3934 | mutex_unlock(&rsp->exp_mutex); | ||
| 3935 | |||
| 3936 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
| 3937 | if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { | ||
| 3938 | spin_lock(&rnp->exp_lock); | ||
| 3939 | /* Recheck, avoid hang in case someone just arrived. */ | ||
| 3940 | if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) | ||
| 3941 | rnp->exp_seq_rq = s; | ||
| 3942 | spin_unlock(&rnp->exp_lock); | ||
| 3943 | } | ||
| 3944 | wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); | ||
| 3945 | } | ||
| 3946 | trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); | ||
| 3947 | mutex_unlock(&rsp->exp_wake_mutex); | ||
| 3948 | } | ||
| 3949 | |||
| 3950 | /** | ||
| 3951 | * synchronize_sched_expedited - Brute-force RCU-sched grace period | ||
| 3952 | * | ||
| 3953 | * Wait for an RCU-sched grace period to elapse, but use a "big hammer" | ||
| 3954 | * approach to force the grace period to end quickly. This consumes | ||
| 3955 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
| 3956 | * so is thus not recommended for any sort of common-case code. In fact, | ||
| 3957 | * if you are using synchronize_sched_expedited() in a loop, please | ||
| 3958 | * restructure your code to batch your updates, and then use a single | ||
| 3959 | * synchronize_sched() instead. | ||
| 3960 | * | ||
| 3961 | * This implementation can be thought of as an application of sequence | ||
| 3962 | * locking to expedited grace periods, but using the sequence counter to | ||
| 3963 | * determine when someone else has already done the work instead of for | ||
| 3964 | * retrying readers. | ||
| 3965 | */ | ||
| 3966 | void synchronize_sched_expedited(void) | ||
| 3967 | { | ||
| 3968 | unsigned long s; | ||
| 3969 | struct rcu_state *rsp = &rcu_sched_state; | ||
| 3970 | |||
| 3971 | /* If only one CPU, this is automatically a grace period. */ | ||
| 3972 | if (rcu_blocking_is_gp()) | ||
| 3973 | return; | ||
| 3974 | |||
| 3975 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
| 3976 | if (rcu_gp_is_normal()) { | ||
| 3977 | wait_rcu_gp(call_rcu_sched); | ||
| 3978 | return; | ||
| 3979 | } | ||
| 3980 | |||
| 3981 | /* Take a snapshot of the sequence number. */ | ||
| 3982 | s = rcu_exp_gp_seq_snap(rsp); | ||
| 3983 | if (exp_funnel_lock(rsp, s)) | ||
| 3984 | return; /* Someone else did our work for us. */ | ||
| 3985 | |||
| 3986 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
| 3987 | sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); | ||
| 3988 | |||
| 3989 | /* Wait and clean up, including waking everyone. */ | ||
| 3990 | rcu_exp_wait_wake(rsp, s); | ||
| 3991 | } | ||
| 3992 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
| 3993 | |||
| 3994 | /* | 3460 | /* |
| 3995 | * Check to see if there is any immediate RCU-related work to be done | 3461 | * Check to see if there is any immediate RCU-related work to be done |
| 3996 | * by the current CPU, for the specified type of RCU, returning 1 if so. | 3462 | * by the current CPU, for the specified type of RCU, returning 1 if so. |
| @@ -4281,7 +3747,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 4281 | 3747 | ||
| 4282 | /* Set up local state, ensuring consistent view of global state. */ | 3748 | /* Set up local state, ensuring consistent view of global state. */ |
| 4283 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | 3749 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 4284 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); | 3750 | rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); |
| 4285 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 3751 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
| 4286 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); | 3752 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); |
| 4287 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); | 3753 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); |
| @@ -4340,12 +3806,58 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 4340 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 3806 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| 4341 | } | 3807 | } |
| 4342 | 3808 | ||
| 4343 | static void rcu_prepare_cpu(int cpu) | 3809 | int rcutree_prepare_cpu(unsigned int cpu) |
| 4344 | { | 3810 | { |
| 4345 | struct rcu_state *rsp; | 3811 | struct rcu_state *rsp; |
| 4346 | 3812 | ||
| 4347 | for_each_rcu_flavor(rsp) | 3813 | for_each_rcu_flavor(rsp) |
| 4348 | rcu_init_percpu_data(cpu, rsp); | 3814 | rcu_init_percpu_data(cpu, rsp); |
| 3815 | |||
| 3816 | rcu_prepare_kthreads(cpu); | ||
| 3817 | rcu_spawn_all_nocb_kthreads(cpu); | ||
| 3818 | |||
| 3819 | return 0; | ||
| 3820 | } | ||
| 3821 | |||
| 3822 | static void rcutree_affinity_setting(unsigned int cpu, int outgoing) | ||
| 3823 | { | ||
| 3824 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); | ||
| 3825 | |||
| 3826 | rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); | ||
| 3827 | } | ||
| 3828 | |||
| 3829 | int rcutree_online_cpu(unsigned int cpu) | ||
| 3830 | { | ||
| 3831 | sync_sched_exp_online_cleanup(cpu); | ||
| 3832 | rcutree_affinity_setting(cpu, -1); | ||
| 3833 | return 0; | ||
| 3834 | } | ||
| 3835 | |||
| 3836 | int rcutree_offline_cpu(unsigned int cpu) | ||
| 3837 | { | ||
| 3838 | rcutree_affinity_setting(cpu, cpu); | ||
| 3839 | return 0; | ||
| 3840 | } | ||
| 3841 | |||
| 3842 | |||
| 3843 | int rcutree_dying_cpu(unsigned int cpu) | ||
| 3844 | { | ||
| 3845 | struct rcu_state *rsp; | ||
| 3846 | |||
| 3847 | for_each_rcu_flavor(rsp) | ||
| 3848 | rcu_cleanup_dying_cpu(rsp); | ||
| 3849 | return 0; | ||
| 3850 | } | ||
| 3851 | |||
| 3852 | int rcutree_dead_cpu(unsigned int cpu) | ||
| 3853 | { | ||
| 3854 | struct rcu_state *rsp; | ||
| 3855 | |||
| 3856 | for_each_rcu_flavor(rsp) { | ||
| 3857 | rcu_cleanup_dead_cpu(cpu, rsp); | ||
| 3858 | do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); | ||
| 3859 | } | ||
| 3860 | return 0; | ||
| 4349 | } | 3861 | } |
| 4350 | 3862 | ||
| 4351 | #ifdef CONFIG_HOTPLUG_CPU | 3863 | #ifdef CONFIG_HOTPLUG_CPU |
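The rcutree_prepare_cpu(), rcutree_online_cpu(), rcutree_offline_cpu(), rcutree_dying_cpu() and rcutree_dead_cpu() callbacks added above replace the monolithic notifier (removed further down) with per-phase hotplug callbacks that take a CPU number and return 0 on success. This series most likely wires them into the hotplug state table statically; the sketch below only illustrates the callback contract via the generic run-time registration API, and the state name string plus the choice of a dynamic online state are assumptions:

#include <linux/cpuhotplug.h>

/* Illustrative registration only; not how this patch hooks the callbacks up. */
static int __init example_register_rcu_online_state(void)
{
	int ret;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU/tree:online",
				rcutree_online_cpu, rcutree_offline_cpu);
	return ret < 0 ? ret : 0;
}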
| @@ -4364,9 +3876,6 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) | |||
| 4364 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 3876 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
| 4365 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ | 3877 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ |
| 4366 | 3878 | ||
| 4367 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) | ||
| 4368 | return; | ||
| 4369 | |||
| 4370 | /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ | 3879 | /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ |
| 4371 | mask = rdp->grpmask; | 3880 | mask = rdp->grpmask; |
| 4372 | raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ | 3881 | raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ |
| @@ -4388,52 +3897,6 @@ void rcu_report_dead(unsigned int cpu) | |||
| 4388 | } | 3897 | } |
| 4389 | #endif | 3898 | #endif |
| 4390 | 3899 | ||
| 4391 | /* | ||
| 4392 | * Handle CPU online/offline notification events. | ||
| 4393 | */ | ||
| 4394 | int rcu_cpu_notify(struct notifier_block *self, | ||
| 4395 | unsigned long action, void *hcpu) | ||
| 4396 | { | ||
| 4397 | long cpu = (long)hcpu; | ||
| 4398 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); | ||
| 4399 | struct rcu_node *rnp = rdp->mynode; | ||
| 4400 | struct rcu_state *rsp; | ||
| 4401 | |||
| 4402 | switch (action) { | ||
| 4403 | case CPU_UP_PREPARE: | ||
| 4404 | case CPU_UP_PREPARE_FROZEN: | ||
| 4405 | rcu_prepare_cpu(cpu); | ||
| 4406 | rcu_prepare_kthreads(cpu); | ||
| 4407 | rcu_spawn_all_nocb_kthreads(cpu); | ||
| 4408 | break; | ||
| 4409 | case CPU_ONLINE: | ||
| 4410 | case CPU_DOWN_FAILED: | ||
| 4411 | sync_sched_exp_online_cleanup(cpu); | ||
| 4412 | rcu_boost_kthread_setaffinity(rnp, -1); | ||
| 4413 | break; | ||
| 4414 | case CPU_DOWN_PREPARE: | ||
| 4415 | rcu_boost_kthread_setaffinity(rnp, cpu); | ||
| 4416 | break; | ||
| 4417 | case CPU_DYING: | ||
| 4418 | case CPU_DYING_FROZEN: | ||
| 4419 | for_each_rcu_flavor(rsp) | ||
| 4420 | rcu_cleanup_dying_cpu(rsp); | ||
| 4421 | break; | ||
| 4422 | case CPU_DEAD: | ||
| 4423 | case CPU_DEAD_FROZEN: | ||
| 4424 | case CPU_UP_CANCELED: | ||
| 4425 | case CPU_UP_CANCELED_FROZEN: | ||
| 4426 | for_each_rcu_flavor(rsp) { | ||
| 4427 | rcu_cleanup_dead_cpu(cpu, rsp); | ||
| 4428 | do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); | ||
| 4429 | } | ||
| 4430 | break; | ||
| 4431 | default: | ||
| 4432 | break; | ||
| 4433 | } | ||
| 4434 | return NOTIFY_OK; | ||
| 4435 | } | ||
| 4436 | |||
| 4437 | static int rcu_pm_notify(struct notifier_block *self, | 3900 | static int rcu_pm_notify(struct notifier_block *self, |
| 4438 | unsigned long action, void *hcpu) | 3901 | unsigned long action, void *hcpu) |
| 4439 | { | 3902 | { |
| @@ -4745,10 +4208,10 @@ void __init rcu_init(void) | |||
| 4745 | * this is called early in boot, before either interrupts | 4208 | * this is called early in boot, before either interrupts |
| 4746 | * or the scheduler are operational. | 4209 | * or the scheduler are operational. |
| 4747 | */ | 4210 | */ |
| 4748 | cpu_notifier(rcu_cpu_notify, 0); | ||
| 4749 | pm_notifier(rcu_pm_notify, 0); | 4211 | pm_notifier(rcu_pm_notify, 0); |
| 4750 | for_each_online_cpu(cpu) | 4212 | for_each_online_cpu(cpu) |
| 4751 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 4213 | rcutree_prepare_cpu(cpu); |
| 4752 | } | 4214 | } |
| 4753 | 4215 | ||
| 4216 | #include "tree_exp.h" | ||
| 4754 | #include "tree_plugin.h" | 4217 | #include "tree_plugin.h" |
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e3959f5e6ddf..f714f873bf9d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -254,6 +254,13 @@ struct rcu_node { | |||
| 254 | } ____cacheline_internodealigned_in_smp; | 254 | } ____cacheline_internodealigned_in_smp; |
| 255 | 255 | ||
| 256 | /* | 256 | /* |
| 257 | * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and | ||
| 258 | * are indexed relative to this interval rather than the global CPU ID space. | ||
| 259 | * This generates the bit for a CPU in node-local masks. | ||
| 260 | */ | ||
| 261 | #define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) | ||
| 262 | |||
| 263 | /* | ||
| 257 | * Do a full breadth-first scan of the rcu_node structures for the | 264 | * Do a full breadth-first scan of the rcu_node structures for the |
| 258 | * specified rcu_state structure. | 265 | * specified rcu_state structure. |
| 259 | */ | 266 | */ |
| @@ -281,6 +288,14 @@ struct rcu_node { | |||
| 281 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | 288 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) |
| 282 | 289 | ||
| 283 | /* | 290 | /* |
| 291 | * Iterate over all possible CPUs in a leaf RCU node. | ||
| 292 | */ | ||
| 293 | #define for_each_leaf_node_possible_cpu(rnp, cpu) \ | ||
| 294 | for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ | ||
| 295 | cpu <= rnp->grphi; \ | ||
| 296 | cpu = cpumask_next((cpu), cpu_possible_mask)) | ||
| 297 | |||
| 298 | /* | ||
| 284 | * Union to allow "aggregate OR" operation on the need for a quiescent | 299 | * Union to allow "aggregate OR" operation on the need for a quiescent |
| 285 | * state by the normal and expedited grace periods. | 300 | * state by the normal and expedited grace periods. |
| 286 | */ | 301 | */ |
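Together with the tree.c hunks above, these two helpers replace hand-rolled node-relative bit arithmetic. A minimal before/after sketch adapted from the rcu_dump_cpu_stacks() hunk earlier in this diff (locking omitted):

/* Before: the caller tracks the node-relative offset itself. */
for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
	if (rnp->qsmask & (1UL << cpu))
		dump_cpu_task(rnp->grplo + cpu);

/* After: iterate global CPU numbers and let the helpers hide ->grplo. */
for_each_leaf_node_possible_cpu(rnp, cpu)
	if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
		dump_cpu_task(cpu);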
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h new file mode 100644 index 000000000000..6d86ab6ec2c9 --- /dev/null +++ b/kernel/rcu/tree_exp.h | |||
| @@ -0,0 +1,655 @@ | |||
| 1 | /* | ||
| 2 | * RCU expedited grace periods | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, you can access it online at | ||
| 16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
| 17 | * | ||
| 18 | * Copyright IBM Corporation, 2016 | ||
| 19 | * | ||
| 20 | * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* Wrapper functions for expedited grace periods. */ | ||
| 24 | static void rcu_exp_gp_seq_start(struct rcu_state *rsp) | ||
| 25 | { | ||
| 26 | rcu_seq_start(&rsp->expedited_sequence); | ||
| 27 | } | ||
| 28 | static void rcu_exp_gp_seq_end(struct rcu_state *rsp) | ||
| 29 | { | ||
| 30 | rcu_seq_end(&rsp->expedited_sequence); | ||
| 31 | smp_mb(); /* Ensure that consecutive grace periods serialize. */ | ||
| 32 | } | ||
| 33 | static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) | ||
| 34 | { | ||
| 35 | unsigned long s; | ||
| 36 | |||
| 37 | smp_mb(); /* Caller's modifications seen first by other CPUs. */ | ||
| 38 | s = rcu_seq_snap(&rsp->expedited_sequence); | ||
| 39 | trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); | ||
| 40 | return s; | ||
| 41 | } | ||
| 42 | static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) | ||
| 43 | { | ||
| 44 | return rcu_seq_done(&rsp->expedited_sequence, s); | ||
| 45 | } | ||
| 46 | |||
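A distilled sketch of how these four wrappers cooperate; the real callers are exp_funnel_lock() and rcu_exp_wait_wake() further down in this file, the helper name here is invented purely for illustration, and it assumes the usual rcu_seq convention of keeping the in-progress flag in the low bit:

static void example_exp_gp(struct rcu_state *rsp)
{
	unsigned long s;

	s = rcu_exp_gp_seq_snap(rsp);	/* sequence value we must reach */
	if (rcu_exp_gp_seq_done(rsp, s))
		return;			/* a concurrent GP already covered us */
	rcu_exp_gp_seq_start(rsp);	/* low bit set: expedited GP running */
	/* ... force and collect expedited quiescent states ... */
	rcu_exp_gp_seq_end(rsp);	/* low bit cleared, count advanced */
}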
| 47 | /* | ||
| 48 | * Reset the ->expmaskinit values in the rcu_node tree to reflect any | ||
| 49 | * recent CPU-online activity. Note that these masks are not cleared | ||
| 50 | * when CPUs go offline, so they reflect the union of all CPUs that have | ||
| 51 | * ever been online. This means that this function normally takes its | ||
| 52 | * no-work-to-do fastpath. | ||
| 53 | */ | ||
| 54 | static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) | ||
| 55 | { | ||
| 56 | bool done; | ||
| 57 | unsigned long flags; | ||
| 58 | unsigned long mask; | ||
| 59 | unsigned long oldmask; | ||
| 60 | int ncpus = READ_ONCE(rsp->ncpus); | ||
| 61 | struct rcu_node *rnp; | ||
| 62 | struct rcu_node *rnp_up; | ||
| 63 | |||
| 64 | /* If no new CPUs onlined since last time, nothing to do. */ | ||
| 65 | if (likely(ncpus == rsp->ncpus_snap)) | ||
| 66 | return; | ||
| 67 | rsp->ncpus_snap = ncpus; | ||
| 68 | |||
| 69 | /* | ||
| 70 | * Each pass through the following loop propagates newly onlined | ||
| 71 | * CPUs for the current rcu_node structure up the rcu_node tree. | ||
| 72 | */ | ||
| 73 | rcu_for_each_leaf_node(rsp, rnp) { | ||
| 74 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 75 | if (rnp->expmaskinit == rnp->expmaskinitnext) { | ||
| 76 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 77 | continue; /* No new CPUs, nothing to do. */ | ||
| 78 | } | ||
| 79 | |||
| 80 | /* Update this node's mask, track old value for propagation. */ | ||
| 81 | oldmask = rnp->expmaskinit; | ||
| 82 | rnp->expmaskinit = rnp->expmaskinitnext; | ||
| 83 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 84 | |||
| 85 | /* If was already nonzero, nothing to propagate. */ | ||
| 86 | if (oldmask) | ||
| 87 | continue; | ||
| 88 | |||
| 89 | /* Propagate the new CPU up the tree. */ | ||
| 90 | mask = rnp->grpmask; | ||
| 91 | rnp_up = rnp->parent; | ||
| 92 | done = false; | ||
| 93 | while (rnp_up) { | ||
| 94 | raw_spin_lock_irqsave_rcu_node(rnp_up, flags); | ||
| 95 | if (rnp_up->expmaskinit) | ||
| 96 | done = true; | ||
| 97 | rnp_up->expmaskinit |= mask; | ||
| 98 | raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); | ||
| 99 | if (done) | ||
| 100 | break; | ||
| 101 | mask = rnp_up->grpmask; | ||
| 102 | rnp_up = rnp_up->parent; | ||
| 103 | } | ||
| 104 | } | ||
| 105 | } | ||
| 106 | |||
| 107 | /* | ||
| 108 | * Reset the ->expmask values in the rcu_node tree in preparation for | ||
| 109 | * a new expedited grace period. | ||
| 110 | */ | ||
| 111 | static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) | ||
| 112 | { | ||
| 113 | unsigned long flags; | ||
| 114 | struct rcu_node *rnp; | ||
| 115 | |||
| 116 | sync_exp_reset_tree_hotplug(rsp); | ||
| 117 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
| 118 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 119 | WARN_ON_ONCE(rnp->expmask); | ||
| 120 | rnp->expmask = rnp->expmaskinit; | ||
| 121 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 122 | } | ||
| 123 | } | ||
| 124 | |||
| 125 | /* | ||
| 126 | * Return non-zero if there is no RCU expedited grace period in progress | ||
| 127 | * for the specified rcu_node structure, in other words, if all CPUs and | ||
| 128 | * tasks covered by the specified rcu_node structure have done their bit | ||
| 129 | * for the current expedited grace period. Works only for preemptible | ||
| 130 | * RCU -- other RCU implementation use other means. | ||
| 131 | * | ||
| 132 | * Caller must hold the rcu_state's exp_mutex. | ||
| 133 | */ | ||
| 134 | static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | ||
| 135 | { | ||
| 136 | return rnp->exp_tasks == NULL && | ||
| 137 | READ_ONCE(rnp->expmask) == 0; | ||
| 138 | } | ||
| 139 | |||
| 140 | /* | ||
| 141 | * Report the exit from RCU read-side critical section for the last task | ||
| 142 | * that queued itself during or before the current expedited preemptible-RCU | ||
| 143 | * grace period. This event is reported either to the rcu_node structure on | ||
| 144 | * which the task was queued or to one of that rcu_node structure's ancestors, | ||
| 145 | * recursively up the tree. (Calm down, calm down, we do the recursion | ||
| 146 | * iteratively!) | ||
| 147 | * | ||
| 148 | * Caller must hold the rcu_state's exp_mutex and the specified rcu_node | ||
| 149 | * structure's ->lock. | ||
| 150 | */ | ||
| 151 | static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 152 | bool wake, unsigned long flags) | ||
| 153 | __releases(rnp->lock) | ||
| 154 | { | ||
| 155 | unsigned long mask; | ||
| 156 | |||
| 157 | for (;;) { | ||
| 158 | if (!sync_rcu_preempt_exp_done(rnp)) { | ||
| 159 | if (!rnp->expmask) | ||
| 160 | rcu_initiate_boost(rnp, flags); | ||
| 161 | else | ||
| 162 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 163 | break; | ||
| 164 | } | ||
| 165 | if (rnp->parent == NULL) { | ||
| 166 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 167 | if (wake) { | ||
| 168 | smp_mb(); /* EGP done before wake_up(). */ | ||
| 169 | swake_up(&rsp->expedited_wq); | ||
| 170 | } | ||
| 171 | break; | ||
| 172 | } | ||
| 173 | mask = rnp->grpmask; | ||
| 174 | raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ | ||
| 175 | rnp = rnp->parent; | ||
| 176 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ | ||
| 177 | WARN_ON_ONCE(!(rnp->expmask & mask)); | ||
| 178 | rnp->expmask &= ~mask; | ||
| 179 | } | ||
| 180 | } | ||
| 181 | |||
| 182 | /* | ||
| 183 | * Report expedited quiescent state for specified node. This is a | ||
| 184 | * lock-acquisition wrapper function for __rcu_report_exp_rnp(). | ||
| 185 | * | ||
| 186 | * Caller must hold the rcu_state's exp_mutex. | ||
| 187 | */ | ||
| 188 | static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, | ||
| 189 | struct rcu_node *rnp, bool wake) | ||
| 190 | { | ||
| 191 | unsigned long flags; | ||
| 192 | |||
| 193 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 194 | __rcu_report_exp_rnp(rsp, rnp, wake, flags); | ||
| 195 | } | ||
| 196 | |||
| 197 | /* | ||
| 198 | * Report expedited quiescent state for multiple CPUs, all covered by the | ||
| 199 | * specified leaf rcu_node structure. Caller must hold the rcu_state's | ||
| 200 | * exp_mutex. | ||
| 201 | */ | ||
| 202 | static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 203 | unsigned long mask, bool wake) | ||
| 204 | { | ||
| 205 | unsigned long flags; | ||
| 206 | |||
| 207 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 208 | if (!(rnp->expmask & mask)) { | ||
| 209 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 210 | return; | ||
| 211 | } | ||
| 212 | rnp->expmask &= ~mask; | ||
| 213 | __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ | ||
| 214 | } | ||
| 215 | |||
| 216 | /* | ||
| 217 | * Report expedited quiescent state for specified rcu_data (CPU). | ||
| 218 | */ | ||
| 219 | static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, | ||
| 220 | bool wake) | ||
| 221 | { | ||
| 222 | rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); | ||
| 223 | } | ||
| 224 | |||
| 225 | /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ | ||
| 226 | static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, | ||
| 227 | unsigned long s) | ||
| 228 | { | ||
| 229 | if (rcu_exp_gp_seq_done(rsp, s)) { | ||
| 230 | trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); | ||
| 231 | /* Ensure test happens before caller kfree(). */ | ||
| 232 | smp_mb__before_atomic(); /* ^^^ */ | ||
| 233 | atomic_long_inc(stat); | ||
| 234 | return true; | ||
| 235 | } | ||
| 236 | return false; | ||
| 237 | } | ||
| 238 | |||
| 239 | /* | ||
| 240 | * Funnel-lock acquisition for expedited grace periods. Returns true | ||
| 241 | * if some other task completed an expedited grace period that this task | ||
| 242 | * can piggy-back on, and with no mutex held. Otherwise, returns false | ||
| 243 | * with the mutex held, indicating that the caller must actually do the | ||
| 244 | * expedited grace period. | ||
| 245 | */ | ||
| 246 | static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) | ||
| 247 | { | ||
| 248 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); | ||
| 249 | struct rcu_node *rnp = rdp->mynode; | ||
| 250 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
| 251 | |||
| 252 | /* Low-contention fastpath. */ | ||
| 253 | if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && | ||
| 254 | (rnp == rnp_root || | ||
| 255 | ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && | ||
| 256 | mutex_trylock(&rsp->exp_mutex)) | ||
| 257 | goto fastpath; | ||
| 258 | |||
| 259 | /* | ||
| 260 | * Each pass through the following loop works its way up | ||
| 261 | * the rcu_node tree, returning if others have done the work or | ||
| 262 | * otherwise falls through to acquire rsp->exp_mutex. The mapping | ||
| 263 | * from CPU to rcu_node structure can be inexact, as it is just | ||
| 264 | * promoting locality and is not strictly needed for correctness. | ||
| 265 | */ | ||
| 266 | for (; rnp != NULL; rnp = rnp->parent) { | ||
| 267 | if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) | ||
| 268 | return true; | ||
| 269 | |||
| 270 | /* Work not done, either wait here or go up. */ | ||
| 271 | spin_lock(&rnp->exp_lock); | ||
| 272 | if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { | ||
| 273 | |||
| 274 | /* Someone else doing GP, so wait for them. */ | ||
| 275 | spin_unlock(&rnp->exp_lock); | ||
| 276 | trace_rcu_exp_funnel_lock(rsp->name, rnp->level, | ||
| 277 | rnp->grplo, rnp->grphi, | ||
| 278 | TPS("wait")); | ||
| 279 | wait_event(rnp->exp_wq[(s >> 1) & 0x3], | ||
| 280 | sync_exp_work_done(rsp, | ||
| 281 | &rdp->exp_workdone2, s)); | ||
| 282 | return true; | ||
| 283 | } | ||
| 284 | rnp->exp_seq_rq = s; /* Followers can wait on us. */ | ||
| 285 | spin_unlock(&rnp->exp_lock); | ||
| 286 | trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, | ||
| 287 | rnp->grphi, TPS("nxtlvl")); | ||
| 288 | } | ||
| 289 | mutex_lock(&rsp->exp_mutex); | ||
| 290 | fastpath: | ||
| 291 | if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { | ||
| 292 | mutex_unlock(&rsp->exp_mutex); | ||
| 293 | return true; | ||
| 294 | } | ||
| 295 | rcu_exp_gp_seq_start(rsp); | ||
| 296 | trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); | ||
| 297 | return false; | ||
| 298 | } | ||
| 299 | |||
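One design note on the wait_event() call in exp_funnel_lock() above: each rcu_node carries an array of expedited wait queues, and the queue is chosen from the grace-period number embedded in s (the & 0x3 mask addresses four of them, again assuming the rcu_seq convention that the low bit flags an in-progress grace period). The indexing expression is equivalent to this illustrative helper:

/* Equivalent to the rnp->exp_wq[(s >> 1) & 0x3] index used above. */
static inline int example_exp_wq_index(unsigned long s)
{
	return (s >> 1) & 0x3;	/* drop the in-progress bit, keep two GP bits */
}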
| 300 | /* Invoked on each online non-idle CPU for expedited quiescent state. */ | ||
| 301 | static void sync_sched_exp_handler(void *data) | ||
| 302 | { | ||
| 303 | struct rcu_data *rdp; | ||
| 304 | struct rcu_node *rnp; | ||
| 305 | struct rcu_state *rsp = data; | ||
| 306 | |||
| 307 | rdp = this_cpu_ptr(rsp->rda); | ||
| 308 | rnp = rdp->mynode; | ||
| 309 | if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || | ||
| 310 | __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) | ||
| 311 | return; | ||
| 312 | if (rcu_is_cpu_rrupt_from_idle()) { | ||
| 313 | rcu_report_exp_rdp(&rcu_sched_state, | ||
| 314 | this_cpu_ptr(&rcu_sched_data), true); | ||
| 315 | return; | ||
| 316 | } | ||
| 317 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); | ||
| 318 | resched_cpu(smp_processor_id()); | ||
| 319 | } | ||
| 320 | |||
| 321 | /* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ | ||
| 322 | static void sync_sched_exp_online_cleanup(int cpu) | ||
| 323 | { | ||
| 324 | struct rcu_data *rdp; | ||
| 325 | int ret; | ||
| 326 | struct rcu_node *rnp; | ||
| 327 | struct rcu_state *rsp = &rcu_sched_state; | ||
| 328 | |||
| 329 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 330 | rnp = rdp->mynode; | ||
| 331 | if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) | ||
| 332 | return; | ||
| 333 | ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); | ||
| 334 | WARN_ON_ONCE(ret); | ||
| 335 | } | ||
| 336 | |||
| 337 | /* | ||
| 338 | * Select the nodes that the upcoming expedited grace period needs | ||
| 339 | * to wait for. | ||
| 340 | */ | ||
| 341 | static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, | ||
| 342 | smp_call_func_t func) | ||
| 343 | { | ||
| 344 | int cpu; | ||
| 345 | unsigned long flags; | ||
| 346 | unsigned long mask_ofl_test; | ||
| 347 | unsigned long mask_ofl_ipi; | ||
| 348 | int ret; | ||
| 349 | struct rcu_node *rnp; | ||
| 350 | |||
| 351 | sync_exp_reset_tree(rsp); | ||
| 352 | rcu_for_each_leaf_node(rsp, rnp) { | ||
| 353 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 354 | |||
| 355 | /* Each pass checks a CPU for identity, offline, and idle. */ | ||
| 356 | mask_ofl_test = 0; | ||
| 357 | for_each_leaf_node_possible_cpu(rnp, cpu) { | ||
| 358 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 359 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
| 360 | |||
| 361 | if (raw_smp_processor_id() == cpu || | ||
| 362 | !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) | ||
| 363 | mask_ofl_test |= rdp->grpmask; | ||
| 364 | } | ||
| 365 | mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; | ||
| 366 | |||
| 367 | /* | ||
| 368 | * Need to wait for any blocked tasks as well. Note that | ||
| 369 | * additional blocking tasks will also block the expedited | ||
| 370 | * GP until such time as the ->expmask bits are cleared. | ||
| 371 | */ | ||
| 372 | if (rcu_preempt_has_tasks(rnp)) | ||
| 373 | rnp->exp_tasks = rnp->blkd_tasks.next; | ||
| 374 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 375 | |||
| 376 | /* IPI the remaining CPUs for expedited quiescent state. */ | ||
| 377 | for_each_leaf_node_possible_cpu(rnp, cpu) { | ||
| 378 | unsigned long mask = leaf_node_cpu_bit(rnp, cpu); | ||
| 379 | if (!(mask_ofl_ipi & mask)) | ||
| 380 | continue; | ||
| 381 | retry_ipi: | ||
| 382 | ret = smp_call_function_single(cpu, func, rsp, 0); | ||
| 383 | if (!ret) { | ||
| 384 | mask_ofl_ipi &= ~mask; | ||
| 385 | continue; | ||
| 386 | } | ||
| 387 | /* Failed, raced with offline. */ | ||
| 388 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 389 | if (cpu_online(cpu) && | ||
| 390 | (rnp->expmask & mask)) { | ||
| 391 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 392 | schedule_timeout_uninterruptible(1); | ||
| 393 | if (cpu_online(cpu) && | ||
| 394 | (rnp->expmask & mask)) | ||
| 395 | goto retry_ipi; | ||
| 396 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 397 | } | ||
| 398 | if (!(rnp->expmask & mask)) | ||
| 399 | mask_ofl_ipi &= ~mask; | ||
| 400 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 401 | } | ||
| 402 | /* Report quiescent states for those that went offline. */ | ||
| 403 | mask_ofl_test |= mask_ofl_ipi; | ||
| 404 | if (mask_ofl_test) | ||
| 405 | rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); | ||
| 406 | } | ||
| 407 | } | ||
| 408 | |||
| 409 | static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | ||
| 410 | { | ||
| 411 | int cpu; | ||
| 412 | unsigned long jiffies_stall; | ||
| 413 | unsigned long jiffies_start; | ||
| 414 | unsigned long mask; | ||
| 415 | int ndetected; | ||
| 416 | struct rcu_node *rnp; | ||
| 417 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
| 418 | int ret; | ||
| 419 | |||
| 420 | jiffies_stall = rcu_jiffies_till_stall_check(); | ||
| 421 | jiffies_start = jiffies; | ||
| 422 | |||
| 423 | for (;;) { | ||
| 424 | ret = swait_event_timeout( | ||
| 425 | rsp->expedited_wq, | ||
| 426 | sync_rcu_preempt_exp_done(rnp_root), | ||
| 427 | jiffies_stall); | ||
| 428 | if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) | ||
| 429 | return; | ||
| 430 | if (ret < 0) { | ||
| 431 | /* Hit a signal, disable CPU stall warnings. */ | ||
| 432 | swait_event(rsp->expedited_wq, | ||
| 433 | sync_rcu_preempt_exp_done(rnp_root)); | ||
| 434 | return; | ||
| 435 | } | ||
| 436 | pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", | ||
| 437 | rsp->name); | ||
| 438 | ndetected = 0; | ||
| 439 | rcu_for_each_leaf_node(rsp, rnp) { | ||
| 440 | ndetected += rcu_print_task_exp_stall(rnp); | ||
| 441 | for_each_leaf_node_possible_cpu(rnp, cpu) { | ||
| 442 | struct rcu_data *rdp; | ||
| 443 | |||
| 444 | mask = leaf_node_cpu_bit(rnp, cpu); | ||
| 445 | if (!(rnp->expmask & mask)) | ||
| 446 | continue; | ||
| 447 | ndetected++; | ||
| 448 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 449 | pr_cont(" %d-%c%c%c", cpu, | ||
| 450 | "O."[!!cpu_online(cpu)], | ||
| 451 | "o."[!!(rdp->grpmask & rnp->expmaskinit)], | ||
| 452 | "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); | ||
| 453 | } | ||
| 454 | } | ||
| 455 | pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", | ||
| 456 | jiffies - jiffies_start, rsp->expedited_sequence, | ||
| 457 | rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); | ||
| 458 | if (ndetected) { | ||
| 459 | pr_err("blocking rcu_node structures:"); | ||
| 460 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
| 461 | if (rnp == rnp_root) | ||
| 462 | continue; /* printed unconditionally */ | ||
| 463 | if (sync_rcu_preempt_exp_done(rnp)) | ||
| 464 | continue; | ||
| 465 | pr_cont(" l=%u:%d-%d:%#lx/%c", | ||
| 466 | rnp->level, rnp->grplo, rnp->grphi, | ||
| 467 | rnp->expmask, | ||
| 468 | ".T"[!!rnp->exp_tasks]); | ||
| 469 | } | ||
| 470 | pr_cont("\n"); | ||
| 471 | } | ||
| 472 | rcu_for_each_leaf_node(rsp, rnp) { | ||
| 473 | for_each_leaf_node_possible_cpu(rnp, cpu) { | ||
| 474 | mask = leaf_node_cpu_bit(rnp, cpu); | ||
| 475 | if (!(rnp->expmask & mask)) | ||
| 476 | continue; | ||
| 477 | dump_cpu_task(cpu); | ||
| 478 | } | ||
| 479 | } | ||
| 480 | jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; | ||
| 481 | } | ||
| 482 | } | ||
| 483 | |||
| 484 | /* | ||
| 485 | * Wait for the current expedited grace period to complete, and then | ||
| 486 | * wake up everyone who piggybacked on the just-completed expedited | ||
| 487 | * grace period. Also update all the ->exp_seq_rq counters as needed | ||
| 488 | * in order to avoid counter-wrap problems. | ||
| 489 | */ | ||
| 490 | static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | ||
| 491 | { | ||
| 492 | struct rcu_node *rnp; | ||
| 493 | |||
| 494 | synchronize_sched_expedited_wait(rsp); | ||
| 495 | rcu_exp_gp_seq_end(rsp); | ||
| 496 | trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); | ||
| 497 | |||
| 498 | /* | ||
| 499 | * Switch over to wakeup mode, allowing the next GP, but -only- the | ||
| 500 | * next GP, to proceed. | ||
| 501 | */ | ||
| 502 | mutex_lock(&rsp->exp_wake_mutex); | ||
| 503 | mutex_unlock(&rsp->exp_mutex); | ||
| 504 | |||
| 505 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
| 506 | if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { | ||
| 507 | spin_lock(&rnp->exp_lock); | ||
| 508 | /* Recheck, avoid hang in case someone just arrived. */ | ||
| 509 | if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) | ||
| 510 | rnp->exp_seq_rq = s; | ||
| 511 | spin_unlock(&rnp->exp_lock); | ||
| 512 | } | ||
| 513 | wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); | ||
| 514 | } | ||
| 515 | trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); | ||
| 516 | mutex_unlock(&rsp->exp_wake_mutex); | ||
| 517 | } | ||
| 518 | |||
| 519 | /** | ||
| 520 | * synchronize_sched_expedited - Brute-force RCU-sched grace period | ||
| 521 | * | ||
| 522 | * Wait for an RCU-sched grace period to elapse, but use a "big hammer" | ||
| 523 | * approach to force the grace period to end quickly. This consumes | ||
| 524 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
| 525 | * and is thus not recommended for any sort of common-case code. In fact, | ||
| 526 | * if you are using synchronize_sched_expedited() in a loop, please | ||
| 527 | * restructure your code to batch your updates, and then use a single | ||
| 528 | * synchronize_sched() instead. | ||
| 529 | * | ||
| 530 | * This implementation can be thought of as an application of sequence | ||
| 531 | * locking to expedited grace periods, but using the sequence counter to | ||
| 532 | * determine when someone else has already done the work instead of for | ||
| 533 | * retrying readers. | ||
| 534 | */ | ||
| 535 | void synchronize_sched_expedited(void) | ||
| 536 | { | ||
| 537 | unsigned long s; | ||
| 538 | struct rcu_state *rsp = &rcu_sched_state; | ||
| 539 | |||
| 540 | /* If only one CPU, this is automatically a grace period. */ | ||
| 541 | if (rcu_blocking_is_gp()) | ||
| 542 | return; | ||
| 543 | |||
| 544 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
| 545 | if (rcu_gp_is_normal()) { | ||
| 546 | wait_rcu_gp(call_rcu_sched); | ||
| 547 | return; | ||
| 548 | } | ||
| 549 | |||
| 550 | /* Take a snapshot of the sequence number. */ | ||
| 551 | s = rcu_exp_gp_seq_snap(rsp); | ||
| 552 | if (exp_funnel_lock(rsp, s)) | ||
| 553 | return; /* Someone else did our work for us. */ | ||
| 554 | |||
| 555 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
| 556 | sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); | ||
| 557 | |||
| 558 | /* Wait and clean up, including waking everyone. */ | ||
| 559 | rcu_exp_wait_wake(rsp, s); | ||
| 560 | } | ||
| 561 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
| 562 | |||
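The docstring above describes the expedited machinery as sequence locking turned around: the sequence counter tells a caller that someone else's grace period already covered its request. Below is a hedged, single-threaded C11 analogue of that idea; gp_seq, gp_snap(), gp_done() and do_expedited_gp() are illustrative names, not kernel APIs, and the real code serializes contending callers through exp_funnel_lock().

```c
/*
 * Sketch of "use the sequence counter to detect that someone else has
 * already done the work": snapshot the sequence value that would mark a
 * full grace period after this point, and skip the expensive path if the
 * counter reaches it first.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_ulong gp_seq;		/* even: idle, odd: GP in progress */

static unsigned long gp_snap(void)
{
	/* Round up to the end of the next full grace period. */
	return (atomic_load(&gp_seq) + 3) & ~1UL;
}

static bool gp_done(unsigned long s)
{
	return (long)(atomic_load(&gp_seq) - s) >= 0;	/* wrap-safe >= */
}

static void do_expedited_gp(void)
{
	atomic_fetch_add(&gp_seq, 1);	/* start: counter goes odd */
	/* ... force quiescent states on all CPUs here ... */
	atomic_fetch_add(&gp_seq, 1);	/* end: counter goes even */
}

int main(void)
{
	unsigned long s = gp_snap();

	if (gp_done(s)) {
		puts("someone else did our work for us");
		return 0;
	}
	do_expedited_gp();		/* nobody beat us to it: do the work */
	printf("done: gp_seq=%lu >= snap=%lu\n", atomic_load(&gp_seq), s);
	return 0;
}
```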
| 563 | #ifdef CONFIG_PREEMPT_RCU | ||
| 564 | |||
| 565 | /* | ||
| 566 | * Remote handler for smp_call_function_single(). If there is an | ||
| 567 | * RCU read-side critical section in effect, request that the | ||
| 568 | * next rcu_read_unlock() record the quiescent state up the | ||
| 569 | * ->expmask fields in the rcu_node tree. Otherwise, immediately | ||
| 570 | * report the quiescent state. | ||
| 571 | */ | ||
| 572 | static void sync_rcu_exp_handler(void *info) | ||
| 573 | { | ||
| 574 | struct rcu_data *rdp; | ||
| 575 | struct rcu_state *rsp = info; | ||
| 576 | struct task_struct *t = current; | ||
| 577 | |||
| 578 | /* | ||
| 579 | * Within an RCU read-side critical section, request that the next | ||
| 580 | * rcu_read_unlock() report. Unless this RCU read-side critical | ||
| 581 | * section has already blocked, in which case it is already set | ||
| 582 | * up for the expedited grace period to wait on it. | ||
| 583 | */ | ||
| 584 | if (t->rcu_read_lock_nesting > 0 && | ||
| 585 | !t->rcu_read_unlock_special.b.blocked) { | ||
| 586 | t->rcu_read_unlock_special.b.exp_need_qs = true; | ||
| 587 | return; | ||
| 588 | } | ||
| 589 | |||
| 590 | /* | ||
| 591 | * We are either exiting an RCU read-side critical section (negative | ||
| 592 | * values of t->rcu_read_lock_nesting) or are not in one at all | ||
| 593 | * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU | ||
| 594 | * read-side critical section that blocked before this expedited | ||
| 595 | * grace period started. Either way, we can immediately report | ||
| 596 | * the quiescent state. | ||
| 597 | */ | ||
| 598 | rdp = this_cpu_ptr(rsp->rda); | ||
| 599 | rcu_report_exp_rdp(rsp, rdp, true); | ||
| 600 | } | ||
| 601 | |||
| 602 | /** | ||
| 603 | * synchronize_rcu_expedited - Brute-force RCU grace period | ||
| 604 | * | ||
| 605 | * Wait for an RCU-preempt grace period, but expedite it. The basic | ||
| 606 | * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler | ||
| 607 | * checks whether the CPU is in an RCU-preempt critical section, and | ||
| 608 | * if so, it sets a flag that causes the outermost rcu_read_unlock() | ||
| 609 | * to report the quiescent state. On the other hand, if the CPU is | ||
| 610 | * not in an RCU read-side critical section, the IPI handler reports | ||
| 611 | * the quiescent state immediately. | ||
| 612 | * | ||
| 613 | * Although this is a great improvement over previous expedited | ||
| 614 | * implementations, it is still unfriendly to real-time workloads, so is | ||
| 615 | * thus not recommended for any sort of common-case code. In fact, if | ||
| 616 | * you are using synchronize_rcu_expedited() in a loop, please restructure | ||
| 617 | * your code to batch your updates, and then use a single synchronize_rcu() | ||
| 618 | * instead. | ||
| 619 | */ | ||
| 620 | void synchronize_rcu_expedited(void) | ||
| 621 | { | ||
| 622 | struct rcu_state *rsp = rcu_state_p; | ||
| 623 | unsigned long s; | ||
| 624 | |||
| 625 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
| 626 | if (rcu_gp_is_normal()) { | ||
| 627 | wait_rcu_gp(call_rcu); | ||
| 628 | return; | ||
| 629 | } | ||
| 630 | |||
| 631 | s = rcu_exp_gp_seq_snap(rsp); | ||
| 632 | if (exp_funnel_lock(rsp, s)) | ||
| 633 | return; /* Someone else did our work for us. */ | ||
| 634 | |||
| 635 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
| 636 | sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); | ||
| 637 | |||
| 638 | /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ | ||
| 639 | rcu_exp_wait_wake(rsp, s); | ||
| 640 | } | ||
| 641 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
| 642 | |||
| 643 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
| 644 | |||
| 645 | /* | ||
| 646 | * Wait for an rcu-preempt grace period, but make it happen quickly. | ||
| 647 | * But because preemptible RCU does not exist, map to rcu-sched. | ||
| 648 | */ | ||
| 649 | void synchronize_rcu_expedited(void) | ||
| 650 | { | ||
| 651 | synchronize_sched_expedited(); | ||
| 652 | } | ||
| 653 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
| 654 | |||
| 655 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
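The IPI handler above either reports the quiescent state on the spot or sets a per-task flag so the outermost rcu_read_unlock() reports it later. The sketch below is a single-threaded stand-in for that decision; read_lock_nesting, exp_need_qs and report_qs() mirror t->rcu_read_lock_nesting, ->exp_need_qs and rcu_report_exp_rdp() only loosely, and the real handler also considers whether the section has already blocked.

```c
/* Sketch of "report now or defer to the outermost unlock". */
#include <stdbool.h>
#include <stdio.h>

static int read_lock_nesting;
static bool exp_need_qs;

static void report_qs(void)
{
	puts("quiescent state reported");
}

static void exp_handler(void)
{
	if (read_lock_nesting > 0) {
		exp_need_qs = true;	/* defer to the outermost unlock */
		return;
	}
	report_qs();			/* not in a critical section: report now */
}

static void read_lock(void)  { read_lock_nesting++; }

static void read_unlock(void)
{
	if (--read_lock_nesting == 0 && exp_need_qs) {
		exp_need_qs = false;
		report_qs();		/* deferred report happens here */
	}
}

int main(void)
{
	read_lock();
	exp_handler();		/* inside a critical section: defers */
	read_unlock();		/* ...and reports here */

	exp_handler();		/* outside any critical section: reports now */
	return 0;
}
```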
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ff1cd4e1188d..0082fce402a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -79,8 +79,6 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 79 | pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); | 79 | pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); |
| 80 | if (IS_ENABLED(CONFIG_PROVE_RCU)) | 80 | if (IS_ENABLED(CONFIG_PROVE_RCU)) |
| 81 | pr_info("\tRCU lockdep checking is enabled.\n"); | 81 | pr_info("\tRCU lockdep checking is enabled.\n"); |
| 82 | if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) | ||
| 83 | pr_info("\tRCU torture testing starts during boot.\n"); | ||
| 84 | if (RCU_NUM_LVLS >= 4) | 82 | if (RCU_NUM_LVLS >= 4) |
| 85 | pr_info("\tFour(or more)-level hierarchy is enabled.\n"); | 83 | pr_info("\tFour(or more)-level hierarchy is enabled.\n"); |
| 86 | if (RCU_FANOUT_LEAF != 16) | 84 | if (RCU_FANOUT_LEAF != 16) |
| @@ -681,84 +679,6 @@ void synchronize_rcu(void) | |||
| 681 | } | 679 | } |
| 682 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 680 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
| 683 | 681 | ||
| 684 | /* | ||
| 685 | * Remote handler for smp_call_function_single(). If there is an | ||
| 686 | * RCU read-side critical section in effect, request that the | ||
| 687 | * next rcu_read_unlock() record the quiescent state up the | ||
| 688 | * ->expmask fields in the rcu_node tree. Otherwise, immediately | ||
| 689 | * report the quiescent state. | ||
| 690 | */ | ||
| 691 | static void sync_rcu_exp_handler(void *info) | ||
| 692 | { | ||
| 693 | struct rcu_data *rdp; | ||
| 694 | struct rcu_state *rsp = info; | ||
| 695 | struct task_struct *t = current; | ||
| 696 | |||
| 697 | /* | ||
| 698 | * Within an RCU read-side critical section, request that the next | ||
| 699 | * rcu_read_unlock() report. Unless this RCU read-side critical | ||
| 700 | * section has already blocked, in which case it is already set | ||
| 701 | * up for the expedited grace period to wait on it. | ||
| 702 | */ | ||
| 703 | if (t->rcu_read_lock_nesting > 0 && | ||
| 704 | !t->rcu_read_unlock_special.b.blocked) { | ||
| 705 | t->rcu_read_unlock_special.b.exp_need_qs = true; | ||
| 706 | return; | ||
| 707 | } | ||
| 708 | |||
| 709 | /* | ||
| 710 | * We are either exiting an RCU read-side critical section (negative | ||
| 711 | * values of t->rcu_read_lock_nesting) or are not in one at all | ||
| 712 | * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU | ||
| 713 | * read-side critical section that blocked before this expedited | ||
| 714 | * grace period started. Either way, we can immediately report | ||
| 715 | * the quiescent state. | ||
| 716 | */ | ||
| 717 | rdp = this_cpu_ptr(rsp->rda); | ||
| 718 | rcu_report_exp_rdp(rsp, rdp, true); | ||
| 719 | } | ||
| 720 | |||
| 721 | /** | ||
| 722 | * synchronize_rcu_expedited - Brute-force RCU grace period | ||
| 723 | * | ||
| 724 | * Wait for an RCU-preempt grace period, but expedite it. The basic | ||
| 725 | * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler | ||
| 726 | * checks whether the CPU is in an RCU-preempt critical section, and | ||
| 727 | * if so, it sets a flag that causes the outermost rcu_read_unlock() | ||
| 728 | * to report the quiescent state. On the other hand, if the CPU is | ||
| 729 | * not in an RCU read-side critical section, the IPI handler reports | ||
| 730 | * the quiescent state immediately. | ||
| 731 | * | ||
| 732 | * Although this is a great improvement over previous expedited | ||
| 733 | * implementations, it is still unfriendly to real-time workloads, so is | ||
| 734 | * thus not recommended for any sort of common-case code. In fact, if | ||
| 735 | * you are using synchronize_rcu_expedited() in a loop, please restructure | ||
| 736 | * your code to batch your updates, and then use a single synchronize_rcu() | ||
| 737 | * instead. | ||
| 738 | */ | ||
| 739 | void synchronize_rcu_expedited(void) | ||
| 740 | { | ||
| 741 | struct rcu_state *rsp = rcu_state_p; | ||
| 742 | unsigned long s; | ||
| 743 | |||
| 744 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
| 745 | if (rcu_gp_is_normal()) { | ||
| 746 | wait_rcu_gp(call_rcu); | ||
| 747 | return; | ||
| 748 | } | ||
| 749 | |||
| 750 | s = rcu_exp_gp_seq_snap(rsp); | ||
| 751 | if (exp_funnel_lock(rsp, s)) | ||
| 752 | return; /* Someone else did our work for us. */ | ||
| 753 | |||
| 754 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
| 755 | sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); | ||
| 756 | |||
| 757 | /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ | ||
| 758 | rcu_exp_wait_wake(rsp, s); | ||
| 759 | } | ||
| 760 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
| 761 | |||
| 762 | /** | 682 | /** |
| 763 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. | 683 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. |
| 764 | * | 684 | * |
| @@ -883,16 +803,6 @@ static void rcu_preempt_check_callbacks(void) | |||
| 883 | } | 803 | } |
| 884 | 804 | ||
| 885 | /* | 805 | /* |
| 886 | * Wait for an rcu-preempt grace period, but make it happen quickly. | ||
| 887 | * But because preemptible RCU does not exist, map to rcu-sched. | ||
| 888 | */ | ||
| 889 | void synchronize_rcu_expedited(void) | ||
| 890 | { | ||
| 891 | synchronize_sched_expedited(); | ||
| 892 | } | ||
| 893 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
| 894 | |||
| 895 | /* | ||
| 896 | * Because preemptible RCU does not exist, rcu_barrier() is just | 806 | * Because preemptible RCU does not exist, rcu_barrier() is just |
| 897 | * another name for rcu_barrier_sched(). | 807 | * another name for rcu_barrier_sched(). |
| 898 | */ | 808 | */ |
| @@ -1254,8 +1164,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | |||
| 1254 | return; | 1164 | return; |
| 1255 | if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) | 1165 | if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) |
| 1256 | return; | 1166 | return; |
| 1257 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) | 1167 | for_each_leaf_node_possible_cpu(rnp, cpu) |
| 1258 | if ((mask & 0x1) && cpu != outgoingcpu) | 1168 | if ((mask & leaf_node_cpu_bit(rnp, cpu)) && |
| 1169 | cpu != outgoingcpu) | ||
| 1259 | cpumask_set_cpu(cpu, cm); | 1170 | cpumask_set_cpu(cpu, cm); |
| 1260 | if (cpumask_weight(cm) == 0) | 1171 | if (cpumask_weight(cm) == 0) |
| 1261 | cpumask_setall(cm); | 1172 | cpumask_setall(cm); |
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3e888cd5a594..f0d8322bc3ec 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
| @@ -528,6 +528,7 @@ static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; | |||
| 528 | module_param(rcu_task_stall_timeout, int, 0644); | 528 | module_param(rcu_task_stall_timeout, int, 0644); |
| 529 | 529 | ||
| 530 | static void rcu_spawn_tasks_kthread(void); | 530 | static void rcu_spawn_tasks_kthread(void); |
| 531 | static struct task_struct *rcu_tasks_kthread_ptr; | ||
| 531 | 532 | ||
| 532 | /* | 533 | /* |
| 533 | * Post an RCU-tasks callback. First call must be from process context | 534 | * Post an RCU-tasks callback. First call must be from process context |
| @@ -537,6 +538,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) | |||
| 537 | { | 538 | { |
| 538 | unsigned long flags; | 539 | unsigned long flags; |
| 539 | bool needwake; | 540 | bool needwake; |
| 541 | bool havetask = READ_ONCE(rcu_tasks_kthread_ptr); | ||
| 540 | 542 | ||
| 541 | rhp->next = NULL; | 543 | rhp->next = NULL; |
| 542 | rhp->func = func; | 544 | rhp->func = func; |
| @@ -545,7 +547,9 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) | |||
| 545 | *rcu_tasks_cbs_tail = rhp; | 547 | *rcu_tasks_cbs_tail = rhp; |
| 546 | rcu_tasks_cbs_tail = &rhp->next; | 548 | rcu_tasks_cbs_tail = &rhp->next; |
| 547 | raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); | 549 | raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); |
| 548 | if (needwake) { | 550 | /* We can't create the thread unless interrupts are enabled. */ |
| 551 | if ((needwake && havetask) || | ||
| 552 | (!havetask && !irqs_disabled_flags(flags))) { | ||
| 549 | rcu_spawn_tasks_kthread(); | 553 | rcu_spawn_tasks_kthread(); |
| 550 | wake_up(&rcu_tasks_cbs_wq); | 554 | wake_up(&rcu_tasks_cbs_wq); |
| 551 | } | 555 | } |
| @@ -790,7 +794,6 @@ static int __noreturn rcu_tasks_kthread(void *arg) | |||
| 790 | static void rcu_spawn_tasks_kthread(void) | 794 | static void rcu_spawn_tasks_kthread(void) |
| 791 | { | 795 | { |
| 792 | static DEFINE_MUTEX(rcu_tasks_kthread_mutex); | 796 | static DEFINE_MUTEX(rcu_tasks_kthread_mutex); |
| 793 | static struct task_struct *rcu_tasks_kthread_ptr; | ||
| 794 | struct task_struct *t; | 797 | struct task_struct *t; |
| 795 | 798 | ||
| 796 | if (READ_ONCE(rcu_tasks_kthread_ptr)) { | 799 | if (READ_ONCE(rcu_tasks_kthread_ptr)) { |
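The call_rcu_tasks() change above makes the kthread pointer visible to callers so the worker is only spawned from a context that can create it, and woken otherwise. A rough userspace analogue of that lazy spawn-on-first-safe-use pattern follows; worker_fn(), can_create_now and enqueue() are invented for illustration, and the kernel's equivalent of can_create_now is !irqs_disabled_flags(flags).

```c
/* Sketch: spawn the worker on first use, but only when it is safe to. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_t worker;
static bool worker_started;		/* plays the role of rcu_tasks_kthread_ptr */
static pthread_mutex_t spawn_lock = PTHREAD_MUTEX_INITIALIZER;

static void *worker_fn(void *arg)
{
	/* ... the real worker would loop over queued callbacks ... */
	return NULL;
}

static void spawn_worker(void)
{
	pthread_mutex_lock(&spawn_lock);
	if (!worker_started) {		/* first safe caller creates it */
		pthread_create(&worker, NULL, worker_fn, NULL);
		worker_started = true;
	}
	pthread_mutex_unlock(&spawn_lock);
}

static void enqueue(bool need_wake, bool can_create_now)
{
	/* ... add the callback to the list here ... */
	if ((need_wake && worker_started) ||
	    (!worker_started && can_create_now)) {
		spawn_worker();
		/* ... and wake the worker ... */
	}
}

int main(void)
{
	enqueue(true, false);	/* too early to create the worker: defer */
	enqueue(true, true);	/* now the worker gets spawned */
	pthread_join(worker, NULL);
	printf("worker_started=%d\n", worker_started);
	return 0;
}
```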
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 97ee9ac7e97c..5c883fe8e440 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -1937,7 +1937,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | |||
| 1937 | * chain to provide order. Instead we do: | 1937 | * chain to provide order. Instead we do: |
| 1938 | * | 1938 | * |
| 1939 | * 1) smp_store_release(X->on_cpu, 0) | 1939 | * 1) smp_store_release(X->on_cpu, 0) |
| 1940 | * 2) smp_cond_acquire(!X->on_cpu) | 1940 | * 2) smp_cond_load_acquire(!X->on_cpu) |
| 1941 | * | 1941 | * |
| 1942 | * Example: | 1942 | * Example: |
| 1943 | * | 1943 | * |
| @@ -1948,7 +1948,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | |||
| 1948 | * sched-out X | 1948 | * sched-out X |
| 1949 | * smp_store_release(X->on_cpu, 0); | 1949 | * smp_store_release(X->on_cpu, 0); |
| 1950 | * | 1950 | * |
| 1951 | * smp_cond_acquire(!X->on_cpu); | 1951 | * smp_cond_load_acquire(&X->on_cpu, !VAL); |
| 1952 | * X->state = WAKING | 1952 | * X->state = WAKING |
| 1953 | * set_task_cpu(X,2) | 1953 | * set_task_cpu(X,2) |
| 1954 | * | 1954 | * |
| @@ -1974,7 +1974,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | |||
| 1974 | * This means that any means of doing remote wakeups must order the CPU doing | 1974 | * This means that any means of doing remote wakeups must order the CPU doing |
| 1975 | * the wakeup against the CPU the task is going to end up running on. This, | 1975 | * the wakeup against the CPU the task is going to end up running on. This, |
| 1976 | * however, is already required for the regular Program-Order guarantee above, | 1976 | * however, is already required for the regular Program-Order guarantee above, |
| 1977 | * since the waking CPU is the one issuing the ACQUIRE (smp_cond_acquire). | 1977 | * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire).
| 1978 | * | 1978 | * |
| 1979 | */ | 1979 | */ |
| 1980 | 1980 | ||
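The ordering described in the comment above, a store-release on ->on_cpu paired with a spinning load-acquire in the waker, can be illustrated with plain C11 atomics. This is only an analogue of smp_store_release()/smp_cond_load_acquire(), not the kernel primitives; prev_cpu(), waker() and task_state are invented names.

```c
/* C11 sketch of store-release paired with a spin-wait load-acquire. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int on_cpu = 1;
static int task_state;			/* made visible by the ordering below */

static void *prev_cpu(void *arg)
{
	task_state = 42;		/* "sched-out X": save the task's state */
	atomic_store_explicit(&on_cpu, 0, memory_order_release);
	return NULL;
}

static void *waker(void *arg)
{
	/* analogue of smp_cond_load_acquire(&X->on_cpu, !VAL) */
	while (atomic_load_explicit(&on_cpu, memory_order_acquire))
		;
	printf("saw task_state=%d\n", task_state);	/* guaranteed 42 */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, prev_cpu, NULL);
	pthread_create(&b, NULL, waker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}
```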
| @@ -2047,7 +2047,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 2047 | * This ensures that tasks getting woken will be fully ordered against | 2047 | * This ensures that tasks getting woken will be fully ordered against |
| 2048 | * their previous state and preserve Program Order. | 2048 | * their previous state and preserve Program Order. |
| 2049 | */ | 2049 | */ |
| 2050 | smp_cond_acquire(!p->on_cpu); | 2050 | smp_cond_load_acquire(&p->on_cpu, !VAL); |
| 2051 | 2051 | ||
| 2052 | p->sched_contributes_to_load = !!task_contributes_to_load(p); | 2052 | p->sched_contributes_to_load = !!task_contributes_to_load(p); |
| 2053 | p->state = TASK_WAKING; | 2053 | p->state = TASK_WAKING; |
| @@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 2342 | 2342 | ||
| 2343 | __sched_fork(clone_flags, p); | 2343 | __sched_fork(clone_flags, p); |
| 2344 | /* | 2344 | /* |
| 2345 | * We mark the process as running here. This guarantees that | 2345 | * We mark the process as NEW here. This guarantees that |
| 2346 | * nobody will actually run it, and a signal or other external | 2346 | * nobody will actually run it, and a signal or other external |
| 2347 | * event cannot wake it up and insert it on the runqueue either. | 2347 | * event cannot wake it up and insert it on the runqueue either. |
| 2348 | */ | 2348 | */ |
| 2349 | p->state = TASK_RUNNING; | 2349 | p->state = TASK_NEW; |
| 2350 | 2350 | ||
| 2351 | /* | 2351 | /* |
| 2352 | * Make sure we do not leak PI boosting priority to the child. | 2352 | * Make sure we do not leak PI boosting priority to the child. |
| @@ -2383,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 2383 | p->sched_class = &fair_sched_class; | 2383 | p->sched_class = &fair_sched_class; |
| 2384 | } | 2384 | } |
| 2385 | 2385 | ||
| 2386 | if (p->sched_class->task_fork) | 2386 | init_entity_runnable_average(&p->se); |
| 2387 | p->sched_class->task_fork(p); | ||
| 2388 | 2387 | ||
| 2389 | /* | 2388 | /* |
| 2390 | * The child is not yet in the pid-hash so no cgroup attach races, | 2389 | * The child is not yet in the pid-hash so no cgroup attach races, |
| @@ -2394,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 2394 | * Silence PROVE_RCU. | 2393 | * Silence PROVE_RCU. |
| 2395 | */ | 2394 | */ |
| 2396 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2395 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
| 2397 | set_task_cpu(p, cpu); | 2396 | /* |
| 2397 | * We're setting the cpu for the first time, we don't migrate, | ||
| 2398 | * so use __set_task_cpu(). | ||
| 2399 | */ | ||
| 2400 | __set_task_cpu(p, cpu); | ||
| 2401 | if (p->sched_class->task_fork) | ||
| 2402 | p->sched_class->task_fork(p); | ||
| 2398 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 2403 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 2399 | 2404 | ||
| 2400 | #ifdef CONFIG_SCHED_INFO | 2405 | #ifdef CONFIG_SCHED_INFO |
| @@ -2526,16 +2531,18 @@ void wake_up_new_task(struct task_struct *p) | |||
| 2526 | struct rq_flags rf; | 2531 | struct rq_flags rf; |
| 2527 | struct rq *rq; | 2532 | struct rq *rq; |
| 2528 | 2533 | ||
| 2529 | /* Initialize new task's runnable average */ | ||
| 2530 | init_entity_runnable_average(&p->se); | ||
| 2531 | raw_spin_lock_irqsave(&p->pi_lock, rf.flags); | 2534 | raw_spin_lock_irqsave(&p->pi_lock, rf.flags); |
| 2535 | p->state = TASK_RUNNING; | ||
| 2532 | #ifdef CONFIG_SMP | 2536 | #ifdef CONFIG_SMP |
| 2533 | /* | 2537 | /* |
| 2534 | * Fork balancing, do it here and not earlier because: | 2538 | * Fork balancing, do it here and not earlier because: |
| 2535 | * - cpus_allowed can change in the fork path | 2539 | * - cpus_allowed can change in the fork path |
| 2536 | * - any previously selected cpu might disappear through hotplug | 2540 | * - any previously selected cpu might disappear through hotplug |
| 2541 | * | ||
| 2542 | * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, | ||
| 2543 | * as we're not fully set-up yet. | ||
| 2537 | */ | 2544 | */ |
| 2538 | set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); | 2545 | __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); |
| 2539 | #endif | 2546 | #endif |
| 2540 | rq = __task_rq_lock(p, &rf); | 2547 | rq = __task_rq_lock(p, &rf); |
| 2541 | post_init_entity_util_avg(&p->se); | 2548 | post_init_entity_util_avg(&p->se); |
| @@ -3161,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
| 3161 | pr_cont("\n"); | 3168 | pr_cont("\n"); |
| 3162 | } | 3169 | } |
| 3163 | #endif | 3170 | #endif |
| 3171 | if (panic_on_warn) | ||
| 3172 | panic("scheduling while atomic\n"); | ||
| 3173 | |||
| 3164 | dump_stack(); | 3174 | dump_stack(); |
| 3165 | add_taint(TAINT_WARN, LOCKDEP_STILL_OK); | 3175 | add_taint(TAINT_WARN, LOCKDEP_STILL_OK); |
| 3166 | } | 3176 | } |
| @@ -4752,7 +4762,8 @@ out_unlock: | |||
| 4752 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 4762 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
| 4753 | * @user_mask_ptr: user-space pointer to hold the current cpu mask | 4763 | * @user_mask_ptr: user-space pointer to hold the current cpu mask |
| 4754 | * | 4764 | * |
| 4755 | * Return: 0 on success. An error code otherwise. | 4765 | * Return: size of CPU mask copied to user_mask_ptr on success. An |
| 4766 | * error code otherwise. | ||
| 4756 | */ | 4767 | */ |
| 4757 | SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | 4768 | SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, |
| 4758 | unsigned long __user *, user_mask_ptr) | 4769 | unsigned long __user *, user_mask_ptr) |
| @@ -7233,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) | |||
| 7233 | struct rq *rq = cpu_rq(cpu); | 7244 | struct rq *rq = cpu_rq(cpu); |
| 7234 | 7245 | ||
| 7235 | rq->calc_load_update = calc_load_update; | 7246 | rq->calc_load_update = calc_load_update; |
| 7236 | account_reset_rq(rq); | ||
| 7237 | update_max_interval(); | 7247 | update_max_interval(); |
| 7238 | } | 7248 | } |
| 7239 | 7249 | ||
| @@ -7713,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) | |||
| 7713 | INIT_LIST_HEAD(&tg->children); | 7723 | INIT_LIST_HEAD(&tg->children); |
| 7714 | list_add_rcu(&tg->siblings, &parent->children); | 7724 | list_add_rcu(&tg->siblings, &parent->children); |
| 7715 | spin_unlock_irqrestore(&task_group_lock, flags); | 7725 | spin_unlock_irqrestore(&task_group_lock, flags); |
| 7726 | |||
| 7727 | online_fair_sched_group(tg); | ||
| 7716 | } | 7728 | } |
| 7717 | 7729 | ||
| 7718 | /* rcu callback to free various structures associated with a task group */ | 7730 | /* rcu callback to free various structures associated with a task group */ |
| @@ -7741,27 +7753,9 @@ void sched_offline_group(struct task_group *tg) | |||
| 7741 | spin_unlock_irqrestore(&task_group_lock, flags); | 7753 | spin_unlock_irqrestore(&task_group_lock, flags); |
| 7742 | } | 7754 | } |
| 7743 | 7755 | ||
| 7744 | /* change task's runqueue when it moves between groups. | 7756 | static void sched_change_group(struct task_struct *tsk, int type) |
| 7745 | * The caller of this function should have put the task in its new group | ||
| 7746 | * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to | ||
| 7747 | * reflect its new group. | ||
| 7748 | */ | ||
| 7749 | void sched_move_task(struct task_struct *tsk) | ||
| 7750 | { | 7757 | { |
| 7751 | struct task_group *tg; | 7758 | struct task_group *tg; |
| 7752 | int queued, running; | ||
| 7753 | struct rq_flags rf; | ||
| 7754 | struct rq *rq; | ||
| 7755 | |||
| 7756 | rq = task_rq_lock(tsk, &rf); | ||
| 7757 | |||
| 7758 | running = task_current(rq, tsk); | ||
| 7759 | queued = task_on_rq_queued(tsk); | ||
| 7760 | |||
| 7761 | if (queued) | ||
| 7762 | dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); | ||
| 7763 | if (unlikely(running)) | ||
| 7764 | put_prev_task(rq, tsk); | ||
| 7765 | 7759 | ||
| 7766 | /* | 7760 | /* |
| 7767 | * All callers are synchronized by task_rq_lock(); we do not use RCU | 7761 | * All callers are synchronized by task_rq_lock(); we do not use RCU |
| @@ -7774,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk) | |||
| 7774 | tsk->sched_task_group = tg; | 7768 | tsk->sched_task_group = tg; |
| 7775 | 7769 | ||
| 7776 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7770 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7777 | if (tsk->sched_class->task_move_group) | 7771 | if (tsk->sched_class->task_change_group) |
| 7778 | tsk->sched_class->task_move_group(tsk); | 7772 | tsk->sched_class->task_change_group(tsk, type); |
| 7779 | else | 7773 | else |
| 7780 | #endif | 7774 | #endif |
| 7781 | set_task_rq(tsk, task_cpu(tsk)); | 7775 | set_task_rq(tsk, task_cpu(tsk)); |
| 7776 | } | ||
| 7777 | |||
| 7778 | /* | ||
| 7779 | * Change task's runqueue when it moves between groups. | ||
| 7780 | * | ||
| 7781 | * The caller of this function should have put the task in its new group by | ||
| 7782 | * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect | ||
| 7783 | * its new group. | ||
| 7784 | */ | ||
| 7785 | void sched_move_task(struct task_struct *tsk) | ||
| 7786 | { | ||
| 7787 | int queued, running; | ||
| 7788 | struct rq_flags rf; | ||
| 7789 | struct rq *rq; | ||
| 7790 | |||
| 7791 | rq = task_rq_lock(tsk, &rf); | ||
| 7792 | |||
| 7793 | running = task_current(rq, tsk); | ||
| 7794 | queued = task_on_rq_queued(tsk); | ||
| 7795 | |||
| 7796 | if (queued) | ||
| 7797 | dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); | ||
| 7798 | if (unlikely(running)) | ||
| 7799 | put_prev_task(rq, tsk); | ||
| 7800 | |||
| 7801 | sched_change_group(tsk, TASK_MOVE_GROUP); | ||
| 7782 | 7802 | ||
| 7783 | if (unlikely(running)) | 7803 | if (unlikely(running)) |
| 7784 | tsk->sched_class->set_curr_task(rq); | 7804 | tsk->sched_class->set_curr_task(rq); |
| @@ -8206,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) | |||
| 8206 | sched_free_group(tg); | 8226 | sched_free_group(tg); |
| 8207 | } | 8227 | } |
| 8208 | 8228 | ||
| 8229 | /* | ||
| 8230 | * This is called before wake_up_new_task(), therefore we really only | ||
| 8231 | * have to set its group bits, all the other stuff does not apply. | ||
| 8232 | */ | ||
| 8209 | static void cpu_cgroup_fork(struct task_struct *task) | 8233 | static void cpu_cgroup_fork(struct task_struct *task) |
| 8210 | { | 8234 | { |
| 8211 | sched_move_task(task); | 8235 | struct rq_flags rf; |
| 8236 | struct rq *rq; | ||
| 8237 | |||
| 8238 | rq = task_rq_lock(task, &rf); | ||
| 8239 | |||
| 8240 | sched_change_group(task, TASK_SET_GROUP); | ||
| 8241 | |||
| 8242 | task_rq_unlock(rq, task, &rf); | ||
| 8212 | } | 8243 | } |
| 8213 | 8244 | ||
| 8214 | static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) | 8245 | static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) |
| 8215 | { | 8246 | { |
| 8216 | struct task_struct *task; | 8247 | struct task_struct *task; |
| 8217 | struct cgroup_subsys_state *css; | 8248 | struct cgroup_subsys_state *css; |
| 8249 | int ret = 0; | ||
| 8218 | 8250 | ||
| 8219 | cgroup_taskset_for_each(task, css, tset) { | 8251 | cgroup_taskset_for_each(task, css, tset) { |
| 8220 | #ifdef CONFIG_RT_GROUP_SCHED | 8252 | #ifdef CONFIG_RT_GROUP_SCHED |
| @@ -8225,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) | |||
| 8225 | if (task->sched_class != &fair_sched_class) | 8257 | if (task->sched_class != &fair_sched_class) |
| 8226 | return -EINVAL; | 8258 | return -EINVAL; |
| 8227 | #endif | 8259 | #endif |
| 8260 | /* | ||
| 8261 | * Serialize against wake_up_new_task() such that if it's | ||
| 8262 | * running, we're sure to observe its full state. | ||
| 8263 | */ | ||
| 8264 | raw_spin_lock_irq(&task->pi_lock); | ||
| 8265 | /* | ||
| 8266 | * Avoid calling sched_move_task() before wake_up_new_task() | ||
| 8267 | * has happened. This would lead to problems with PELT, due to | ||
| 8268 | * move wanting to detach+attach while we're not attached yet. | ||
| 8269 | */ | ||
| 8270 | if (task->state == TASK_NEW) | ||
| 8271 | ret = -EINVAL; | ||
| 8272 | raw_spin_unlock_irq(&task->pi_lock); | ||
| 8273 | |||
| 8274 | if (ret) | ||
| 8275 | break; | ||
| 8228 | } | 8276 | } |
| 8229 | return 0; | 8277 | return ret; |
| 8230 | } | 8278 | } |
| 8231 | 8279 | ||
| 8232 | static void cpu_cgroup_attach(struct cgroup_taskset *tset) | 8280 | static void cpu_cgroup_attach(struct cgroup_taskset *tset) |
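The can_attach change above refuses tasks that are still TASK_NEW, taking the same lock as the wakeup path so the check cannot race with the state transition. Here is a hedged miniature of that guard; task_t, attach() and finish_fork() are illustrative names for the pi_lock/state handshake between cpu_cgroup_can_attach() and wake_up_new_task().

```c
/* Sketch: reject attach while the object is still in its NEW state. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

enum task_state { TASK_NEW, TASK_RUNNING };

typedef struct {
	pthread_mutex_t pi_lock;
	enum task_state state;
} task_t;

static int attach(task_t *p)
{
	int ret = 0;

	pthread_mutex_lock(&p->pi_lock);
	if (p->state == TASK_NEW)	/* not fully set up yet */
		ret = -EINVAL;
	pthread_mutex_unlock(&p->pi_lock);
	return ret;
}

static void finish_fork(task_t *p)	/* wake_up_new_task() analogue */
{
	pthread_mutex_lock(&p->pi_lock);
	p->state = TASK_RUNNING;
	pthread_mutex_unlock(&p->pi_lock);
}

int main(void)
{
	task_t p = { PTHREAD_MUTEX_INITIALIZER, TASK_NEW };

	printf("attach before wakeup: %d\n", attach(&p));	/* -EINVAL */
	finish_fork(&p);
	printf("attach after wakeup:  %d\n", attach(&p));	/* 0 */
	return 0;
}
```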
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 41f85c4d0938..bc0b309c3f19 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
| @@ -25,15 +25,13 @@ enum cpuacct_stat_index { | |||
| 25 | CPUACCT_STAT_NSTATS, | 25 | CPUACCT_STAT_NSTATS, |
| 26 | }; | 26 | }; |
| 27 | 27 | ||
| 28 | enum cpuacct_usage_index { | 28 | static const char * const cpuacct_stat_desc[] = { |
| 29 | CPUACCT_USAGE_USER, /* ... user mode */ | 29 | [CPUACCT_STAT_USER] = "user", |
| 30 | CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ | 30 | [CPUACCT_STAT_SYSTEM] = "system", |
| 31 | |||
| 32 | CPUACCT_USAGE_NRUSAGE, | ||
| 33 | }; | 31 | }; |
| 34 | 32 | ||
| 35 | struct cpuacct_usage { | 33 | struct cpuacct_usage { |
| 36 | u64 usages[CPUACCT_USAGE_NRUSAGE]; | 34 | u64 usages[CPUACCT_STAT_NSTATS]; |
| 37 | }; | 35 | }; |
| 38 | 36 | ||
| 39 | /* track cpu usage of a group of tasks and its child groups */ | 37 | /* track cpu usage of a group of tasks and its child groups */ |
| @@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) | |||
| 108 | } | 106 | } |
| 109 | 107 | ||
| 110 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, | 108 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, |
| 111 | enum cpuacct_usage_index index) | 109 | enum cpuacct_stat_index index) |
| 112 | { | 110 | { |
| 113 | struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | 111 | struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); |
| 114 | u64 data; | 112 | u64 data; |
| 115 | 113 | ||
| 116 | /* | 114 | /* |
| 117 | * We allow index == CPUACCT_USAGE_NRUSAGE here to read | 115 | * We allow index == CPUACCT_STAT_NSTATS here to read |
| 118 | * the sum of usages. | 116 |
| 119 | */ | 117 | */ |
| 120 | BUG_ON(index > CPUACCT_USAGE_NRUSAGE); | 118 | BUG_ON(index > CPUACCT_STAT_NSTATS); |
| 121 | 119 | ||
| 122 | #ifndef CONFIG_64BIT | 120 | #ifndef CONFIG_64BIT |
| 123 | /* | 121 | /* |
| @@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, | |||
| 126 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | 124 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); |
| 127 | #endif | 125 | #endif |
| 128 | 126 | ||
| 129 | if (index == CPUACCT_USAGE_NRUSAGE) { | 127 | if (index == CPUACCT_STAT_NSTATS) { |
| 130 | int i = 0; | 128 | int i = 0; |
| 131 | 129 | ||
| 132 | data = 0; | 130 | data = 0; |
| 133 | for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) | 131 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) |
| 134 | data += cpuusage->usages[i]; | 132 | data += cpuusage->usages[i]; |
| 135 | } else { | 133 | } else { |
| 136 | data = cpuusage->usages[index]; | 134 | data = cpuusage->usages[index]; |
| @@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | |||
| 155 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | 153 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); |
| 156 | #endif | 154 | #endif |
| 157 | 155 | ||
| 158 | for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) | 156 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) |
| 159 | cpuusage->usages[i] = val; | 157 | cpuusage->usages[i] = val; |
| 160 | 158 | ||
| 161 | #ifndef CONFIG_64BIT | 159 | #ifndef CONFIG_64BIT |
| @@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | |||
| 165 | 163 | ||
| 166 | /* return total cpu usage (in nanoseconds) of a group */ | 164 | /* return total cpu usage (in nanoseconds) of a group */ |
| 167 | static u64 __cpuusage_read(struct cgroup_subsys_state *css, | 165 | static u64 __cpuusage_read(struct cgroup_subsys_state *css, |
| 168 | enum cpuacct_usage_index index) | 166 | enum cpuacct_stat_index index) |
| 169 | { | 167 | { |
| 170 | struct cpuacct *ca = css_ca(css); | 168 | struct cpuacct *ca = css_ca(css); |
| 171 | u64 totalcpuusage = 0; | 169 | u64 totalcpuusage = 0; |
| @@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css, | |||
| 180 | static u64 cpuusage_user_read(struct cgroup_subsys_state *css, | 178 | static u64 cpuusage_user_read(struct cgroup_subsys_state *css, |
| 181 | struct cftype *cft) | 179 | struct cftype *cft) |
| 182 | { | 180 | { |
| 183 | return __cpuusage_read(css, CPUACCT_USAGE_USER); | 181 | return __cpuusage_read(css, CPUACCT_STAT_USER); |
| 184 | } | 182 | } |
| 185 | 183 | ||
| 186 | static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, | 184 | static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, |
| 187 | struct cftype *cft) | 185 | struct cftype *cft) |
| 188 | { | 186 | { |
| 189 | return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); | 187 | return __cpuusage_read(css, CPUACCT_STAT_SYSTEM); |
| 190 | } | 188 | } |
| 191 | 189 | ||
| 192 | static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) | 190 | static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) |
| 193 | { | 191 | { |
| 194 | return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); | 192 | return __cpuusage_read(css, CPUACCT_STAT_NSTATS); |
| 195 | } | 193 | } |
| 196 | 194 | ||
| 197 | static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, | 195 | static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, |
| @@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, | |||
| 213 | } | 211 | } |
| 214 | 212 | ||
| 215 | static int __cpuacct_percpu_seq_show(struct seq_file *m, | 213 | static int __cpuacct_percpu_seq_show(struct seq_file *m, |
| 216 | enum cpuacct_usage_index index) | 214 | enum cpuacct_stat_index index) |
| 217 | { | 215 | { |
| 218 | struct cpuacct *ca = css_ca(seq_css(m)); | 216 | struct cpuacct *ca = css_ca(seq_css(m)); |
| 219 | u64 percpu; | 217 | u64 percpu; |
| @@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m, | |||
| 229 | 227 | ||
| 230 | static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) | 228 | static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) |
| 231 | { | 229 | { |
| 232 | return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); | 230 | return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER); |
| 233 | } | 231 | } |
| 234 | 232 | ||
| 235 | static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) | 233 | static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) |
| 236 | { | 234 | { |
| 237 | return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); | 235 | return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM); |
| 238 | } | 236 | } |
| 239 | 237 | ||
| 240 | static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) | 238 | static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) |
| 241 | { | 239 | { |
| 242 | return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); | 240 | return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS); |
| 243 | } | 241 | } |
| 244 | 242 | ||
| 245 | static const char * const cpuacct_stat_desc[] = { | 243 | static int cpuacct_all_seq_show(struct seq_file *m, void *V) |
| 246 | [CPUACCT_STAT_USER] = "user", | 244 | { |
| 247 | [CPUACCT_STAT_SYSTEM] = "system", | 245 | struct cpuacct *ca = css_ca(seq_css(m)); |
| 248 | }; | 246 | int index; |
| 247 | int cpu; | ||
| 248 | |||
| 249 | seq_puts(m, "cpu"); | ||
| 250 | for (index = 0; index < CPUACCT_STAT_NSTATS; index++) | ||
| 251 | seq_printf(m, " %s", cpuacct_stat_desc[index]); | ||
| 252 | seq_puts(m, "\n"); | ||
| 253 | |||
| 254 | for_each_possible_cpu(cpu) { | ||
| 255 | struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
| 256 | |||
| 257 | seq_printf(m, "%d", cpu); | ||
| 258 | |||
| 259 | for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { | ||
| 260 | #ifndef CONFIG_64BIT | ||
| 261 | /* | ||
| 262 | * Take rq->lock to make 64-bit read safe on 32-bit | ||
| 263 | * platforms. | ||
| 264 | */ | ||
| 265 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
| 266 | #endif | ||
| 267 | |||
| 268 | seq_printf(m, " %llu", cpuusage->usages[index]); | ||
| 269 | |||
| 270 | #ifndef CONFIG_64BIT | ||
| 271 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
| 272 | #endif | ||
| 273 | } | ||
| 274 | seq_puts(m, "\n"); | ||
| 275 | } | ||
| 276 | return 0; | ||
| 277 | } | ||
| 249 | 278 | ||
| 250 | static int cpuacct_stats_show(struct seq_file *sf, void *v) | 279 | static int cpuacct_stats_show(struct seq_file *sf, void *v) |
| 251 | { | 280 | { |
| 252 | struct cpuacct *ca = css_ca(seq_css(sf)); | 281 | struct cpuacct *ca = css_ca(seq_css(sf)); |
| 282 | s64 val[CPUACCT_STAT_NSTATS]; | ||
| 253 | int cpu; | 283 | int cpu; |
| 254 | s64 val = 0; | 284 | int stat; |
| 255 | 285 | ||
| 286 | memset(val, 0, sizeof(val)); | ||
| 256 | for_each_possible_cpu(cpu) { | 287 | for_each_possible_cpu(cpu) { |
| 257 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | 288 | u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; |
| 258 | val += kcpustat->cpustat[CPUTIME_USER]; | ||
| 259 | val += kcpustat->cpustat[CPUTIME_NICE]; | ||
| 260 | } | ||
| 261 | val = cputime64_to_clock_t(val); | ||
| 262 | seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); | ||
| 263 | 289 | ||
| 264 | val = 0; | 290 | val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; |
| 265 | for_each_possible_cpu(cpu) { | 291 | val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; |
| 266 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | 292 | val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; |
| 267 | val += kcpustat->cpustat[CPUTIME_SYSTEM]; | 293 | val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; |
| 268 | val += kcpustat->cpustat[CPUTIME_IRQ]; | 294 | val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; |
| 269 | val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; | ||
| 270 | } | 295 | } |
| 271 | 296 | ||
| 272 | val = cputime64_to_clock_t(val); | 297 | for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { |
| 273 | seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | 298 | seq_printf(sf, "%s %lld\n", |
| 299 | cpuacct_stat_desc[stat], | ||
| 300 | cputime64_to_clock_t(val[stat])); | ||
| 301 | } | ||
| 274 | 302 | ||
| 275 | return 0; | 303 | return 0; |
| 276 | } | 304 | } |
| @@ -302,6 +330,10 @@ static struct cftype files[] = { | |||
| 302 | .seq_show = cpuacct_percpu_sys_seq_show, | 330 | .seq_show = cpuacct_percpu_sys_seq_show, |
| 303 | }, | 331 | }, |
| 304 | { | 332 | { |
| 333 | .name = "usage_all", | ||
| 334 | .seq_show = cpuacct_all_seq_show, | ||
| 335 | }, | ||
| 336 | { | ||
| 305 | .name = "stat", | 337 | .name = "stat", |
| 306 | .seq_show = cpuacct_stats_show, | 338 | .seq_show = cpuacct_stats_show, |
| 307 | }, | 339 | }, |
| @@ -316,11 +348,11 @@ static struct cftype files[] = { | |||
| 316 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) | 348 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) |
| 317 | { | 349 | { |
| 318 | struct cpuacct *ca; | 350 | struct cpuacct *ca; |
| 319 | int index = CPUACCT_USAGE_SYSTEM; | 351 | int index = CPUACCT_STAT_SYSTEM; |
| 320 | struct pt_regs *regs = task_pt_regs(tsk); | 352 | struct pt_regs *regs = task_pt_regs(tsk); |
| 321 | 353 | ||
| 322 | if (regs && user_mode(regs)) | 354 | if (regs && user_mode(regs)) |
| 323 | index = CPUACCT_USAGE_USER; | 355 | index = CPUACCT_STAT_USER; |
| 324 | 356 | ||
| 325 | rcu_read_lock(); | 357 | rcu_read_lock(); |
| 326 | 358 | ||
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 14c4aa25cc45..a84641b222c1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
| @@ -47,6 +47,8 @@ struct sugov_cpu { | |||
| 47 | struct update_util_data update_util; | 47 | struct update_util_data update_util; |
| 48 | struct sugov_policy *sg_policy; | 48 | struct sugov_policy *sg_policy; |
| 49 | 49 | ||
| 50 | unsigned int cached_raw_freq; | ||
| 51 | |||
| 50 | /* The fields below are only needed when sharing a policy. */ | 52 | /* The fields below are only needed when sharing a policy. */ |
| 51 | unsigned long util; | 53 | unsigned long util; |
| 52 | unsigned long max; | 54 | unsigned long max; |
| @@ -106,7 +108,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, | |||
| 106 | 108 | ||
| 107 | /** | 109 | /** |
| 108 | * get_next_freq - Compute a new frequency for a given cpufreq policy. | 110 | * get_next_freq - Compute a new frequency for a given cpufreq policy. |
| 109 | * @policy: cpufreq policy object to compute the new frequency for. | 111 | * @sg_cpu: schedutil cpu object to compute the new frequency for. |
| 110 | * @util: Current CPU utilization. | 112 | * @util: Current CPU utilization. |
| 111 | * @max: CPU capacity. | 113 | * @max: CPU capacity. |
| 112 | * | 114 | * |
| @@ -121,14 +123,25 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, | |||
| 121 | * next_freq = C * curr_freq * util_raw / max | 123 | * next_freq = C * curr_freq * util_raw / max |
| 122 | * | 124 | * |
| 123 | * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. | 125 | * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. |
| 126 | * | ||
| 127 | * The lowest driver-supported frequency which is equal to or greater than the raw | ||
| 128 | * next_freq (as calculated above) is returned, subject to policy min/max and | ||
| 129 | * cpufreq driver limitations. | ||
| 124 | */ | 130 | */ |
| 125 | static unsigned int get_next_freq(struct cpufreq_policy *policy, | 131 | static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, |
| 126 | unsigned long util, unsigned long max) | 132 | unsigned long max) |
| 127 | { | 133 | { |
| 134 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | ||
| 135 | struct cpufreq_policy *policy = sg_policy->policy; | ||
| 128 | unsigned int freq = arch_scale_freq_invariant() ? | 136 | unsigned int freq = arch_scale_freq_invariant() ? |
| 129 | policy->cpuinfo.max_freq : policy->cur; | 137 | policy->cpuinfo.max_freq : policy->cur; |
| 130 | 138 | ||
| 131 | return (freq + (freq >> 2)) * util / max; | 139 | freq = (freq + (freq >> 2)) * util / max; |
| 140 | |||
| 141 | if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) | ||
| 142 | return sg_policy->next_freq; | ||
| 143 | sg_cpu->cached_raw_freq = freq; | ||
| 144 | return cpufreq_driver_resolve_freq(policy, freq); | ||
| 132 | } | 145 | } |
| 133 | 146 | ||
| 134 | static void sugov_update_single(struct update_util_data *hook, u64 time, | 147 | static void sugov_update_single(struct update_util_data *hook, u64 time, |
| @@ -143,13 +156,14 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, | |||
| 143 | return; | 156 | return; |
| 144 | 157 | ||
| 145 | next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : | 158 | next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : |
| 146 | get_next_freq(policy, util, max); | 159 | get_next_freq(sg_cpu, util, max); |
| 147 | sugov_update_commit(sg_policy, time, next_f); | 160 | sugov_update_commit(sg_policy, time, next_f); |
| 148 | } | 161 | } |
| 149 | 162 | ||
| 150 | static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy, | 163 | static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, |
| 151 | unsigned long util, unsigned long max) | 164 | unsigned long util, unsigned long max) |
| 152 | { | 165 | { |
| 166 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | ||
| 153 | struct cpufreq_policy *policy = sg_policy->policy; | 167 | struct cpufreq_policy *policy = sg_policy->policy; |
| 154 | unsigned int max_f = policy->cpuinfo.max_freq; | 168 | unsigned int max_f = policy->cpuinfo.max_freq; |
| 155 | u64 last_freq_update_time = sg_policy->last_freq_update_time; | 169 | u64 last_freq_update_time = sg_policy->last_freq_update_time; |
| @@ -189,7 +203,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy, | |||
| 189 | } | 203 | } |
| 190 | } | 204 | } |
| 191 | 205 | ||
| 192 | return get_next_freq(policy, util, max); | 206 | return get_next_freq(sg_cpu, util, max); |
| 193 | } | 207 | } |
| 194 | 208 | ||
| 195 | static void sugov_update_shared(struct update_util_data *hook, u64 time, | 209 | static void sugov_update_shared(struct update_util_data *hook, u64 time, |
| @@ -206,7 +220,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, | |||
| 206 | sg_cpu->last_update = time; | 220 | sg_cpu->last_update = time; |
| 207 | 221 | ||
| 208 | if (sugov_should_update_freq(sg_policy, time)) { | 222 | if (sugov_should_update_freq(sg_policy, time)) { |
| 209 | next_f = sugov_next_freq_shared(sg_policy, util, max); | 223 | next_f = sugov_next_freq_shared(sg_cpu, util, max); |
| 210 | sugov_update_commit(sg_policy, time, next_f); | 224 | sugov_update_commit(sg_policy, time, next_f); |
| 211 | } | 225 | } |
| 212 | 226 | ||
| @@ -394,7 +408,7 @@ static int sugov_init(struct cpufreq_policy *policy) | |||
| 394 | return ret; | 408 | return ret; |
| 395 | } | 409 | } |
| 396 | 410 | ||
| 397 | static int sugov_exit(struct cpufreq_policy *policy) | 411 | static void sugov_exit(struct cpufreq_policy *policy) |
| 398 | { | 412 | { |
| 399 | struct sugov_policy *sg_policy = policy->governor_data; | 413 | struct sugov_policy *sg_policy = policy->governor_data; |
| 400 | struct sugov_tunables *tunables = sg_policy->tunables; | 414 | struct sugov_tunables *tunables = sg_policy->tunables; |
| @@ -412,7 +426,6 @@ static int sugov_exit(struct cpufreq_policy *policy) | |||
| 412 | mutex_unlock(&global_tunables_lock); | 426 | mutex_unlock(&global_tunables_lock); |
| 413 | 427 | ||
| 414 | sugov_policy_free(sg_policy); | 428 | sugov_policy_free(sg_policy); |
| 415 | return 0; | ||
| 416 | } | 429 | } |
| 417 | 430 | ||
| 418 | static int sugov_start(struct cpufreq_policy *policy) | 431 | static int sugov_start(struct cpufreq_policy *policy) |
| @@ -434,6 +447,7 @@ static int sugov_start(struct cpufreq_policy *policy) | |||
| 434 | sg_cpu->util = ULONG_MAX; | 447 | sg_cpu->util = ULONG_MAX; |
| 435 | sg_cpu->max = 0; | 448 | sg_cpu->max = 0; |
| 436 | sg_cpu->last_update = 0; | 449 | sg_cpu->last_update = 0; |
| 450 | sg_cpu->cached_raw_freq = 0; | ||
| 437 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, | 451 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, |
| 438 | sugov_update_shared); | 452 | sugov_update_shared); |
| 439 | } else { | 453 | } else { |
| @@ -444,7 +458,7 @@ static int sugov_start(struct cpufreq_policy *policy) | |||
| 444 | return 0; | 458 | return 0; |
| 445 | } | 459 | } |
| 446 | 460 | ||
| 447 | static int sugov_stop(struct cpufreq_policy *policy) | 461 | static void sugov_stop(struct cpufreq_policy *policy) |
| 448 | { | 462 | { |
| 449 | struct sugov_policy *sg_policy = policy->governor_data; | 463 | struct sugov_policy *sg_policy = policy->governor_data; |
| 450 | unsigned int cpu; | 464 | unsigned int cpu; |
| @@ -456,53 +470,29 @@ static int sugov_stop(struct cpufreq_policy *policy) | |||
| 456 | 470 | ||
| 457 | irq_work_sync(&sg_policy->irq_work); | 471 | irq_work_sync(&sg_policy->irq_work); |
| 458 | cancel_work_sync(&sg_policy->work); | 472 | cancel_work_sync(&sg_policy->work); |
| 459 | return 0; | ||
| 460 | } | 473 | } |
| 461 | 474 | ||
| 462 | static int sugov_limits(struct cpufreq_policy *policy) | 475 | static void sugov_limits(struct cpufreq_policy *policy) |
| 463 | { | 476 | { |
| 464 | struct sugov_policy *sg_policy = policy->governor_data; | 477 | struct sugov_policy *sg_policy = policy->governor_data; |
| 465 | 478 | ||
| 466 | if (!policy->fast_switch_enabled) { | 479 | if (!policy->fast_switch_enabled) { |
| 467 | mutex_lock(&sg_policy->work_lock); | 480 | mutex_lock(&sg_policy->work_lock); |
| 468 | 481 | cpufreq_policy_apply_limits(policy); | |
| 469 | if (policy->max < policy->cur) | ||
| 470 | __cpufreq_driver_target(policy, policy->max, | ||
| 471 | CPUFREQ_RELATION_H); | ||
| 472 | else if (policy->min > policy->cur) | ||
| 473 | __cpufreq_driver_target(policy, policy->min, | ||
| 474 | CPUFREQ_RELATION_L); | ||
| 475 | |||
| 476 | mutex_unlock(&sg_policy->work_lock); | 482 | mutex_unlock(&sg_policy->work_lock); |
| 477 | } | 483 | } |
| 478 | 484 | ||
| 479 | sg_policy->need_freq_update = true; | 485 | sg_policy->need_freq_update = true; |
| 480 | return 0; | ||
| 481 | } | ||
| 482 | |||
| 483 | int sugov_governor(struct cpufreq_policy *policy, unsigned int event) | ||
| 484 | { | ||
| 485 | if (event == CPUFREQ_GOV_POLICY_INIT) { | ||
| 486 | return sugov_init(policy); | ||
| 487 | } else if (policy->governor_data) { | ||
| 488 | switch (event) { | ||
| 489 | case CPUFREQ_GOV_POLICY_EXIT: | ||
| 490 | return sugov_exit(policy); | ||
| 491 | case CPUFREQ_GOV_START: | ||
| 492 | return sugov_start(policy); | ||
| 493 | case CPUFREQ_GOV_STOP: | ||
| 494 | return sugov_stop(policy); | ||
| 495 | case CPUFREQ_GOV_LIMITS: | ||
| 496 | return sugov_limits(policy); | ||
| 497 | } | ||
| 498 | } | ||
| 499 | return -EINVAL; | ||
| 500 | } | 486 | } |
| 501 | 487 | ||
| 502 | static struct cpufreq_governor schedutil_gov = { | 488 | static struct cpufreq_governor schedutil_gov = { |
| 503 | .name = "schedutil", | 489 | .name = "schedutil", |
| 504 | .governor = sugov_governor, | ||
| 505 | .owner = THIS_MODULE, | 490 | .owner = THIS_MODULE, |
| 491 | .init = sugov_init, | ||
| 492 | .exit = sugov_exit, | ||
| 493 | .start = sugov_start, | ||
| 494 | .stop = sugov_stop, | ||
| 495 | .limits = sugov_limits, | ||
| 506 | }; | 496 | }; |
| 507 | 497 | ||
| 508 | static int __init sugov_module_init(void) | 498 | static int __init sugov_module_init(void) |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 75f98c5498d5..1934f658c036 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
| @@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); | |||
| 49 | */ | 49 | */ |
| 50 | void irqtime_account_irq(struct task_struct *curr) | 50 | void irqtime_account_irq(struct task_struct *curr) |
| 51 | { | 51 | { |
| 52 | unsigned long flags; | ||
| 53 | s64 delta; | 52 | s64 delta; |
| 54 | int cpu; | 53 | int cpu; |
| 55 | 54 | ||
| 56 | if (!sched_clock_irqtime) | 55 | if (!sched_clock_irqtime) |
| 57 | return; | 56 | return; |
| 58 | 57 | ||
| 59 | local_irq_save(flags); | ||
| 60 | |||
| 61 | cpu = smp_processor_id(); | 58 | cpu = smp_processor_id(); |
| 62 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | 59 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); |
| 63 | __this_cpu_add(irq_start_time, delta); | 60 | __this_cpu_add(irq_start_time, delta); |
| @@ -75,44 +72,53 @@ void irqtime_account_irq(struct task_struct *curr) | |||
| 75 | __this_cpu_add(cpu_softirq_time, delta); | 72 | __this_cpu_add(cpu_softirq_time, delta); |
| 76 | 73 | ||
| 77 | irq_time_write_end(); | 74 | irq_time_write_end(); |
| 78 | local_irq_restore(flags); | ||
| 79 | } | 75 | } |
| 80 | EXPORT_SYMBOL_GPL(irqtime_account_irq); | 76 | EXPORT_SYMBOL_GPL(irqtime_account_irq); |
| 81 | 77 | ||
| 82 | static int irqtime_account_hi_update(void) | 78 | static cputime_t irqtime_account_hi_update(cputime_t maxtime) |
| 83 | { | 79 | { |
| 84 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 80 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
| 85 | unsigned long flags; | 81 | unsigned long flags; |
| 86 | u64 latest_ns; | 82 | cputime_t irq_cputime; |
| 87 | int ret = 0; | ||
| 88 | 83 | ||
| 89 | local_irq_save(flags); | 84 | local_irq_save(flags); |
| 90 | latest_ns = this_cpu_read(cpu_hardirq_time); | 85 | irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - |
| 91 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) | 86 | cpustat[CPUTIME_IRQ]; |
| 92 | ret = 1; | 87 | irq_cputime = min(irq_cputime, maxtime); |
| 88 | cpustat[CPUTIME_IRQ] += irq_cputime; | ||
| 93 | local_irq_restore(flags); | 89 | local_irq_restore(flags); |
| 94 | return ret; | 90 | return irq_cputime; |
| 95 | } | 91 | } |
| 96 | 92 | ||
| 97 | static int irqtime_account_si_update(void) | 93 | static cputime_t irqtime_account_si_update(cputime_t maxtime) |
| 98 | { | 94 | { |
| 99 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 95 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
| 100 | unsigned long flags; | 96 | unsigned long flags; |
| 101 | u64 latest_ns; | 97 | cputime_t softirq_cputime; |
| 102 | int ret = 0; | ||
| 103 | 98 | ||
| 104 | local_irq_save(flags); | 99 | local_irq_save(flags); |
| 105 | latest_ns = this_cpu_read(cpu_softirq_time); | 100 | softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - |
| 106 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) | 101 | cpustat[CPUTIME_SOFTIRQ]; |
| 107 | ret = 1; | 102 | softirq_cputime = min(softirq_cputime, maxtime); |
| 103 | cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; | ||
| 108 | local_irq_restore(flags); | 104 | local_irq_restore(flags); |
| 109 | return ret; | 105 | return softirq_cputime; |
| 110 | } | 106 | } |
| 111 | 107 | ||
| 112 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 108 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
| 113 | 109 | ||
| 114 | #define sched_clock_irqtime (0) | 110 | #define sched_clock_irqtime (0) |
| 115 | 111 | ||
| 112 | static cputime_t irqtime_account_hi_update(cputime_t dummy) | ||
| 113 | { | ||
| 114 | return 0; | ||
| 115 | } | ||
| 116 | |||
| 117 | static cputime_t irqtime_account_si_update(cputime_t dummy) | ||
| 118 | { | ||
| 119 | return 0; | ||
| 120 | } | ||
| 121 | |||
| 116 | #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ | 122 | #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ |
| 117 | 123 | ||
| 118 | static inline void task_group_account_field(struct task_struct *p, int index, | 124 | static inline void task_group_account_field(struct task_struct *p, int index, |
| @@ -257,29 +263,42 @@ void account_idle_time(cputime_t cputime) | |||
| 257 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; | 263 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; |
| 258 | } | 264 | } |
| 259 | 265 | ||
| 260 | static __always_inline bool steal_account_process_tick(void) | 266 | static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) |
| 261 | { | 267 | { |
| 262 | #ifdef CONFIG_PARAVIRT | 268 | #ifdef CONFIG_PARAVIRT |
| 263 | if (static_key_false(¶virt_steal_enabled)) { | 269 | if (static_key_false(¶virt_steal_enabled)) { |
| 270 | cputime_t steal_cputime; | ||
| 264 | u64 steal; | 271 | u64 steal; |
| 265 | unsigned long steal_jiffies; | ||
| 266 | 272 | ||
| 267 | steal = paravirt_steal_clock(smp_processor_id()); | 273 | steal = paravirt_steal_clock(smp_processor_id()); |
| 268 | steal -= this_rq()->prev_steal_time; | 274 | steal -= this_rq()->prev_steal_time; |
| 269 | 275 | ||
| 270 | /* | 276 | steal_cputime = min(nsecs_to_cputime(steal), maxtime); |
| 271 | * steal is in nsecs but our caller is expecting steal | 277 | account_steal_time(steal_cputime); |
| 272 | * time in jiffies. Lets cast the result to jiffies | 278 | this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime); |
| 273 | * granularity and account the rest on the next rounds. | ||
| 274 | */ | ||
| 275 | steal_jiffies = nsecs_to_jiffies(steal); | ||
| 276 | this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); | ||
| 277 | 279 | ||
| 278 | account_steal_time(jiffies_to_cputime(steal_jiffies)); | 280 | return steal_cputime; |
| 279 | return steal_jiffies; | ||
| 280 | } | 281 | } |
| 281 | #endif | 282 | #endif |
| 282 | return false; | 283 | return 0; |
| 284 | } | ||
| 285 | |||
| 286 | /* | ||
| 287 | * Account how much elapsed time was spent in steal, irq, or softirq time. | ||
| 288 | */ | ||
| 289 | static inline cputime_t account_other_time(cputime_t max) | ||
| 290 | { | ||
| 291 | cputime_t accounted; | ||
| 292 | |||
| 293 | accounted = steal_account_process_time(max); | ||
| 294 | |||
| 295 | if (accounted < max) | ||
| 296 | accounted += irqtime_account_hi_update(max - accounted); | ||
| 297 | |||
| 298 | if (accounted < max) | ||
| 299 | accounted += irqtime_account_si_update(max - accounted); | ||
| 300 | |||
| 301 | return accounted; | ||
| 283 | } | 302 | } |
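account_other_time() above clamps steal, hardirq and softirq time so that together they never exceed the elapsed time being accounted. The following sketch shows only that budgeted min-and-accumulate arithmetic; clamp_add() and the pending_* inputs are invented, and in the kernel any clamped-off remainder simply stays pending and is accounted on a later update.

```c
/* Sketch of budgeted accounting: each source is clamped to what is left. */
#include <stdio.h>

typedef unsigned long long cputime_t;

static cputime_t clamp_add(cputime_t pending, cputime_t budget, cputime_t *acct)
{
	cputime_t t = pending < budget ? pending : budget;	/* min() */

	*acct += t;
	return t;
}

int main(void)
{
	cputime_t max = 10;			/* time elapsed this update */
	cputime_t steal_acct = 0, irq_acct = 0, softirq_acct = 0;
	cputime_t pending_steal = 4, pending_irq = 5, pending_softirq = 3;
	cputime_t accounted;

	accounted  = clamp_add(pending_steal, max, &steal_acct);
	if (accounted < max)
		accounted += clamp_add(pending_irq, max - accounted, &irq_acct);
	if (accounted < max)
		accounted += clamp_add(pending_softirq, max - accounted,
				       &softirq_acct);

	/* steal=4 irq=5 softirq=1: the softirq remainder waits for a later tick */
	printf("steal=%llu irq=%llu softirq=%llu remaining=%llu\n",
	       steal_acct, irq_acct, softirq_acct, max - accounted);
	return 0;
}
```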
| 284 | 303 | ||
| 285 | /* | 304 | /* |
| @@ -342,21 +361,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
| 342 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | 361 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
| 343 | struct rq *rq, int ticks) | 362 | struct rq *rq, int ticks) |
| 344 | { | 363 | { |
| 345 | cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); | 364 | u64 cputime = (__force u64) cputime_one_jiffy * ticks; |
| 346 | u64 cputime = (__force u64) cputime_one_jiffy; | 365 | cputime_t scaled, other; |
| 347 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 348 | 366 | ||
| 349 | if (steal_account_process_tick()) | 367 | /* |
| 368 | * When returning from idle, many ticks can get accounted at | ||
| 369 | * once, including some ticks of steal, irq, and softirq time. | ||
| 370 | * Subtract those ticks from the amount of time accounted to | ||
| 371 | * idle, or potentially user or system time. Due to rounding, | ||
| 372 | * other time can exceed ticks occasionally. | ||
| 373 | */ | ||
| 374 | other = account_other_time(cputime); | ||
| 375 | if (other >= cputime) | ||
| 350 | return; | 376 | return; |
| 377 | cputime -= other; | ||
| 378 | scaled = cputime_to_scaled(cputime); | ||
| 351 | 379 | ||
| 352 | cputime *= ticks; | 380 | if (this_cpu_ksoftirqd() == p) { |
| 353 | scaled *= ticks; | ||
| 354 | |||
| 355 | if (irqtime_account_hi_update()) { | ||
| 356 | cpustat[CPUTIME_IRQ] += cputime; | ||
| 357 | } else if (irqtime_account_si_update()) { | ||
| 358 | cpustat[CPUTIME_SOFTIRQ] += cputime; | ||
| 359 | } else if (this_cpu_ksoftirqd() == p) { | ||
| 360 | /* | 381 | /* |
| 361 | * ksoftirqd time do not get accounted in cpu_softirq_time. | 382 | * ksoftirqd time do not get accounted in cpu_softirq_time. |
| 362 | * So, we have to handle it separately here. | 383 | * So, we have to handle it separately here. |
| @@ -406,6 +427,10 @@ void vtime_common_task_switch(struct task_struct *prev) | |||
| 406 | } | 427 | } |
| 407 | #endif | 428 | #endif |
| 408 | 429 | ||
| 430 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | ||
| 431 | |||
| 432 | |||
| 433 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | ||
| 409 | /* | 434 | /* |
| 410 | * Archs that account the whole time spent in the idle task | 435 | * Archs that account the whole time spent in the idle task |
| 411 | * (outside irq) as idle time can rely on this and just implement | 436 | * (outside irq) as idle time can rely on this and just implement |
| @@ -415,33 +440,16 @@ void vtime_common_task_switch(struct task_struct *prev) | |||
| 415 | * vtime_account(). | 440 | * vtime_account(). |
| 416 | */ | 441 | */ |
| 417 | #ifndef __ARCH_HAS_VTIME_ACCOUNT | 442 | #ifndef __ARCH_HAS_VTIME_ACCOUNT |
| 418 | void vtime_common_account_irq_enter(struct task_struct *tsk) | 443 | void vtime_account_irq_enter(struct task_struct *tsk) |
| 419 | { | 444 | { |
| 420 | if (!in_interrupt()) { | 445 | if (!in_interrupt() && is_idle_task(tsk)) |
| 421 | /* | 446 | vtime_account_idle(tsk); |
| 422 | * If we interrupted user, context_tracking_in_user() | 447 | else |
| 423 | * is 1 because the context tracking don't hook | 448 | vtime_account_system(tsk); |
| 424 | * on irq entry/exit. This way we know if | ||
| 425 | * we need to flush user time on kernel entry. | ||
| 426 | */ | ||
| 427 | if (context_tracking_in_user()) { | ||
| 428 | vtime_account_user(tsk); | ||
| 429 | return; | ||
| 430 | } | ||
| 431 | |||
| 432 | if (is_idle_task(tsk)) { | ||
| 433 | vtime_account_idle(tsk); | ||
| 434 | return; | ||
| 435 | } | ||
| 436 | } | ||
| 437 | vtime_account_system(tsk); | ||
| 438 | } | 449 | } |
| 439 | EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); | 450 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); |
| 440 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | 451 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ |
| 441 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | ||
| 442 | 452 | ||
| 443 | |||
| 444 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | ||
| 445 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 453 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
| 446 | { | 454 | { |
| 447 | *ut = p->utime; | 455 | *ut = p->utime; |
| @@ -466,7 +474,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime | |||
| 466 | */ | 474 | */ |
| 467 | void account_process_tick(struct task_struct *p, int user_tick) | 475 | void account_process_tick(struct task_struct *p, int user_tick) |
| 468 | { | 476 | { |
| 469 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 477 | cputime_t cputime, scaled, steal; |
| 470 | struct rq *rq = this_rq(); | 478 | struct rq *rq = this_rq(); |
| 471 | 479 | ||
| 472 | if (vtime_accounting_cpu_enabled()) | 480 | if (vtime_accounting_cpu_enabled()) |
| @@ -477,26 +485,21 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
| 477 | return; | 485 | return; |
| 478 | } | 486 | } |
| 479 | 487 | ||
| 480 | if (steal_account_process_tick()) | 488 | cputime = cputime_one_jiffy; |
| 489 | steal = steal_account_process_time(cputime); | ||
| 490 | |||
| 491 | if (steal >= cputime) | ||
| 481 | return; | 492 | return; |
| 482 | 493 | ||
| 494 | cputime -= steal; | ||
| 495 | scaled = cputime_to_scaled(cputime); | ||
| 496 | |||
| 483 | if (user_tick) | 497 | if (user_tick) |
| 484 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 498 | account_user_time(p, cputime, scaled); |
| 485 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 499 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
| 486 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, | 500 | account_system_time(p, HARDIRQ_OFFSET, cputime, scaled); |
| 487 | one_jiffy_scaled); | ||
| 488 | else | 501 | else |
| 489 | account_idle_time(cputime_one_jiffy); | 502 | account_idle_time(cputime); |
| 490 | } | ||
| 491 | |||
| 492 | /* | ||
| 493 | * Account multiple ticks of steal time. | ||
| 494 | * @p: the process from which the cpu time has been stolen | ||
| 495 | * @ticks: number of stolen ticks | ||
| 496 | */ | ||
| 497 | void account_steal_ticks(unsigned long ticks) | ||
| 498 | { | ||
| 499 | account_steal_time(jiffies_to_cputime(ticks)); | ||
| 500 | } | 503 | } |
| 501 | 504 | ||
| 502 | /* | 505 | /* |
| @@ -681,12 +684,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) | |||
| 681 | static cputime_t get_vtime_delta(struct task_struct *tsk) | 684 | static cputime_t get_vtime_delta(struct task_struct *tsk) |
| 682 | { | 685 | { |
| 683 | unsigned long now = READ_ONCE(jiffies); | 686 | unsigned long now = READ_ONCE(jiffies); |
| 684 | unsigned long delta = now - tsk->vtime_snap; | 687 | cputime_t delta, other; |
| 685 | 688 | ||
| 689 | delta = jiffies_to_cputime(now - tsk->vtime_snap); | ||
| 690 | other = account_other_time(delta); | ||
| 686 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); | 691 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); |
| 687 | tsk->vtime_snap = now; | 692 | tsk->vtime_snap = now; |
| 688 | 693 | ||
| 689 | return jiffies_to_cputime(delta); | 694 | return delta - other; |
| 690 | } | 695 | } |
| 691 | 696 | ||
| 692 | static void __vtime_account_system(struct task_struct *tsk) | 697 | static void __vtime_account_system(struct task_struct *tsk) |
| @@ -706,16 +711,6 @@ void vtime_account_system(struct task_struct *tsk) | |||
| 706 | write_seqcount_end(&tsk->vtime_seqcount); | 711 | write_seqcount_end(&tsk->vtime_seqcount); |
| 707 | } | 712 | } |
| 708 | 713 | ||
| 709 | void vtime_gen_account_irq_exit(struct task_struct *tsk) | ||
| 710 | { | ||
| 711 | write_seqcount_begin(&tsk->vtime_seqcount); | ||
| 712 | if (vtime_delta(tsk)) | ||
| 713 | __vtime_account_system(tsk); | ||
| 714 | if (context_tracking_in_user()) | ||
| 715 | tsk->vtime_snap_whence = VTIME_USER; | ||
| 716 | write_seqcount_end(&tsk->vtime_seqcount); | ||
| 717 | } | ||
| 718 | |||
| 719 | void vtime_account_user(struct task_struct *tsk) | 714 | void vtime_account_user(struct task_struct *tsk) |
| 720 | { | 715 | { |
| 721 | cputime_t delta_cpu; | 716 | cputime_t delta_cpu; |
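Editor's note: the cputime.c hunks above replace the old boolean steal_account_process_tick() with helpers that each consume at most the remaining time budget, and account_other_time() chains them so the total never exceeds the elapsed time. Below is a minimal, standalone sketch of that capping pattern; the *_sample() helpers and the userspace wrapper are invented for illustration and are not kernel APIs.

        /*
         * Standalone illustration of the capping pattern used by
         * account_other_time(): each source may consume at most the time
         * still left of 'max'.  The three *_sample() helpers are
         * hypothetical stand-ins for the real steal/irq/softirq sources.
         */
        #include <stdio.h>

        typedef unsigned long long cputime_t;

        static cputime_t steal_sample(void)   { return 3; }
        static cputime_t hardirq_sample(void) { return 4; }
        static cputime_t softirq_sample(void) { return 2; }

        static cputime_t clamp_to(cputime_t val, cputime_t max)
        {
                return val < max ? val : max;
        }

        static cputime_t account_other_time_demo(cputime_t max)
        {
                cputime_t accounted;

                accounted = clamp_to(steal_sample(), max);
                if (accounted < max)
                        accounted += clamp_to(hardirq_sample(), max - accounted);
                if (accounted < max)
                        accounted += clamp_to(softirq_sample(), max - accounted);

                return accounted;       /* never exceeds max */
        }

        int main(void)
        {
                cputime_t tick = 5;
                cputime_t other = account_other_time_demo(tick);

                /* whatever is left is charged to user/system/idle */
                printf("other=%llu, remaining=%llu\n", other, tick - other);
                return 0;
        }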
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0368c393a336..2a0a9995256d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 879 | 879 | ||
| 880 | nr_switches = p->nvcsw + p->nivcsw; | 880 | nr_switches = p->nvcsw + p->nivcsw; |
| 881 | 881 | ||
| 882 | #ifdef CONFIG_SCHEDSTATS | ||
| 883 | P(se.nr_migrations); | 882 | P(se.nr_migrations); |
| 884 | 883 | ||
| 884 | #ifdef CONFIG_SCHEDSTATS | ||
| 885 | if (schedstat_enabled()) { | 885 | if (schedstat_enabled()) { |
| 886 | u64 avg_atom, avg_per_cpu; | 886 | u64 avg_atom, avg_per_cpu; |
| 887 | 887 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c8c5d2d48424..4088eedea763 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -690,6 +690,11 @@ void init_entity_runnable_average(struct sched_entity *se) | |||
| 690 | /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ | 690 | /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ |
| 691 | } | 691 | } |
| 692 | 692 | ||
| 693 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); | ||
| 694 | static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); | ||
| 695 | static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); | ||
| 696 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); | ||
| 697 | |||
| 693 | /* | 698 | /* |
| 694 | * With new tasks being created, their initial util_avgs are extrapolated | 699 | * With new tasks being created, their initial util_avgs are extrapolated |
| 695 | * based on the cfs_rq's current util_avg: | 700 | * based on the cfs_rq's current util_avg: |
| @@ -720,6 +725,8 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
| 720 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 725 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 721 | struct sched_avg *sa = &se->avg; | 726 | struct sched_avg *sa = &se->avg; |
| 722 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; | 727 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; |
| 728 | u64 now = cfs_rq_clock_task(cfs_rq); | ||
| 729 | int tg_update; | ||
| 723 | 730 | ||
| 724 | if (cap > 0) { | 731 | if (cap > 0) { |
| 725 | if (cfs_rq->avg.util_avg != 0) { | 732 | if (cfs_rq->avg.util_avg != 0) { |
| @@ -733,16 +740,42 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
| 733 | } | 740 | } |
| 734 | sa->util_sum = sa->util_avg * LOAD_AVG_MAX; | 741 | sa->util_sum = sa->util_avg * LOAD_AVG_MAX; |
| 735 | } | 742 | } |
| 743 | |||
| 744 | if (entity_is_task(se)) { | ||
| 745 | struct task_struct *p = task_of(se); | ||
| 746 | if (p->sched_class != &fair_sched_class) { | ||
| 747 | /* | ||
| 748 | * For !fair tasks do: | ||
| 749 | * | ||
| 750 | update_cfs_rq_load_avg(now, cfs_rq, false); | ||
| 751 | attach_entity_load_avg(cfs_rq, se); | ||
| 752 | switched_from_fair(rq, p); | ||
| 753 | * | ||
| 754 | * such that the next switched_to_fair() has the | ||
| 755 | * expected state. | ||
| 756 | */ | ||
| 757 | se->avg.last_update_time = now; | ||
| 758 | return; | ||
| 759 | } | ||
| 760 | } | ||
| 761 | |||
| 762 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | ||
| 763 | attach_entity_load_avg(cfs_rq, se); | ||
| 764 | if (tg_update) | ||
| 765 | update_tg_load_avg(cfs_rq, false); | ||
| 736 | } | 766 | } |
| 737 | 767 | ||
| 738 | #else | 768 | #else /* !CONFIG_SMP */ |
| 739 | void init_entity_runnable_average(struct sched_entity *se) | 769 | void init_entity_runnable_average(struct sched_entity *se) |
| 740 | { | 770 | { |
| 741 | } | 771 | } |
| 742 | void post_init_entity_util_avg(struct sched_entity *se) | 772 | void post_init_entity_util_avg(struct sched_entity *se) |
| 743 | { | 773 | { |
| 744 | } | 774 | } |
| 745 | #endif | 775 | static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) |
| 776 | { | ||
| 777 | } | ||
| 778 | #endif /* CONFIG_SMP */ | ||
| 746 | 779 | ||
| 747 | /* | 780 | /* |
| 748 | * Update the current task's runtime statistics. | 781 | * Update the current task's runtime statistics. |
| @@ -1303,6 +1336,8 @@ static void task_numa_assign(struct task_numa_env *env, | |||
| 1303 | { | 1336 | { |
| 1304 | if (env->best_task) | 1337 | if (env->best_task) |
| 1305 | put_task_struct(env->best_task); | 1338 | put_task_struct(env->best_task); |
| 1339 | if (p) | ||
| 1340 | get_task_struct(p); | ||
| 1306 | 1341 | ||
| 1307 | env->best_task = p; | 1342 | env->best_task = p; |
| 1308 | env->best_imp = imp; | 1343 | env->best_imp = imp; |
| @@ -1370,31 +1405,11 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1370 | long imp = env->p->numa_group ? groupimp : taskimp; | 1405 | long imp = env->p->numa_group ? groupimp : taskimp; |
| 1371 | long moveimp = imp; | 1406 | long moveimp = imp; |
| 1372 | int dist = env->dist; | 1407 | int dist = env->dist; |
| 1373 | bool assigned = false; | ||
| 1374 | 1408 | ||
| 1375 | rcu_read_lock(); | 1409 | rcu_read_lock(); |
| 1376 | 1410 | cur = task_rcu_dereference(&dst_rq->curr); | |
| 1377 | raw_spin_lock_irq(&dst_rq->lock); | 1411 | if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) |
| 1378 | cur = dst_rq->curr; | ||
| 1379 | /* | ||
| 1380 | * No need to move the exiting task or idle task. | ||
| 1381 | */ | ||
| 1382 | if ((cur->flags & PF_EXITING) || is_idle_task(cur)) | ||
| 1383 | cur = NULL; | 1412 | cur = NULL; |
| 1384 | else { | ||
| 1385 | /* | ||
| 1386 | * The task_struct must be protected here to protect the | ||
| 1387 | * p->numa_faults access in the task_weight since the | ||
| 1388 | * numa_faults could already be freed in the following path: | ||
| 1389 | * finish_task_switch() | ||
| 1390 | * --> put_task_struct() | ||
| 1391 | * --> __put_task_struct() | ||
| 1392 | * --> task_numa_free() | ||
| 1393 | */ | ||
| 1394 | get_task_struct(cur); | ||
| 1395 | } | ||
| 1396 | |||
| 1397 | raw_spin_unlock_irq(&dst_rq->lock); | ||
| 1398 | 1413 | ||
| 1399 | /* | 1414 | /* |
| 1400 | * Because we have preemption enabled we can get migrated around and | 1415 | * Because we have preemption enabled we can get migrated around and |
| @@ -1477,7 +1492,6 @@ balance: | |||
| 1477 | */ | 1492 | */ |
| 1478 | if (!load_too_imbalanced(src_load, dst_load, env)) { | 1493 | if (!load_too_imbalanced(src_load, dst_load, env)) { |
| 1479 | imp = moveimp - 1; | 1494 | imp = moveimp - 1; |
| 1480 | put_task_struct(cur); | ||
| 1481 | cur = NULL; | 1495 | cur = NULL; |
| 1482 | goto assign; | 1496 | goto assign; |
| 1483 | } | 1497 | } |
| @@ -1503,16 +1517,9 @@ balance: | |||
| 1503 | env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); | 1517 | env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); |
| 1504 | 1518 | ||
| 1505 | assign: | 1519 | assign: |
| 1506 | assigned = true; | ||
| 1507 | task_numa_assign(env, cur, imp); | 1520 | task_numa_assign(env, cur, imp); |
| 1508 | unlock: | 1521 | unlock: |
| 1509 | rcu_read_unlock(); | 1522 | rcu_read_unlock(); |
| 1510 | /* | ||
| 1511 | * The dst_rq->curr isn't assigned. The protection for task_struct is | ||
| 1512 | * finished. | ||
| 1513 | */ | ||
| 1514 | if (cur && !assigned) | ||
| 1515 | put_task_struct(cur); | ||
| 1516 | } | 1523 | } |
| 1517 | 1524 | ||
| 1518 | static void task_numa_find_cpu(struct task_numa_env *env, | 1525 | static void task_numa_find_cpu(struct task_numa_env *env, |
| @@ -2866,8 +2873,6 @@ void set_task_rq_fair(struct sched_entity *se, | |||
| 2866 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} | 2873 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} |
| 2867 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 2874 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 2868 | 2875 | ||
| 2869 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); | ||
| 2870 | |||
| 2871 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | 2876 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) |
| 2872 | { | 2877 | { |
| 2873 | struct rq *rq = rq_of(cfs_rq); | 2878 | struct rq *rq = rq_of(cfs_rq); |
| @@ -2914,7 +2919,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
| 2914 | WRITE_ONCE(*ptr, res); \ | 2919 | WRITE_ONCE(*ptr, res); \ |
| 2915 | } while (0) | 2920 | } while (0) |
| 2916 | 2921 | ||
| 2917 | /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ | 2922 | /** |
| 2923 | * update_cfs_rq_load_avg - update the cfs_rq's load/util averages | ||
| 2924 | * @now: current time, as per cfs_rq_clock_task() | ||
| 2925 | * @cfs_rq: cfs_rq to update | ||
| 2926 | * @update_freq: should we call cfs_rq_util_change() or will the call do so | ||
| 2927 | * | ||
| 2928 | * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) | ||
| 2929 | * avg. The immediate corollary is that all (fair) tasks must be attached, see | ||
| 2930 | * post_init_entity_util_avg(). | ||
| 2931 | * | ||
| 2932 | * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. | ||
| 2933 | * | ||
| 2934 | * Returns true if the load decayed or we removed utilization. It is expected | ||
| 2935 | * that one calls update_tg_load_avg() on this condition, but after you've | ||
| 2936 | * modified the cfs_rq avg (attach/detach), such that we propagate the new | ||
| 2937 | * avg up. | ||
| 2938 | */ | ||
| 2918 | static inline int | 2939 | static inline int |
| 2919 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | 2940 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) |
| 2920 | { | 2941 | { |
| @@ -2969,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) | |||
| 2969 | update_tg_load_avg(cfs_rq, 0); | 2990 | update_tg_load_avg(cfs_rq, 0); |
| 2970 | } | 2991 | } |
| 2971 | 2992 | ||
| 2993 | /** | ||
| 2994 | * attach_entity_load_avg - attach this entity to its cfs_rq load avg | ||
| 2995 | * @cfs_rq: cfs_rq to attach to | ||
| 2996 | * @se: sched_entity to attach | ||
| 2997 | * | ||
| 2998 | * Must call update_cfs_rq_load_avg() before this, since we rely on | ||
| 2999 | * cfs_rq->avg.last_update_time being current. | ||
| 3000 | */ | ||
| 2972 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3001 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 2973 | { | 3002 | { |
| 2974 | if (!sched_feat(ATTACH_AGE_LOAD)) | 3003 | if (!sched_feat(ATTACH_AGE_LOAD)) |
| @@ -2977,6 +3006,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
| 2977 | /* | 3006 | /* |
| 2978 | * If we got migrated (either between CPUs or between cgroups) we'll | 3007 | * If we got migrated (either between CPUs or between cgroups) we'll |
| 2979 | * have aged the average right before clearing @last_update_time. | 3008 | * have aged the average right before clearing @last_update_time. |
| 3009 | * | ||
| 3010 | * Or we're fresh through post_init_entity_util_avg(). | ||
| 2980 | */ | 3011 | */ |
| 2981 | if (se->avg.last_update_time) { | 3012 | if (se->avg.last_update_time) { |
| 2982 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), | 3013 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), |
| @@ -2998,6 +3029,14 @@ skip_aging: | |||
| 2998 | cfs_rq_util_change(cfs_rq); | 3029 | cfs_rq_util_change(cfs_rq); |
| 2999 | } | 3030 | } |
| 3000 | 3031 | ||
| 3032 | /** | ||
| 3033 | * detach_entity_load_avg - detach this entity from its cfs_rq load avg | ||
| 3034 | * @cfs_rq: cfs_rq to detach from | ||
| 3035 | * @se: sched_entity to detach | ||
| 3036 | * | ||
| 3037 | * Must call update_cfs_rq_load_avg() before this, since we rely on | ||
| 3038 | * cfs_rq->avg.last_update_time being current. | ||
| 3039 | */ | ||
| 3001 | static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3040 | static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 3002 | { | 3041 | { |
| 3003 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), | 3042 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), |
| @@ -3082,11 +3121,14 @@ void remove_entity_load_avg(struct sched_entity *se) | |||
| 3082 | u64 last_update_time; | 3121 | u64 last_update_time; |
| 3083 | 3122 | ||
| 3084 | /* | 3123 | /* |
| 3085 | * Newly created task or never used group entity should not be removed | 3124 | * tasks cannot exit without having gone through wake_up_new_task() -> |
| 3086 | * from its (source) cfs_rq | 3125 | * post_init_entity_util_avg() which will have added things to the |
| 3126 | * cfs_rq, so we can remove unconditionally. | ||
| 3127 | * | ||
| 3128 | * Similarly for groups, they will have passed through | ||
| 3129 | * post_init_entity_util_avg() before unregister_sched_fair_group() | ||
| 3130 | * calls this. | ||
| 3087 | */ | 3131 | */ |
| 3088 | if (se->avg.last_update_time == 0) | ||
| 3089 | return; | ||
| 3090 | 3132 | ||
| 3091 | last_update_time = cfs_rq_last_update_time(cfs_rq); | 3133 | last_update_time = cfs_rq_last_update_time(cfs_rq); |
| 3092 | 3134 | ||
| @@ -3109,6 +3151,12 @@ static int idle_balance(struct rq *this_rq); | |||
| 3109 | 3151 | ||
| 3110 | #else /* CONFIG_SMP */ | 3152 | #else /* CONFIG_SMP */ |
| 3111 | 3153 | ||
| 3154 | static inline int | ||
| 3155 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | ||
| 3156 | { | ||
| 3157 | return 0; | ||
| 3158 | } | ||
| 3159 | |||
| 3112 | static inline void update_load_avg(struct sched_entity *se, int not_used) | 3160 | static inline void update_load_avg(struct sched_entity *se, int not_used) |
| 3113 | { | 3161 | { |
| 3114 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 3162 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| @@ -3698,7 +3746,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | |||
| 3698 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | 3746 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) |
| 3699 | { | 3747 | { |
| 3700 | if (unlikely(cfs_rq->throttle_count)) | 3748 | if (unlikely(cfs_rq->throttle_count)) |
| 3701 | return cfs_rq->throttled_clock_task; | 3749 | return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; |
| 3702 | 3750 | ||
| 3703 | return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; | 3751 | return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; |
| 3704 | } | 3752 | } |
| @@ -3836,13 +3884,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) | |||
| 3836 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | 3884 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; |
| 3837 | 3885 | ||
| 3838 | cfs_rq->throttle_count--; | 3886 | cfs_rq->throttle_count--; |
| 3839 | #ifdef CONFIG_SMP | ||
| 3840 | if (!cfs_rq->throttle_count) { | 3887 | if (!cfs_rq->throttle_count) { |
| 3841 | /* adjust cfs_rq_clock_task() */ | 3888 | /* adjust cfs_rq_clock_task() */ |
| 3842 | cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - | 3889 | cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - |
| 3843 | cfs_rq->throttled_clock_task; | 3890 | cfs_rq->throttled_clock_task; |
| 3844 | } | 3891 | } |
| 3845 | #endif | ||
| 3846 | 3892 | ||
| 3847 | return 0; | 3893 | return 0; |
| 3848 | } | 3894 | } |
| @@ -4195,26 +4241,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | |||
| 4195 | if (!cfs_bandwidth_used()) | 4241 | if (!cfs_bandwidth_used()) |
| 4196 | return; | 4242 | return; |
| 4197 | 4243 | ||
| 4198 | /* Synchronize hierarchical throttle counter: */ | ||
| 4199 | if (unlikely(!cfs_rq->throttle_uptodate)) { | ||
| 4200 | struct rq *rq = rq_of(cfs_rq); | ||
| 4201 | struct cfs_rq *pcfs_rq; | ||
| 4202 | struct task_group *tg; | ||
| 4203 | |||
| 4204 | cfs_rq->throttle_uptodate = 1; | ||
| 4205 | |||
| 4206 | /* Get closest up-to-date node, because leaves go first: */ | ||
| 4207 | for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { | ||
| 4208 | pcfs_rq = tg->cfs_rq[cpu_of(rq)]; | ||
| 4209 | if (pcfs_rq->throttle_uptodate) | ||
| 4210 | break; | ||
| 4211 | } | ||
| 4212 | if (tg) { | ||
| 4213 | cfs_rq->throttle_count = pcfs_rq->throttle_count; | ||
| 4214 | cfs_rq->throttled_clock_task = rq_clock_task(rq); | ||
| 4215 | } | ||
| 4216 | } | ||
| 4217 | |||
| 4218 | /* an active group must be handled by the update_curr()->put() path */ | 4244 | /* an active group must be handled by the update_curr()->put() path */ |
| 4219 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) | 4245 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) |
| 4220 | return; | 4246 | return; |
| @@ -4229,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | |||
| 4229 | throttle_cfs_rq(cfs_rq); | 4255 | throttle_cfs_rq(cfs_rq); |
| 4230 | } | 4256 | } |
| 4231 | 4257 | ||
| 4258 | static void sync_throttle(struct task_group *tg, int cpu) | ||
| 4259 | { | ||
| 4260 | struct cfs_rq *pcfs_rq, *cfs_rq; | ||
| 4261 | |||
| 4262 | if (!cfs_bandwidth_used()) | ||
| 4263 | return; | ||
| 4264 | |||
| 4265 | if (!tg->parent) | ||
| 4266 | return; | ||
| 4267 | |||
| 4268 | cfs_rq = tg->cfs_rq[cpu]; | ||
| 4269 | pcfs_rq = tg->parent->cfs_rq[cpu]; | ||
| 4270 | |||
| 4271 | cfs_rq->throttle_count = pcfs_rq->throttle_count; | ||
| 4272 | pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); | ||
| 4273 | } | ||
| 4274 | |||
| 4232 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ | 4275 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ |
| 4233 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 4276 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
| 4234 | { | 4277 | { |
| @@ -4368,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | |||
| 4368 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} | 4411 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} |
| 4369 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } | 4412 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } |
| 4370 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | 4413 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} |
| 4414 | static inline void sync_throttle(struct task_group *tg, int cpu) {} | ||
| 4371 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 4415 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
| 4372 | 4416 | ||
| 4373 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | 4417 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) |
| @@ -4476,7 +4520,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 4476 | * | 4520 | * |
| 4477 | * note: in the case of encountering a throttled cfs_rq we will | 4521 | * note: in the case of encountering a throttled cfs_rq we will |
| 4478 | * post the final h_nr_running increment below. | 4522 | * post the final h_nr_running increment below. |
| 4479 | */ | 4523 | */ |
| 4480 | if (cfs_rq_throttled(cfs_rq)) | 4524 | if (cfs_rq_throttled(cfs_rq)) |
| 4481 | break; | 4525 | break; |
| 4482 | cfs_rq->h_nr_running++; | 4526 | cfs_rq->h_nr_running++; |
| @@ -8317,31 +8361,17 @@ static void task_fork_fair(struct task_struct *p) | |||
| 8317 | { | 8361 | { |
| 8318 | struct cfs_rq *cfs_rq; | 8362 | struct cfs_rq *cfs_rq; |
| 8319 | struct sched_entity *se = &p->se, *curr; | 8363 | struct sched_entity *se = &p->se, *curr; |
| 8320 | int this_cpu = smp_processor_id(); | ||
| 8321 | struct rq *rq = this_rq(); | 8364 | struct rq *rq = this_rq(); |
| 8322 | unsigned long flags; | ||
| 8323 | |||
| 8324 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 8325 | 8365 | ||
| 8366 | raw_spin_lock(&rq->lock); | ||
| 8326 | update_rq_clock(rq); | 8367 | update_rq_clock(rq); |
| 8327 | 8368 | ||
| 8328 | cfs_rq = task_cfs_rq(current); | 8369 | cfs_rq = task_cfs_rq(current); |
| 8329 | curr = cfs_rq->curr; | 8370 | curr = cfs_rq->curr; |
| 8330 | 8371 | if (curr) { | |
| 8331 | /* | 8372 | update_curr(cfs_rq); |
| 8332 | * Not only the cpu but also the task_group of the parent might have | ||
| 8333 | * been changed after parent->se.parent,cfs_rq were copied to | ||
| 8334 | * child->se.parent,cfs_rq. So call __set_task_cpu() to make those | ||
| 8335 | * of child point to valid ones. | ||
| 8336 | */ | ||
| 8337 | rcu_read_lock(); | ||
| 8338 | __set_task_cpu(p, this_cpu); | ||
| 8339 | rcu_read_unlock(); | ||
| 8340 | |||
| 8341 | update_curr(cfs_rq); | ||
| 8342 | |||
| 8343 | if (curr) | ||
| 8344 | se->vruntime = curr->vruntime; | 8373 | se->vruntime = curr->vruntime; |
| 8374 | } | ||
| 8345 | place_entity(cfs_rq, se, 1); | 8375 | place_entity(cfs_rq, se, 1); |
| 8346 | 8376 | ||
| 8347 | if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { | 8377 | if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { |
| @@ -8354,8 +8384,7 @@ static void task_fork_fair(struct task_struct *p) | |||
| 8354 | } | 8384 | } |
| 8355 | 8385 | ||
| 8356 | se->vruntime -= cfs_rq->min_vruntime; | 8386 | se->vruntime -= cfs_rq->min_vruntime; |
| 8357 | 8387 | raw_spin_unlock(&rq->lock); | |
| 8358 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
| 8359 | } | 8388 | } |
| 8360 | 8389 | ||
| 8361 | /* | 8390 | /* |
| @@ -8411,6 +8440,8 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
| 8411 | { | 8440 | { |
| 8412 | struct sched_entity *se = &p->se; | 8441 | struct sched_entity *se = &p->se; |
| 8413 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8442 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 8443 | u64 now = cfs_rq_clock_task(cfs_rq); | ||
| 8444 | int tg_update; | ||
| 8414 | 8445 | ||
| 8415 | if (!vruntime_normalized(p)) { | 8446 | if (!vruntime_normalized(p)) { |
| 8416 | /* | 8447 | /* |
| @@ -8422,13 +8453,18 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
| 8422 | } | 8453 | } |
| 8423 | 8454 | ||
| 8424 | /* Catch up with the cfs_rq and remove our load when we leave */ | 8455 | /* Catch up with the cfs_rq and remove our load when we leave */ |
| 8456 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | ||
| 8425 | detach_entity_load_avg(cfs_rq, se); | 8457 | detach_entity_load_avg(cfs_rq, se); |
| 8458 | if (tg_update) | ||
| 8459 | update_tg_load_avg(cfs_rq, false); | ||
| 8426 | } | 8460 | } |
| 8427 | 8461 | ||
| 8428 | static void attach_task_cfs_rq(struct task_struct *p) | 8462 | static void attach_task_cfs_rq(struct task_struct *p) |
| 8429 | { | 8463 | { |
| 8430 | struct sched_entity *se = &p->se; | 8464 | struct sched_entity *se = &p->se; |
| 8431 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8465 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 8466 | u64 now = cfs_rq_clock_task(cfs_rq); | ||
| 8467 | int tg_update; | ||
| 8432 | 8468 | ||
| 8433 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8469 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 8434 | /* | 8470 | /* |
| @@ -8439,7 +8475,10 @@ static void attach_task_cfs_rq(struct task_struct *p) | |||
| 8439 | #endif | 8475 | #endif |
| 8440 | 8476 | ||
| 8441 | /* Synchronize task with its cfs_rq */ | 8477 | /* Synchronize task with its cfs_rq */ |
| 8478 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | ||
| 8442 | attach_entity_load_avg(cfs_rq, se); | 8479 | attach_entity_load_avg(cfs_rq, se); |
| 8480 | if (tg_update) | ||
| 8481 | update_tg_load_avg(cfs_rq, false); | ||
| 8443 | 8482 | ||
| 8444 | if (!vruntime_normalized(p)) | 8483 | if (!vruntime_normalized(p)) |
| 8445 | se->vruntime += cfs_rq->min_vruntime; | 8484 | se->vruntime += cfs_rq->min_vruntime; |
| @@ -8499,6 +8538,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 8499 | } | 8538 | } |
| 8500 | 8539 | ||
| 8501 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8540 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 8541 | static void task_set_group_fair(struct task_struct *p) | ||
| 8542 | { | ||
| 8543 | struct sched_entity *se = &p->se; | ||
| 8544 | |||
| 8545 | set_task_rq(p, task_cpu(p)); | ||
| 8546 | se->depth = se->parent ? se->parent->depth + 1 : 0; | ||
| 8547 | } | ||
| 8548 | |||
| 8502 | static void task_move_group_fair(struct task_struct *p) | 8549 | static void task_move_group_fair(struct task_struct *p) |
| 8503 | { | 8550 | { |
| 8504 | detach_task_cfs_rq(p); | 8551 | detach_task_cfs_rq(p); |
| @@ -8511,6 +8558,19 @@ static void task_move_group_fair(struct task_struct *p) | |||
| 8511 | attach_task_cfs_rq(p); | 8558 | attach_task_cfs_rq(p); |
| 8512 | } | 8559 | } |
| 8513 | 8560 | ||
| 8561 | static void task_change_group_fair(struct task_struct *p, int type) | ||
| 8562 | { | ||
| 8563 | switch (type) { | ||
| 8564 | case TASK_SET_GROUP: | ||
| 8565 | task_set_group_fair(p); | ||
| 8566 | break; | ||
| 8567 | |||
| 8568 | case TASK_MOVE_GROUP: | ||
| 8569 | task_move_group_fair(p); | ||
| 8570 | break; | ||
| 8571 | } | ||
| 8572 | } | ||
| 8573 | |||
| 8514 | void free_fair_sched_group(struct task_group *tg) | 8574 | void free_fair_sched_group(struct task_group *tg) |
| 8515 | { | 8575 | { |
| 8516 | int i; | 8576 | int i; |
| @@ -8562,10 +8622,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8562 | init_cfs_rq(cfs_rq); | 8622 | init_cfs_rq(cfs_rq); |
| 8563 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | 8623 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
| 8564 | init_entity_runnable_average(se); | 8624 | init_entity_runnable_average(se); |
| 8565 | |||
| 8566 | raw_spin_lock_irq(&rq->lock); | ||
| 8567 | post_init_entity_util_avg(se); | ||
| 8568 | raw_spin_unlock_irq(&rq->lock); | ||
| 8569 | } | 8625 | } |
| 8570 | 8626 | ||
| 8571 | return 1; | 8627 | return 1; |
| @@ -8576,6 +8632,23 @@ err: | |||
| 8576 | return 0; | 8632 | return 0; |
| 8577 | } | 8633 | } |
| 8578 | 8634 | ||
| 8635 | void online_fair_sched_group(struct task_group *tg) | ||
| 8636 | { | ||
| 8637 | struct sched_entity *se; | ||
| 8638 | struct rq *rq; | ||
| 8639 | int i; | ||
| 8640 | |||
| 8641 | for_each_possible_cpu(i) { | ||
| 8642 | rq = cpu_rq(i); | ||
| 8643 | se = tg->se[i]; | ||
| 8644 | |||
| 8645 | raw_spin_lock_irq(&rq->lock); | ||
| 8646 | post_init_entity_util_avg(se); | ||
| 8647 | sync_throttle(tg, i); | ||
| 8648 | raw_spin_unlock_irq(&rq->lock); | ||
| 8649 | } | ||
| 8650 | } | ||
| 8651 | |||
| 8579 | void unregister_fair_sched_group(struct task_group *tg) | 8652 | void unregister_fair_sched_group(struct task_group *tg) |
| 8580 | { | 8653 | { |
| 8581 | unsigned long flags; | 8654 | unsigned long flags; |
| @@ -8680,6 +8753,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8680 | return 1; | 8753 | return 1; |
| 8681 | } | 8754 | } |
| 8682 | 8755 | ||
| 8756 | void online_fair_sched_group(struct task_group *tg) { } | ||
| 8757 | |||
| 8683 | void unregister_fair_sched_group(struct task_group *tg) { } | 8758 | void unregister_fair_sched_group(struct task_group *tg) { } |
| 8684 | 8759 | ||
| 8685 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8760 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| @@ -8739,7 +8814,7 @@ const struct sched_class fair_sched_class = { | |||
| 8739 | .update_curr = update_curr_fair, | 8814 | .update_curr = update_curr_fair, |
| 8740 | 8815 | ||
| 8741 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8816 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 8742 | .task_move_group = task_move_group_fair, | 8817 | .task_change_group = task_change_group_fair, |
| 8743 | #endif | 8818 | #endif |
| 8744 | }; | 8819 | }; |
| 8745 | 8820 | ||
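Editor's note: several fair.c hunks above repeat the same three-step pattern around attach and detach: refresh the cfs_rq average first (so last_update_time is current), then attach or detach the entity, then propagate to the task group only if the aggregate actually decayed. The toy model below is a hedged userspace illustration of that ordering contract; the toy_* types and helpers are invented and do not correspond to kernel structures.

        #include <stdbool.h>
        #include <stdio.h>

        struct toy_rq  { unsigned long load_avg; unsigned long last_update; };
        struct toy_ent { unsigned long load_avg; };

        /* step 1: bring the aggregate up to 'now'; report whether it changed */
        static bool toy_update_rq(struct toy_rq *rq, unsigned long now)
        {
                bool decayed = (now != rq->last_update);

                rq->last_update = now;          /* decay would happen here */
                return decayed;
        }

        /* step 2: attach relies on last_update being current */
        static void toy_attach(struct toy_rq *rq, struct toy_ent *e)
        {
                rq->load_avg += e->load_avg;
        }

        /* step 3: only propagate upwards when the aggregate changed */
        static void toy_propagate(struct toy_rq *rq)
        {
                printf("propagate load_avg=%lu to parent group\n", rq->load_avg);
        }

        int main(void)
        {
                struct toy_rq rq = { .load_avg = 10, .last_update = 0 };
                struct toy_ent e = { .load_avg = 4 };
                bool decayed = toy_update_rq(&rq, 100);

                toy_attach(&rq, &e);
                if (decayed)
                        toy_propagate(&rq);
                return 0;
        }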
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c5aeedf4e93a..9fb873cfc75c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
| @@ -201,6 +201,8 @@ exit_idle: | |||
| 201 | */ | 201 | */ |
| 202 | static void cpu_idle_loop(void) | 202 | static void cpu_idle_loop(void) |
| 203 | { | 203 | { |
| 204 | int cpu = smp_processor_id(); | ||
| 205 | |||
| 204 | while (1) { | 206 | while (1) { |
| 205 | /* | 207 | /* |
| 206 | * If the arch has a polling bit, we maintain an invariant: | 208 | * If the arch has a polling bit, we maintain an invariant: |
| @@ -219,7 +221,7 @@ static void cpu_idle_loop(void) | |||
| 219 | check_pgt_cache(); | 221 | check_pgt_cache(); |
| 220 | rmb(); | 222 | rmb(); |
| 221 | 223 | ||
| 222 | if (cpu_is_offline(smp_processor_id())) { | 224 | if (cpu_is_offline(cpu)) { |
| 223 | cpuhp_report_idle_dead(); | 225 | cpuhp_report_idle_dead(); |
| 224 | arch_cpu_idle_dead(); | 226 | arch_cpu_idle_dead(); |
| 225 | } | 227 | } |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 898c0d2f18fe..c64fc5114004 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data); | |||
| 321 | 321 | ||
| 322 | extern void free_fair_sched_group(struct task_group *tg); | 322 | extern void free_fair_sched_group(struct task_group *tg); |
| 323 | extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); | 323 | extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); |
| 324 | extern void online_fair_sched_group(struct task_group *tg); | ||
| 324 | extern void unregister_fair_sched_group(struct task_group *tg); | 325 | extern void unregister_fair_sched_group(struct task_group *tg); |
| 325 | extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 326 | extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
| 326 | struct sched_entity *se, int cpu, | 327 | struct sched_entity *se, int cpu, |
| @@ -437,7 +438,7 @@ struct cfs_rq { | |||
| 437 | 438 | ||
| 438 | u64 throttled_clock, throttled_clock_task; | 439 | u64 throttled_clock, throttled_clock_task; |
| 439 | u64 throttled_clock_task_time; | 440 | u64 throttled_clock_task_time; |
| 440 | int throttled, throttle_count, throttle_uptodate; | 441 | int throttled, throttle_count; |
| 441 | struct list_head throttled_list; | 442 | struct list_head throttled_list; |
| 442 | #endif /* CONFIG_CFS_BANDWIDTH */ | 443 | #endif /* CONFIG_CFS_BANDWIDTH */ |
| 443 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 444 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| @@ -1113,7 +1114,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
| 1113 | * In particular, the load of prev->state in finish_task_switch() must | 1114 | * In particular, the load of prev->state in finish_task_switch() must |
| 1114 | * happen before this. | 1115 | * happen before this. |
| 1115 | * | 1116 | * |
| 1116 | * Pairs with the smp_cond_acquire() in try_to_wake_up(). | 1117 | * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). |
| 1117 | */ | 1118 | */ |
| 1118 | smp_store_release(&prev->on_cpu, 0); | 1119 | smp_store_release(&prev->on_cpu, 0); |
| 1119 | #endif | 1120 | #endif |
| @@ -1246,8 +1247,11 @@ struct sched_class { | |||
| 1246 | 1247 | ||
| 1247 | void (*update_curr) (struct rq *rq); | 1248 | void (*update_curr) (struct rq *rq); |
| 1248 | 1249 | ||
| 1250 | #define TASK_SET_GROUP 0 | ||
| 1251 | #define TASK_MOVE_GROUP 1 | ||
| 1252 | |||
| 1249 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1253 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 1250 | void (*task_move_group) (struct task_struct *p); | 1254 | void (*task_change_group) (struct task_struct *p, int type); |
| 1251 | #endif | 1255 | #endif |
| 1252 | }; | 1256 | }; |
| 1253 | 1257 | ||
| @@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {} | |||
| 1809 | #else /* arch_scale_freq_capacity */ | 1813 | #else /* arch_scale_freq_capacity */ |
| 1810 | #define arch_scale_freq_invariant() (false) | 1814 | #define arch_scale_freq_invariant() (false) |
| 1811 | #endif | 1815 | #endif |
| 1812 | |||
| 1813 | static inline void account_reset_rq(struct rq *rq) | ||
| 1814 | { | ||
| 1815 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 1816 | rq->prev_irq_time = 0; | ||
| 1817 | #endif | ||
| 1818 | #ifdef CONFIG_PARAVIRT | ||
| 1819 | rq->prev_steal_time = 0; | ||
| 1820 | #endif | ||
| 1821 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
| 1822 | rq->prev_steal_time_rq = 0; | ||
| 1823 | #endif | ||
| 1824 | } | ||
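Editor's note: with the callback renamed to task_change_group and the TASK_SET_GROUP/TASK_MOVE_GROUP constants added, core code can distinguish a fork-time group assignment from a cgroup migration through one hook. The fragment below sketches how a caller might dispatch through it; the sched_change_group() wrapper and its cgroup lookup are assumptions about the caller, only the callback signature and the two constants come from this diff.

        /* Sketch only: the wrapper name and cgroup lookup are assumptions. */
        static void sched_change_group(struct task_struct *p, int type)
        {
                struct task_group *tg;

                tg = container_of(task_css_check(p, cpu_cgrp_id, true),
                                  struct task_group, css);
                tg = autogroup_task_group(p, tg);
                p->sched_task_group = tg;

        #ifdef CONFIG_FAIR_GROUP_SCHED
                if (p->sched_class->task_change_group)
                        /* TASK_SET_GROUP at fork, TASK_MOVE_GROUP on migration */
                        p->sched_class->task_change_group(p, type);
                else
        #endif
                        set_task_rq(p, task_cpu(p));
        }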
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7002796f14a4..54d15eb2b701 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
| @@ -173,7 +173,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) | |||
| 173 | * | 173 | * |
| 174 | * Returns valid seccomp BPF response codes. | 174 | * Returns valid seccomp BPF response codes. |
| 175 | */ | 175 | */ |
| 176 | static u32 seccomp_run_filters(struct seccomp_data *sd) | 176 | static u32 seccomp_run_filters(const struct seccomp_data *sd) |
| 177 | { | 177 | { |
| 178 | struct seccomp_data sd_local; | 178 | struct seccomp_data sd_local; |
| 179 | u32 ret = SECCOMP_RET_ALLOW; | 179 | u32 ret = SECCOMP_RET_ALLOW; |
| @@ -554,20 +554,10 @@ void secure_computing_strict(int this_syscall) | |||
| 554 | BUG(); | 554 | BUG(); |
| 555 | } | 555 | } |
| 556 | #else | 556 | #else |
| 557 | int __secure_computing(void) | ||
| 558 | { | ||
| 559 | u32 phase1_result = seccomp_phase1(NULL); | ||
| 560 | |||
| 561 | if (likely(phase1_result == SECCOMP_PHASE1_OK)) | ||
| 562 | return 0; | ||
| 563 | else if (likely(phase1_result == SECCOMP_PHASE1_SKIP)) | ||
| 564 | return -1; | ||
| 565 | else | ||
| 566 | return seccomp_phase2(phase1_result); | ||
| 567 | } | ||
| 568 | 557 | ||
| 569 | #ifdef CONFIG_SECCOMP_FILTER | 558 | #ifdef CONFIG_SECCOMP_FILTER |
| 570 | static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) | 559 | static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, |
| 560 | const bool recheck_after_trace) | ||
| 571 | { | 561 | { |
| 572 | u32 filter_ret, action; | 562 | u32 filter_ret, action; |
| 573 | int data; | 563 | int data; |
| @@ -599,10 +589,46 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) | |||
| 599 | goto skip; | 589 | goto skip; |
| 600 | 590 | ||
| 601 | case SECCOMP_RET_TRACE: | 591 | case SECCOMP_RET_TRACE: |
| 602 | return filter_ret; /* Save the rest for phase 2. */ | 592 | /* We've been put in this state by the ptracer already. */ |
| 593 | if (recheck_after_trace) | ||
| 594 | return 0; | ||
| 595 | |||
| 596 | /* ENOSYS these calls if there is no tracer attached. */ | ||
| 597 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { | ||
| 598 | syscall_set_return_value(current, | ||
| 599 | task_pt_regs(current), | ||
| 600 | -ENOSYS, 0); | ||
| 601 | goto skip; | ||
| 602 | } | ||
| 603 | |||
| 604 | /* Allow the BPF to provide the event message */ | ||
| 605 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | ||
| 606 | /* | ||
| 607 | * The delivery of a fatal signal during event | ||
| 608 | * notification may silently skip tracer notification. | ||
| 609 | * Terminating the task now avoids executing a system | ||
| 610 | * call that may not be intended. | ||
| 611 | */ | ||
| 612 | if (fatal_signal_pending(current)) | ||
| 613 | do_exit(SIGSYS); | ||
| 614 | /* Check if the tracer forced the syscall to be skipped. */ | ||
| 615 | this_syscall = syscall_get_nr(current, task_pt_regs(current)); | ||
| 616 | if (this_syscall < 0) | ||
| 617 | goto skip; | ||
| 618 | |||
| 619 | /* | ||
| 620 | * Recheck the syscall, since it may have changed. This | ||
| 621 | * intentionally uses a NULL struct seccomp_data to force | ||
| 622 | * a reload of all registers. This does not goto skip since | ||
| 623 | * a skip would have already been reported. | ||
| 624 | */ | ||
| 625 | if (__seccomp_filter(this_syscall, NULL, true)) | ||
| 626 | return -1; | ||
| 627 | |||
| 628 | return 0; | ||
| 603 | 629 | ||
| 604 | case SECCOMP_RET_ALLOW: | 630 | case SECCOMP_RET_ALLOW: |
| 605 | return SECCOMP_PHASE1_OK; | 631 | return 0; |
| 606 | 632 | ||
| 607 | case SECCOMP_RET_KILL: | 633 | case SECCOMP_RET_KILL: |
| 608 | default: | 634 | default: |
| @@ -614,96 +640,38 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) | |||
| 614 | 640 | ||
| 615 | skip: | 641 | skip: |
| 616 | audit_seccomp(this_syscall, 0, action); | 642 | audit_seccomp(this_syscall, 0, action); |
| 617 | return SECCOMP_PHASE1_SKIP; | 643 | return -1; |
| 644 | } | ||
| 645 | #else | ||
| 646 | static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, | ||
| 647 | const bool recheck_after_trace) | ||
| 648 | { | ||
| 649 | BUG(); | ||
| 618 | } | 650 | } |
| 619 | #endif | 651 | #endif |
| 620 | 652 | ||
| 621 | /** | 653 | int __secure_computing(const struct seccomp_data *sd) |
| 622 | * seccomp_phase1() - run fast path seccomp checks on the current syscall | ||
| 623 | * @arg sd: The seccomp_data or NULL | ||
| 624 | * | ||
| 625 | * This only reads pt_regs via the syscall_xyz helpers. The only change | ||
| 626 | * it will make to pt_regs is via syscall_set_return_value, and it will | ||
| 627 | * only do that if it returns SECCOMP_PHASE1_SKIP. | ||
| 628 | * | ||
| 629 | * If sd is provided, it will not read pt_regs at all. | ||
| 630 | * | ||
| 631 | * It may also call do_exit or force a signal; these actions must be | ||
| 632 | * safe. | ||
| 633 | * | ||
| 634 | * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should | ||
| 635 | * be processed normally. | ||
| 636 | * | ||
| 637 | * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be | ||
| 638 | * invoked. In this case, seccomp_phase1 will have set the return value | ||
| 639 | * using syscall_set_return_value. | ||
| 640 | * | ||
| 641 | * If it returns anything else, then the return value should be passed | ||
| 642 | * to seccomp_phase2 from a context in which ptrace hooks are safe. | ||
| 643 | */ | ||
| 644 | u32 seccomp_phase1(struct seccomp_data *sd) | ||
| 645 | { | 654 | { |
| 646 | int mode = current->seccomp.mode; | 655 | int mode = current->seccomp.mode; |
| 647 | int this_syscall = sd ? sd->nr : | 656 | int this_syscall; |
| 648 | syscall_get_nr(current, task_pt_regs(current)); | ||
| 649 | 657 | ||
| 650 | if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && | 658 | if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && |
| 651 | unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) | 659 | unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) |
| 652 | return SECCOMP_PHASE1_OK; | 660 | return 0; |
| 661 | |||
| 662 | this_syscall = sd ? sd->nr : | ||
| 663 | syscall_get_nr(current, task_pt_regs(current)); | ||
| 653 | 664 | ||
| 654 | switch (mode) { | 665 | switch (mode) { |
| 655 | case SECCOMP_MODE_STRICT: | 666 | case SECCOMP_MODE_STRICT: |
| 656 | __secure_computing_strict(this_syscall); /* may call do_exit */ | 667 | __secure_computing_strict(this_syscall); /* may call do_exit */ |
| 657 | return SECCOMP_PHASE1_OK; | 668 | return 0; |
| 658 | #ifdef CONFIG_SECCOMP_FILTER | ||
| 659 | case SECCOMP_MODE_FILTER: | 669 | case SECCOMP_MODE_FILTER: |
| 660 | return __seccomp_phase1_filter(this_syscall, sd); | 670 | return __seccomp_filter(this_syscall, sd, false); |
| 661 | #endif | ||
| 662 | default: | 671 | default: |
| 663 | BUG(); | 672 | BUG(); |
| 664 | } | 673 | } |
| 665 | } | 674 | } |
| 666 | |||
| 667 | /** | ||
| 668 | * seccomp_phase2() - finish slow path seccomp work for the current syscall | ||
| 669 | * @phase1_result: The return value from seccomp_phase1() | ||
| 670 | * | ||
| 671 | * This must be called from a context in which ptrace hooks can be used. | ||
| 672 | * | ||
| 673 | * Returns 0 if the syscall should be processed or -1 to skip the syscall. | ||
| 674 | */ | ||
| 675 | int seccomp_phase2(u32 phase1_result) | ||
| 676 | { | ||
| 677 | struct pt_regs *regs = task_pt_regs(current); | ||
| 678 | u32 action = phase1_result & SECCOMP_RET_ACTION; | ||
| 679 | int data = phase1_result & SECCOMP_RET_DATA; | ||
| 680 | |||
| 681 | BUG_ON(action != SECCOMP_RET_TRACE); | ||
| 682 | |||
| 683 | audit_seccomp(syscall_get_nr(current, regs), 0, action); | ||
| 684 | |||
| 685 | /* Skip these calls if there is no tracer. */ | ||
| 686 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { | ||
| 687 | syscall_set_return_value(current, regs, | ||
| 688 | -ENOSYS, 0); | ||
| 689 | return -1; | ||
| 690 | } | ||
| 691 | |||
| 692 | /* Allow the BPF to provide the event message */ | ||
| 693 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | ||
| 694 | /* | ||
| 695 | * The delivery of a fatal signal during event | ||
| 696 | * notification may silently skip tracer notification. | ||
| 697 | * Terminating the task now avoids executing a system | ||
| 698 | * call that may not be intended. | ||
| 699 | */ | ||
| 700 | if (fatal_signal_pending(current)) | ||
| 701 | do_exit(SIGSYS); | ||
| 702 | if (syscall_get_nr(current, regs) < 0) | ||
| 703 | return -1; /* Explicit request to skip. */ | ||
| 704 | |||
| 705 | return 0; | ||
| 706 | } | ||
| 707 | #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ | 675 | #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ |
| 708 | 676 | ||
| 709 | long prctl_get_seccomp(void) | 677 | long prctl_get_seccomp(void) |
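Editor's note: once the phase1/phase2 split is gone, an architecture's syscall-entry path makes a single __secure_computing() call, optionally passing a pre-filled seccomp_data so pt_regs is not re-read. The fragment below is a rough sketch of such a caller, not taken from any real architecture; helper availability and exact signatures vary per arch, and the surrounding entry code is omitted.

        /* Illustrative pseudo-arch entry hook, assumes <asm/syscall.h> helpers. */
        static long arch_syscall_trace_enter(struct pt_regs *regs)
        {
                struct seccomp_data sd;
                unsigned long args[6];
                int i;

                sd.nr = syscall_get_nr(current, regs);
                sd.arch = syscall_get_arch();
                sd.instruction_pointer = instruction_pointer(regs);
                syscall_get_arguments(current, regs, 0, 6, args);
                for (i = 0; i < 6; i++)
                        sd.args[i] = args[i];

                if (__secure_computing(&sd) == -1)
                        return -1;      /* filter asked us to skip the syscall */

                /* ptrace and audit hooks would follow here */
                return sd.nr;
        }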
diff --git a/kernel/signal.c b/kernel/signal.c index 96e9bc40667f..af21afc00d08 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -2751,23 +2751,18 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) | |||
| 2751 | * @ts: upper bound on process time suspension | 2751 | * @ts: upper bound on process time suspension |
| 2752 | */ | 2752 | */ |
| 2753 | int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | 2753 | int do_sigtimedwait(const sigset_t *which, siginfo_t *info, |
| 2754 | const struct timespec *ts) | 2754 | const struct timespec *ts) |
| 2755 | { | 2755 | { |
| 2756 | ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX }; | ||
| 2756 | struct task_struct *tsk = current; | 2757 | struct task_struct *tsk = current; |
| 2757 | long timeout = MAX_SCHEDULE_TIMEOUT; | ||
| 2758 | sigset_t mask = *which; | 2758 | sigset_t mask = *which; |
| 2759 | int sig; | 2759 | int sig, ret = 0; |
| 2760 | 2760 | ||
| 2761 | if (ts) { | 2761 | if (ts) { |
| 2762 | if (!timespec_valid(ts)) | 2762 | if (!timespec_valid(ts)) |
| 2763 | return -EINVAL; | 2763 | return -EINVAL; |
| 2764 | timeout = timespec_to_jiffies(ts); | 2764 | timeout = timespec_to_ktime(*ts); |
| 2765 | /* | 2765 | to = &timeout; |
| 2766 | * We can be close to the next tick, add another one | ||
| 2767 | * to ensure we will wait at least the time asked for. | ||
| 2768 | */ | ||
| 2769 | if (ts->tv_sec || ts->tv_nsec) | ||
| 2770 | timeout++; | ||
| 2771 | } | 2766 | } |
| 2772 | 2767 | ||
| 2773 | /* | 2768 | /* |
| @@ -2778,7 +2773,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | |||
| 2778 | 2773 | ||
| 2779 | spin_lock_irq(&tsk->sighand->siglock); | 2774 | spin_lock_irq(&tsk->sighand->siglock); |
| 2780 | sig = dequeue_signal(tsk, &mask, info); | 2775 | sig = dequeue_signal(tsk, &mask, info); |
| 2781 | if (!sig && timeout) { | 2776 | if (!sig && timeout.tv64) { |
| 2782 | /* | 2777 | /* |
| 2783 | * None ready, temporarily unblock those we're interested | 2778 | * None ready, temporarily unblock those we're interested |
| 2784 | * while we are sleeping in so that we'll be awakened when | 2779 | * while we are sleeping in so that we'll be awakened when |
| @@ -2790,8 +2785,9 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | |||
| 2790 | recalc_sigpending(); | 2785 | recalc_sigpending(); |
| 2791 | spin_unlock_irq(&tsk->sighand->siglock); | 2786 | spin_unlock_irq(&tsk->sighand->siglock); |
| 2792 | 2787 | ||
| 2793 | timeout = freezable_schedule_timeout_interruptible(timeout); | 2788 | __set_current_state(TASK_INTERRUPTIBLE); |
| 2794 | 2789 | ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns, | |
| 2790 | HRTIMER_MODE_REL); | ||
| 2795 | spin_lock_irq(&tsk->sighand->siglock); | 2791 | spin_lock_irq(&tsk->sighand->siglock); |
| 2796 | __set_task_blocked(tsk, &tsk->real_blocked); | 2792 | __set_task_blocked(tsk, &tsk->real_blocked); |
| 2797 | sigemptyset(&tsk->real_blocked); | 2793 | sigemptyset(&tsk->real_blocked); |
| @@ -2801,7 +2797,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | |||
| 2801 | 2797 | ||
| 2802 | if (sig) | 2798 | if (sig) |
| 2803 | return sig; | 2799 | return sig; |
| 2804 | return timeout ? -EINTR : -EAGAIN; | 2800 | return ret ? -EINTR : -EAGAIN; |
| 2805 | } | 2801 | } |
| 2806 | 2802 | ||
| 2807 | /** | 2803 | /** |
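Editor's note: the removed comment explains why the old jiffies path added one tick: even after rounding up to whole ticks, the current tick may be nearly over, so an extra jiffy was needed as a safety margin. The hrtimer path keeps nanosecond resolution and needs no such margin. A small userspace calculation of the difference, with the HZ value assumed purely for illustration:

        #include <stdio.h>

        #define HZ 250                          /* assumed tick rate */
        #define NSEC_PER_SEC 1000000000ULL

        int main(void)
        {
                unsigned long long req_ns = 6000000ULL;          /* 6 ms requested */
                unsigned long long tick_ns = NSEC_PER_SEC / HZ;  /* 4 ms per tick */

                /* jiffies path: round up to ticks, then add one for the margin */
                unsigned long long jiffies = (req_ns + tick_ns - 1) / tick_ns + 1;

                printf("jiffies wait: %llu ns (requested %llu ns)\n",
                       jiffies * tick_ns, req_ns);
                printf("hrtimer wait: %llu ns\n", req_ns);
                return 0;
        }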
diff --git a/kernel/smp.c b/kernel/smp.c index 74165443c240..3aa642d39c03 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -33,69 +33,54 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); | |||
| 33 | 33 | ||
| 34 | static void flush_smp_call_function_queue(bool warn_cpu_offline); | 34 | static void flush_smp_call_function_queue(bool warn_cpu_offline); |
| 35 | 35 | ||
| 36 | static int | 36 | int smpcfd_prepare_cpu(unsigned int cpu) |
| 37 | hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
| 38 | { | 37 | { |
| 39 | long cpu = (long)hcpu; | ||
| 40 | struct call_function_data *cfd = &per_cpu(cfd_data, cpu); | 38 | struct call_function_data *cfd = &per_cpu(cfd_data, cpu); |
| 41 | 39 | ||
| 42 | switch (action) { | 40 | if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, |
| 43 | case CPU_UP_PREPARE: | 41 | cpu_to_node(cpu))) |
| 44 | case CPU_UP_PREPARE_FROZEN: | 42 | return -ENOMEM; |
| 45 | if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, | 43 | cfd->csd = alloc_percpu(struct call_single_data); |
| 46 | cpu_to_node(cpu))) | 44 | if (!cfd->csd) { |
| 47 | return notifier_from_errno(-ENOMEM); | ||
| 48 | cfd->csd = alloc_percpu(struct call_single_data); | ||
| 49 | if (!cfd->csd) { | ||
| 50 | free_cpumask_var(cfd->cpumask); | ||
| 51 | return notifier_from_errno(-ENOMEM); | ||
| 52 | } | ||
| 53 | break; | ||
| 54 | |||
| 55 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 56 | case CPU_UP_CANCELED: | ||
| 57 | case CPU_UP_CANCELED_FROZEN: | ||
| 58 | /* Fall-through to the CPU_DEAD[_FROZEN] case. */ | ||
| 59 | |||
| 60 | case CPU_DEAD: | ||
| 61 | case CPU_DEAD_FROZEN: | ||
| 62 | free_cpumask_var(cfd->cpumask); | 45 | free_cpumask_var(cfd->cpumask); |
| 63 | free_percpu(cfd->csd); | 46 | return -ENOMEM; |
| 64 | break; | 47 | } |
| 65 | 48 | ||
| 66 | case CPU_DYING: | 49 | return 0; |
| 67 | case CPU_DYING_FROZEN: | 50 | } |
| 68 | /* | 51 | |
| 69 | * The IPIs for the smp-call-function callbacks queued by other | 52 | int smpcfd_dead_cpu(unsigned int cpu) |
| 70 | * CPUs might arrive late, either due to hardware latencies or | 53 | { |
| 71 | * because this CPU disabled interrupts (inside stop-machine) | 54 | struct call_function_data *cfd = &per_cpu(cfd_data, cpu); |
| 72 | * before the IPIs were sent. So flush out any pending callbacks | ||
| 73 | * explicitly (without waiting for the IPIs to arrive), to | ||
| 74 | * ensure that the outgoing CPU doesn't go offline with work | ||
| 75 | * still pending. | ||
| 76 | */ | ||
| 77 | flush_smp_call_function_queue(false); | ||
| 78 | break; | ||
| 79 | #endif | ||
| 80 | }; | ||
| 81 | 55 | ||
| 82 | return NOTIFY_OK; | 56 | free_cpumask_var(cfd->cpumask); |
| 57 | free_percpu(cfd->csd); | ||
| 58 | return 0; | ||
| 83 | } | 59 | } |
| 84 | 60 | ||
| 85 | static struct notifier_block hotplug_cfd_notifier = { | 61 | int smpcfd_dying_cpu(unsigned int cpu) |
| 86 | .notifier_call = hotplug_cfd, | 62 | { |
| 87 | }; | 63 | /* |
| 64 | * The IPIs for the smp-call-function callbacks queued by other | ||
| 65 | * CPUs might arrive late, either due to hardware latencies or | ||
| 66 | * because this CPU disabled interrupts (inside stop-machine) | ||
| 67 | * before the IPIs were sent. So flush out any pending callbacks | ||
| 68 | * explicitly (without waiting for the IPIs to arrive), to | ||
| 69 | * ensure that the outgoing CPU doesn't go offline with work | ||
| 70 | * still pending. | ||
| 71 | */ | ||
| 72 | flush_smp_call_function_queue(false); | ||
| 73 | return 0; | ||
| 74 | } | ||
| 88 | 75 | ||
| 89 | void __init call_function_init(void) | 76 | void __init call_function_init(void) |
| 90 | { | 77 | { |
| 91 | void *cpu = (void *)(long)smp_processor_id(); | ||
| 92 | int i; | 78 | int i; |
| 93 | 79 | ||
| 94 | for_each_possible_cpu(i) | 80 | for_each_possible_cpu(i) |
| 95 | init_llist_head(&per_cpu(call_single_queue, i)); | 81 | init_llist_head(&per_cpu(call_single_queue, i)); |
| 96 | 82 | ||
| 97 | hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); | 83 | smpcfd_prepare_cpu(smp_processor_id()); |
| 98 | register_cpu_notifier(&hotplug_cfd_notifier); | ||
| 99 | } | 84 | } |
| 100 | 85 | ||
| 101 | /* | 86 | /* |
| @@ -107,7 +92,7 @@ void __init call_function_init(void) | |||
| 107 | */ | 92 | */ |
| 108 | static __always_inline void csd_lock_wait(struct call_single_data *csd) | 93 | static __always_inline void csd_lock_wait(struct call_single_data *csd) |
| 109 | { | 94 | { |
| 110 | smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); | 95 | smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); |
| 111 | } | 96 | } |
| 112 | 97 | ||
| 113 | static __always_inline void csd_lock(struct call_single_data *csd) | 98 | static __always_inline void csd_lock(struct call_single_data *csd) |
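The kernel/smp.c hunks above replace the hotplug_cfd() notifier with the new CPU hotplug state machine callbacks smpcfd_prepare_cpu(), smpcfd_dead_cpu() and smpcfd_dying_cpu(), which take a bare CPU number and return 0 or a negative errno instead of notifier_from_errno()/NOTIFY_OK codes. As a rough illustration of that callback convention only (not code from this series; my_online(), my_offline() and the state name are invented), a driver could hook an equivalent pair of callbacks into the state machine like this:

    #include <linux/cpu.h>
    #include <linux/cpuhotplug.h>

    /* Hypothetical per-CPU setup/teardown pair in the new style:
     * int (*fn)(unsigned int cpu), returning 0 or -errno. */
    static int my_online(unsigned int cpu)
    {
            /* allocate or enable per-CPU resources for @cpu here */
            return 0;
    }

    static int my_offline(unsigned int cpu)
    {
            /* release the per-CPU resources of @cpu here */
            return 0;
    }

    static int __init my_init(void)
    {
            int ret;

            /* Dynamic online state: my_online() runs on every CPU that comes
             * up, my_offline() when a CPU goes down. For CPUHP_AP_ONLINE_DYN
             * a positive return value is the allocated state number. */
            ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mydrv:online",
                                    my_online, my_offline);
            return ret < 0 ? ret : 0;
    }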
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index a467e6c28a3b..4a1ca5f6da7e 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/smpboot.h> | 21 | #include <linux/smpboot.h> |
| 22 | #include <linux/atomic.h> | 22 | #include <linux/atomic.h> |
| 23 | #include <linux/lglock.h> | 23 | #include <linux/lglock.h> |
| 24 | #include <linux/nmi.h> | ||
| 24 | 25 | ||
| 25 | /* | 26 | /* |
| 26 | * Structure to determine completion condition and record errors. May | 27 | * Structure to determine completion condition and record errors. May |
| @@ -209,6 +210,13 @@ static int multi_cpu_stop(void *data) | |||
| 209 | break; | 210 | break; |
| 210 | } | 211 | } |
| 211 | ack_state(msdata); | 212 | ack_state(msdata); |
| 213 | } else if (curstate > MULTI_STOP_PREPARE) { | ||
| 214 | /* | ||
| 215 | * At this stage all other CPUs we depend on must spin | ||
| 216 | * in the same loop. Any reason for hard-lockup should | ||
| 217 | * be detected and reported on their side. | ||
| 218 | */ | ||
| 219 | touch_nmi_watchdog(); | ||
| 212 | } | 220 | } |
| 213 | } while (curstate != MULTI_STOP_EXIT); | 221 | } while (curstate != MULTI_STOP_EXIT); |
| 214 | 222 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87b2fc38398b..53954631a4e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -1205,6 +1205,17 @@ static struct ctl_table kern_table[] = { | |||
| 1205 | .extra2 = &one, | 1205 | .extra2 = &one, |
| 1206 | }, | 1206 | }, |
| 1207 | #endif | 1207 | #endif |
| 1208 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) | ||
| 1209 | { | ||
| 1210 | .procname = "panic_on_rcu_stall", | ||
| 1211 | .data = &sysctl_panic_on_rcu_stall, | ||
| 1212 | .maxlen = sizeof(sysctl_panic_on_rcu_stall), | ||
| 1213 | .mode = 0644, | ||
| 1214 | .proc_handler = proc_dointvec_minmax, | ||
| 1215 | .extra1 = &zero, | ||
| 1216 | .extra2 = &one, | ||
| 1217 | }, | ||
| 1218 | #endif | ||
| 1208 | { } | 1219 | { } |
| 1209 | }; | 1220 | }; |
| 1210 | 1221 | ||
| @@ -1497,8 +1508,8 @@ static struct ctl_table vm_table[] = { | |||
| 1497 | #ifdef CONFIG_NUMA | 1508 | #ifdef CONFIG_NUMA |
| 1498 | { | 1509 | { |
| 1499 | .procname = "zone_reclaim_mode", | 1510 | .procname = "zone_reclaim_mode", |
| 1500 | .data = &zone_reclaim_mode, | 1511 | .data = &node_reclaim_mode, |
| 1501 | .maxlen = sizeof(zone_reclaim_mode), | 1512 | .maxlen = sizeof(node_reclaim_mode), |
| 1502 | .mode = 0644, | 1513 | .mode = 0644, |
| 1503 | .proc_handler = proc_dointvec, | 1514 | .proc_handler = proc_dointvec, |
| 1504 | .extra1 = &zero, | 1515 | .extra1 = &zero, |
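The kern_table addition above exposes sysctl_panic_on_rcu_stall as a 0/1 knob: proc_dointvec_minmax() with the shared zero/one bounds, visible as /proc/sys/kernel/panic_on_rcu_stall. A minimal sketch of the same ctl_table pattern for a hypothetical, separately registered knob (my_knob, the table and the "kernel" path below are illustrative, not part of this patch):

    #include <linux/errno.h>
    #include <linux/sysctl.h>

    static int my_knob;
    static int zero;
    static int one = 1;

    /* Same shape as the panic_on_rcu_stall entry: an int clamped to 0..1. */
    static struct ctl_table my_table[] = {
            {
                    .procname     = "my_knob",
                    .data         = &my_knob,
                    .maxlen       = sizeof(my_knob),
                    .mode         = 0644,
                    .proc_handler = proc_dointvec_minmax,
                    .extra1       = &zero,
                    .extra2       = &one,
            },
            { }
    };

    static struct ctl_table_header *my_header;

    static int __init my_sysctl_init(void)
    {
            my_header = register_sysctl("kernel", my_table);
            return my_header ? 0 : -ENOMEM;
    }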
diff --git a/kernel/task_work.c b/kernel/task_work.c index 53fa971d000d..6ab4842b00e8 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c | |||
| @@ -108,7 +108,6 @@ void task_work_run(void) | |||
| 108 | * fail, but it can play with *work and other entries. | 108 | * fail, but it can play with *work and other entries. |
| 109 | */ | 109 | */ |
| 110 | raw_spin_unlock_wait(&task->pi_lock); | 110 | raw_spin_unlock_wait(&task->pi_lock); |
| 111 | smp_mb(); | ||
| 112 | 111 | ||
| 113 | do { | 112 | do { |
| 114 | next = work->next; | 113 | next = work->next; |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index e840ed867a5d..c3aad685bbc0 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
| @@ -30,7 +30,6 @@ | |||
| 30 | * struct alarm_base - Alarm timer bases | 30 | * struct alarm_base - Alarm timer bases |
| 31 | * @lock: Lock for synchronized access to the base | 31 | * @lock: Lock for synchronized access to the base |
| 32 | * @timerqueue: Timerqueue head managing the list of events | 32 | * @timerqueue: Timerqueue head managing the list of events |
| 33 | * @timer: hrtimer used to schedule events while running | ||
| 34 | * @gettime: Function to read the time correlating to the base | 33 | * @gettime: Function to read the time correlating to the base |
| 35 | * @base_clockid: clockid for the base | 34 | * @base_clockid: clockid for the base |
| 36 | */ | 35 | */ |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index a9b76a40319e..2c5bc77c0bb0 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
| @@ -645,7 +645,7 @@ void tick_cleanup_dead_cpu(int cpu) | |||
| 645 | #endif | 645 | #endif |
| 646 | 646 | ||
| 647 | #ifdef CONFIG_SYSFS | 647 | #ifdef CONFIG_SYSFS |
| 648 | struct bus_type clockevents_subsys = { | 648 | static struct bus_type clockevents_subsys = { |
| 649 | .name = "clockevents", | 649 | .name = "clockevents", |
| 650 | .dev_name = "clockevent", | 650 | .dev_name = "clockevent", |
| 651 | }; | 651 | }; |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 56ece145a814..6a5a310a1a53 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -669,10 +669,12 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
| 669 | struct list_head *entry = &clocksource_list; | 669 | struct list_head *entry = &clocksource_list; |
| 670 | struct clocksource *tmp; | 670 | struct clocksource *tmp; |
| 671 | 671 | ||
| 672 | list_for_each_entry(tmp, &clocksource_list, list) | 672 | list_for_each_entry(tmp, &clocksource_list, list) { |
| 673 | /* Keep track of the place, where to insert */ | 673 | /* Keep track of the place, where to insert */ |
| 674 | if (tmp->rating >= cs->rating) | 674 | if (tmp->rating < cs->rating) |
| 675 | entry = &tmp->list; | 675 | break; |
| 676 | entry = &tmp->list; | ||
| 677 | } | ||
| 676 | list_add(&cs->list, entry); | 678 | list_add(&cs->list, entry); |
| 677 | } | 679 | } |
| 678 | 680 | ||
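The clocksource_enqueue() change above keeps clocksource_list sorted by descending rating and now breaks out of the scan as soon as an entry with a lower rating is found; a newly registered clocksource with the same rating as an existing one still lands behind it. A standalone userspace sketch of that insertion order (names and ratings are invented for illustration):

    #include <stdio.h>

    struct cs {
            const char *name;
            int rating;
            struct cs *next;
    };

    /* Insert behind every entry whose rating is >= cs->rating, i.e. stop at
     * the first entry with a strictly lower rating. */
    static void enqueue(struct cs **head, struct cs *cs)
    {
            struct cs **pos = head;

            while (*pos && (*pos)->rating >= cs->rating)
                    pos = &(*pos)->next;
            cs->next = *pos;
            *pos = cs;
    }

    int main(void)
    {
            struct cs a = { "hpet", 250 }, b = { "tsc", 300 };
            struct cs c = { "acpi_pm", 200 }, d = { "tsc-early", 300 };
            struct cs *head = NULL, *p;

            enqueue(&head, &a);
            enqueue(&head, &b);
            enqueue(&head, &c);
            enqueue(&head, &d);     /* same rating as "tsc": goes after it */

            for (p = head; p; p = p->next)
                    printf("%s (%d)\n", p->name, p->rating);
            return 0;
    }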
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e99df0ff1d42..9ba7c820fc23 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
| @@ -177,7 +177,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) | |||
| 177 | #endif | 177 | #endif |
| 178 | } | 178 | } |
| 179 | 179 | ||
| 180 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) | 180 | #ifdef CONFIG_NO_HZ_COMMON |
| 181 | static inline | 181 | static inline |
| 182 | struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, | 182 | struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, |
| 183 | int pinned) | 183 | int pinned) |
| @@ -1590,7 +1590,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, | |||
| 1590 | /* | 1590 | /* |
| 1591 | * Functions related to boot-time initialization: | 1591 | * Functions related to boot-time initialization: |
| 1592 | */ | 1592 | */ |
| 1593 | static void init_hrtimers_cpu(int cpu) | 1593 | int hrtimers_prepare_cpu(unsigned int cpu) |
| 1594 | { | 1594 | { |
| 1595 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); | 1595 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); |
| 1596 | int i; | 1596 | int i; |
| @@ -1602,6 +1602,7 @@ static void init_hrtimers_cpu(int cpu) | |||
| 1602 | 1602 | ||
| 1603 | cpu_base->cpu = cpu; | 1603 | cpu_base->cpu = cpu; |
| 1604 | hrtimer_init_hres(cpu_base); | 1604 | hrtimer_init_hres(cpu_base); |
| 1605 | return 0; | ||
| 1605 | } | 1606 | } |
| 1606 | 1607 | ||
| 1607 | #ifdef CONFIG_HOTPLUG_CPU | 1608 | #ifdef CONFIG_HOTPLUG_CPU |
| @@ -1636,7 +1637,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, | |||
| 1636 | } | 1637 | } |
| 1637 | } | 1638 | } |
| 1638 | 1639 | ||
| 1639 | static void migrate_hrtimers(int scpu) | 1640 | int hrtimers_dead_cpu(unsigned int scpu) |
| 1640 | { | 1641 | { |
| 1641 | struct hrtimer_cpu_base *old_base, *new_base; | 1642 | struct hrtimer_cpu_base *old_base, *new_base; |
| 1642 | int i; | 1643 | int i; |
| @@ -1665,45 +1666,14 @@ static void migrate_hrtimers(int scpu) | |||
| 1665 | /* Check, if we got expired work to do */ | 1666 | /* Check, if we got expired work to do */ |
| 1666 | __hrtimer_peek_ahead_timers(); | 1667 | __hrtimer_peek_ahead_timers(); |
| 1667 | local_irq_enable(); | 1668 | local_irq_enable(); |
| 1669 | return 0; | ||
| 1668 | } | 1670 | } |
| 1669 | 1671 | ||
| 1670 | #endif /* CONFIG_HOTPLUG_CPU */ | 1672 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 1671 | 1673 | ||
| 1672 | static int hrtimer_cpu_notify(struct notifier_block *self, | ||
| 1673 | unsigned long action, void *hcpu) | ||
| 1674 | { | ||
| 1675 | int scpu = (long)hcpu; | ||
| 1676 | |||
| 1677 | switch (action) { | ||
| 1678 | |||
| 1679 | case CPU_UP_PREPARE: | ||
| 1680 | case CPU_UP_PREPARE_FROZEN: | ||
| 1681 | init_hrtimers_cpu(scpu); | ||
| 1682 | break; | ||
| 1683 | |||
| 1684 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1685 | case CPU_DEAD: | ||
| 1686 | case CPU_DEAD_FROZEN: | ||
| 1687 | migrate_hrtimers(scpu); | ||
| 1688 | break; | ||
| 1689 | #endif | ||
| 1690 | |||
| 1691 | default: | ||
| 1692 | break; | ||
| 1693 | } | ||
| 1694 | |||
| 1695 | return NOTIFY_OK; | ||
| 1696 | } | ||
| 1697 | |||
| 1698 | static struct notifier_block hrtimers_nb = { | ||
| 1699 | .notifier_call = hrtimer_cpu_notify, | ||
| 1700 | }; | ||
| 1701 | |||
| 1702 | void __init hrtimers_init(void) | 1674 | void __init hrtimers_init(void) |
| 1703 | { | 1675 | { |
| 1704 | hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, | 1676 | hrtimers_prepare_cpu(smp_processor_id()); |
| 1705 | (void *)(long)smp_processor_id()); | ||
| 1706 | register_cpu_notifier(&hrtimers_nb); | ||
| 1707 | } | 1677 | } |
| 1708 | 1678 | ||
| 1709 | /** | 1679 | /** |
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index e622ba365a13..b0928ab3270f 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c | |||
| @@ -43,13 +43,13 @@ static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters) | |||
| 43 | int allowed_error_ns = usecs * 5; | 43 | int allowed_error_ns = usecs * 5; |
| 44 | 44 | ||
| 45 | for (i = 0; i < iters; ++i) { | 45 | for (i = 0; i < iters; ++i) { |
| 46 | struct timespec ts1, ts2; | 46 | s64 kt1, kt2; |
| 47 | int time_passed; | 47 | int time_passed; |
| 48 | 48 | ||
| 49 | ktime_get_ts(&ts1); | 49 | kt1 = ktime_get_ns(); |
| 50 | udelay(usecs); | 50 | udelay(usecs); |
| 51 | ktime_get_ts(&ts2); | 51 | kt2 = ktime_get_ns(); |
| 52 | time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); | 52 | time_passed = kt2 - kt1; |
| 53 | 53 | ||
| 54 | if (i == 0 || time_passed < min) | 54 | if (i == 0 || time_passed < min) |
| 55 | min = time_passed; | 55 | min = time_passed; |
| @@ -87,11 +87,11 @@ static int udelay_test_show(struct seq_file *s, void *v) | |||
| 87 | if (usecs > 0 && iters > 0) { | 87 | if (usecs > 0 && iters > 0) { |
| 88 | return udelay_test_single(s, usecs, iters); | 88 | return udelay_test_single(s, usecs, iters); |
| 89 | } else if (usecs == 0) { | 89 | } else if (usecs == 0) { |
| 90 | struct timespec ts; | 90 | struct timespec64 ts; |
| 91 | 91 | ||
| 92 | ktime_get_ts(&ts); | 92 | ktime_get_ts64(&ts); |
| 93 | seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", | 93 | seq_printf(s, "udelay() test (lpj=%ld kt=%lld.%09ld)\n", |
| 94 | loops_per_jiffy, ts.tv_sec, ts.tv_nsec); | 94 | loops_per_jiffy, (s64)ts.tv_sec, ts.tv_nsec); |
| 95 | seq_puts(s, "usage:\n"); | 95 | seq_puts(s, "usage:\n"); |
| 96 | seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); | 96 | seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); |
| 97 | seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); | 97 | seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); |
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 53d7184da0be..690b797f522e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c | |||
| @@ -75,6 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | |||
| 75 | } | 75 | } |
| 76 | 76 | ||
| 77 | static struct clock_event_device ce_broadcast_hrtimer = { | 77 | static struct clock_event_device ce_broadcast_hrtimer = { |
| 78 | .name = "bc_hrtimer", | ||
| 78 | .set_state_shutdown = bc_shutdown, | 79 | .set_state_shutdown = bc_shutdown, |
| 79 | .set_next_ktime = bc_set_next, | 80 | .set_next_ktime = bc_set_next, |
| 80 | .features = CLOCK_EVT_FEAT_ONESHOT | | 81 | .features = CLOCK_EVT_FEAT_ONESHOT | |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 966a5a6fdd0a..f738251000fe 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
| @@ -164,3 +164,4 @@ static inline void timers_update_migration(bool update_nohz) { } | |||
| 164 | DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); | 164 | DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); |
| 165 | 165 | ||
| 166 | extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); | 166 | extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); |
| 167 | void timer_clear_idle(void); | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 536ada80f6dd..204fdc86863d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -31,7 +31,7 @@ | |||
| 31 | #include <trace/events/timer.h> | 31 | #include <trace/events/timer.h> |
| 32 | 32 | ||
| 33 | /* | 33 | /* |
| 34 | * Per cpu nohz control structure | 34 | * Per-CPU nohz control structure |
| 35 | */ | 35 | */ |
| 36 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | 36 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); |
| 37 | 37 | ||
| @@ -61,7 +61,7 @@ static void tick_do_update_jiffies64(ktime_t now) | |||
| 61 | if (delta.tv64 < tick_period.tv64) | 61 | if (delta.tv64 < tick_period.tv64) |
| 62 | return; | 62 | return; |
| 63 | 63 | ||
| 64 | /* Reevalute with jiffies_lock held */ | 64 | /* Reevaluate with jiffies_lock held */ |
| 65 | write_seqlock(&jiffies_lock); | 65 | write_seqlock(&jiffies_lock); |
| 66 | 66 | ||
| 67 | delta = ktime_sub(now, last_jiffies_update); | 67 | delta = ktime_sub(now, last_jiffies_update); |
| @@ -116,8 +116,8 @@ static void tick_sched_do_timer(ktime_t now) | |||
| 116 | #ifdef CONFIG_NO_HZ_COMMON | 116 | #ifdef CONFIG_NO_HZ_COMMON |
| 117 | /* | 117 | /* |
| 118 | * Check if the do_timer duty was dropped. We don't care about | 118 | * Check if the do_timer duty was dropped. We don't care about |
| 119 | * concurrency: This happens only when the cpu in charge went | 119 | * concurrency: This happens only when the CPU in charge went |
| 120 | * into a long sleep. If two cpus happen to assign themself to | 120 | * into a long sleep. If two CPUs happen to assign themselves to |
| 121 | * this duty, then the jiffies update is still serialized by | 121 | * this duty, then the jiffies update is still serialized by |
| 122 | * jiffies_lock. | 122 | * jiffies_lock. |
| 123 | */ | 123 | */ |
| @@ -349,7 +349,7 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi | |||
| 349 | /* | 349 | /* |
| 350 | * Re-evaluate the need for the tick as we switch the current task. | 350 | * Re-evaluate the need for the tick as we switch the current task. |
| 351 | * It might need the tick due to per task/process properties: | 351 | * It might need the tick due to per task/process properties: |
| 352 | * perf events, posix cpu timers, ... | 352 | * perf events, posix CPU timers, ... |
| 353 | */ | 353 | */ |
| 354 | void __tick_nohz_task_switch(void) | 354 | void __tick_nohz_task_switch(void) |
| 355 | { | 355 | { |
| @@ -509,8 +509,8 @@ int tick_nohz_tick_stopped(void) | |||
| 509 | * | 509 | * |
| 510 | * In case the sched_tick was stopped on this CPU, we have to check if jiffies | 510 | * In case the sched_tick was stopped on this CPU, we have to check if jiffies |
| 511 | * must be updated. Otherwise an interrupt handler could use a stale jiffy | 511 | * must be updated. Otherwise an interrupt handler could use a stale jiffy |
| 512 | * value. We do this unconditionally on any cpu, as we don't know whether the | 512 | * value. We do this unconditionally on any CPU, as we don't know whether the |
| 513 | * cpu, which has the update task assigned is in a long sleep. | 513 | * CPU, which has the update task assigned is in a long sleep. |
| 514 | */ | 514 | */ |
| 515 | static void tick_nohz_update_jiffies(ktime_t now) | 515 | static void tick_nohz_update_jiffies(ktime_t now) |
| 516 | { | 516 | { |
| @@ -526,7 +526,7 @@ static void tick_nohz_update_jiffies(ktime_t now) | |||
| 526 | } | 526 | } |
| 527 | 527 | ||
| 528 | /* | 528 | /* |
| 529 | * Updates the per cpu time idle statistics counters | 529 | * Updates the per-CPU time idle statistics counters |
| 530 | */ | 530 | */ |
| 531 | static void | 531 | static void |
| 532 | update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) | 532 | update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) |
| @@ -566,12 +566,12 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) | |||
| 566 | } | 566 | } |
| 567 | 567 | ||
| 568 | /** | 568 | /** |
| 569 | * get_cpu_idle_time_us - get the total idle time of a cpu | 569 | * get_cpu_idle_time_us - get the total idle time of a CPU |
| 570 | * @cpu: CPU number to query | 570 | * @cpu: CPU number to query |
| 571 | * @last_update_time: variable to store update time in. Do not update | 571 | * @last_update_time: variable to store update time in. Do not update |
| 572 | * counters if NULL. | 572 | * counters if NULL. |
| 573 | * | 573 | * |
| 574 | * Return the cummulative idle time (since boot) for a given | 574 | * Return the cumulative idle time (since boot) for a given |
| 575 | * CPU, in microseconds. | 575 | * CPU, in microseconds. |
| 576 | * | 576 | * |
| 577 | * This time is measured via accounting rather than sampling, | 577 | * This time is measured via accounting rather than sampling, |
| @@ -607,12 +607,12 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) | |||
| 607 | EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); | 607 | EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); |
| 608 | 608 | ||
| 609 | /** | 609 | /** |
| 610 | * get_cpu_iowait_time_us - get the total iowait time of a cpu | 610 | * get_cpu_iowait_time_us - get the total iowait time of a CPU |
| 611 | * @cpu: CPU number to query | 611 | * @cpu: CPU number to query |
| 612 | * @last_update_time: variable to store update time in. Do not update | 612 | * @last_update_time: variable to store update time in. Do not update |
| 613 | * counters if NULL. | 613 | * counters if NULL. |
| 614 | * | 614 | * |
| 615 | * Return the cummulative iowait time (since boot) for a given | 615 | * Return the cumulative iowait time (since boot) for a given |
| 616 | * CPU, in microseconds. | 616 | * CPU, in microseconds. |
| 617 | * | 617 | * |
| 618 | * This time is measured via accounting rather than sampling, | 618 | * This time is measured via accounting rather than sampling, |
| @@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 700 | delta = next_tick - basemono; | 700 | delta = next_tick - basemono; |
| 701 | if (delta <= (u64)TICK_NSEC) { | 701 | if (delta <= (u64)TICK_NSEC) { |
| 702 | tick.tv64 = 0; | 702 | tick.tv64 = 0; |
| 703 | |||
| 704 | /* | ||
| 705 | * Tell the timer code that the base is not idle, i.e. undo | ||
| 706 | * the effect of get_next_timer_interrupt(): | ||
| 707 | */ | ||
| 708 | timer_clear_idle(); | ||
| 703 | /* | 709 | /* |
| 704 | * We've not stopped the tick yet, and there's a timer in the | 710 | * We've not stopped the tick yet, and there's a timer in the |
| 705 | * next period, so no point in stopping it either, bail. | 711 | * next period, so no point in stopping it either, bail. |
| @@ -726,14 +732,14 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 726 | } | 732 | } |
| 727 | 733 | ||
| 728 | /* | 734 | /* |
| 729 | * If this cpu is the one which updates jiffies, then give up | 735 | * If this CPU is the one which updates jiffies, then give up |
| 730 | * the assignment and let it be taken by the cpu which runs | 736 | * the assignment and let it be taken by the CPU which runs |
| 731 | * the tick timer next, which might be this cpu as well. If we | 737 | * the tick timer next, which might be this CPU as well. If we |
| 732 | * don't drop this here the jiffies might be stale and | 738 | * don't drop this here the jiffies might be stale and |
| 733 | * do_timer() never invoked. Keep track of the fact that it | 739 | * do_timer() never invoked. Keep track of the fact that it |
| 734 | * was the one which had the do_timer() duty last. If this cpu | 740 | * was the one which had the do_timer() duty last. If this CPU |
| 735 | * is the one which had the do_timer() duty last, we limit the | 741 | * is the one which had the do_timer() duty last, we limit the |
| 736 | * sleep time to the timekeeping max_deferement value. | 742 | * sleep time to the timekeeping max_deferment value. |
| 737 | * Otherwise we can sleep as long as we want. | 743 | * Otherwise we can sleep as long as we want. |
| 738 | */ | 744 | */ |
| 739 | delta = timekeeping_max_deferment(); | 745 | delta = timekeeping_max_deferment(); |
| @@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | |||
| 809 | tick_do_update_jiffies64(now); | 815 | tick_do_update_jiffies64(now); |
| 810 | cpu_load_update_nohz_stop(); | 816 | cpu_load_update_nohz_stop(); |
| 811 | 817 | ||
| 818 | /* | ||
| 819 | * Clear the timer idle flag, so we avoid IPIs on remote queueing and | ||
| 820 | * the clock forward checks in the enqueue path: | ||
| 821 | */ | ||
| 822 | timer_clear_idle(); | ||
| 823 | |||
| 812 | calc_load_exit_idle(); | 824 | calc_load_exit_idle(); |
| 813 | touch_softlockup_watchdog_sched(); | 825 | touch_softlockup_watchdog_sched(); |
| 814 | /* | 826 | /* |
| @@ -841,9 +853,9 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) | |||
| 841 | static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | 853 | static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) |
| 842 | { | 854 | { |
| 843 | /* | 855 | /* |
| 844 | * If this cpu is offline and it is the one which updates | 856 | * If this CPU is offline and it is the one which updates |
| 845 | * jiffies, then give up the assignment and let it be taken by | 857 | * jiffies, then give up the assignment and let it be taken by |
| 846 | * the cpu which runs the tick timer next. If we don't drop | 858 | * the CPU which runs the tick timer next. If we don't drop |
| 847 | * this here the jiffies might be stale and do_timer() never | 859 | * this here the jiffies might be stale and do_timer() never |
| 848 | * invoked. | 860 | * invoked. |
| 849 | */ | 861 | */ |
| @@ -896,11 +908,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) | |||
| 896 | ktime_t now, expires; | 908 | ktime_t now, expires; |
| 897 | int cpu = smp_processor_id(); | 909 | int cpu = smp_processor_id(); |
| 898 | 910 | ||
| 899 | now = tick_nohz_start_idle(ts); | ||
| 900 | |||
| 901 | if (can_stop_idle_tick(cpu, ts)) { | 911 | if (can_stop_idle_tick(cpu, ts)) { |
| 902 | int was_stopped = ts->tick_stopped; | 912 | int was_stopped = ts->tick_stopped; |
| 903 | 913 | ||
| 914 | now = tick_nohz_start_idle(ts); | ||
| 904 | ts->idle_calls++; | 915 | ts->idle_calls++; |
| 905 | 916 | ||
| 906 | expires = tick_nohz_stop_sched_tick(ts, now, cpu); | 917 | expires = tick_nohz_stop_sched_tick(ts, now, cpu); |
| @@ -933,11 +944,11 @@ void tick_nohz_idle_enter(void) | |||
| 933 | WARN_ON_ONCE(irqs_disabled()); | 944 | WARN_ON_ONCE(irqs_disabled()); |
| 934 | 945 | ||
| 935 | /* | 946 | /* |
| 936 | * Update the idle state in the scheduler domain hierarchy | 947 | * Update the idle state in the scheduler domain hierarchy |
| 937 | * when tick_nohz_stop_sched_tick() is called from the idle loop. | 948 | * when tick_nohz_stop_sched_tick() is called from the idle loop. |
| 938 | * State will be updated to busy during the first busy tick after | 949 | * State will be updated to busy during the first busy tick after |
| 939 | * exiting idle. | 950 | * exiting idle. |
| 940 | */ | 951 | */ |
| 941 | set_cpu_sd_state_idle(); | 952 | set_cpu_sd_state_idle(); |
| 942 | 953 | ||
| 943 | local_irq_disable(); | 954 | local_irq_disable(); |
| @@ -1092,35 +1103,6 @@ static void tick_nohz_switch_to_nohz(void) | |||
| 1092 | tick_nohz_activate(ts, NOHZ_MODE_LOWRES); | 1103 | tick_nohz_activate(ts, NOHZ_MODE_LOWRES); |
| 1093 | } | 1104 | } |
| 1094 | 1105 | ||
| 1095 | /* | ||
| 1096 | * When NOHZ is enabled and the tick is stopped, we need to kick the | ||
| 1097 | * tick timer from irq_enter() so that the jiffies update is kept | ||
| 1098 | * alive during long running softirqs. That's ugly as hell, but | ||
| 1099 | * correctness is key even if we need to fix the offending softirq in | ||
| 1100 | * the first place. | ||
| 1101 | * | ||
| 1102 | * Note, this is different to tick_nohz_restart. We just kick the | ||
| 1103 | * timer and do not touch the other magic bits which need to be done | ||
| 1104 | * when idle is left. | ||
| 1105 | */ | ||
| 1106 | static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) | ||
| 1107 | { | ||
| 1108 | #if 0 | ||
| 1109 | /* Switch back to 2.6.27 behaviour */ | ||
| 1110 | ktime_t delta; | ||
| 1111 | |||
| 1112 | /* | ||
| 1113 | * Do not touch the tick device, when the next expiry is either | ||
| 1114 | * already reached or less/equal than the tick period. | ||
| 1115 | */ | ||
| 1116 | delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); | ||
| 1117 | if (delta.tv64 <= tick_period.tv64) | ||
| 1118 | return; | ||
| 1119 | |||
| 1120 | tick_nohz_restart(ts, now); | ||
| 1121 | #endif | ||
| 1122 | } | ||
| 1123 | |||
| 1124 | static inline void tick_nohz_irq_enter(void) | 1106 | static inline void tick_nohz_irq_enter(void) |
| 1125 | { | 1107 | { |
| 1126 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); | 1108 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| @@ -1131,10 +1113,8 @@ static inline void tick_nohz_irq_enter(void) | |||
| 1131 | now = ktime_get(); | 1113 | now = ktime_get(); |
| 1132 | if (ts->idle_active) | 1114 | if (ts->idle_active) |
| 1133 | tick_nohz_stop_idle(ts, now); | 1115 | tick_nohz_stop_idle(ts, now); |
| 1134 | if (ts->tick_stopped) { | 1116 | if (ts->tick_stopped) |
| 1135 | tick_nohz_update_jiffies(now); | 1117 | tick_nohz_update_jiffies(now); |
| 1136 | tick_nohz_kick_tick(ts, now); | ||
| 1137 | } | ||
| 1138 | } | 1118 | } |
| 1139 | 1119 | ||
| 1140 | #else | 1120 | #else |
| @@ -1211,7 +1191,7 @@ void tick_setup_sched_timer(void) | |||
| 1211 | hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 1191 | hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); |
| 1212 | ts->sched_timer.function = tick_sched_timer; | 1192 | ts->sched_timer.function = tick_sched_timer; |
| 1213 | 1193 | ||
| 1214 | /* Get the next period (per cpu) */ | 1194 | /* Get the next period (per-CPU) */ |
| 1215 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); | 1195 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); |
| 1216 | 1196 | ||
| 1217 | /* Offset the tick to avert jiffies_lock contention. */ | 1197 | /* Offset the tick to avert jiffies_lock contention. */ |
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c index 86628e755f38..7142580ad94f 100644 --- a/kernel/time/timeconv.c +++ b/kernel/time/timeconv.c | |||
| @@ -67,20 +67,21 @@ static const unsigned short __mon_yday[2][13] = { | |||
| 67 | #define SECS_PER_DAY (SECS_PER_HOUR * 24) | 67 | #define SECS_PER_DAY (SECS_PER_HOUR * 24) |
| 68 | 68 | ||
| 69 | /** | 69 | /** |
| 70 | * time_to_tm - converts the calendar time to local broken-down time | 70 | * time64_to_tm - converts the calendar time to local broken-down time |
| 71 | * | 71 | * |
| 72 | * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, | 72 | * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, |
| 73 | * Coordinated Universal Time (UTC). | 73 | * Coordinated Universal Time (UTC). |
| 74 | * @offset offset seconds adding to totalsecs. | 74 | * @offset offset seconds adding to totalsecs. |
| 75 | * @result pointer to struct tm variable to receive broken-down time | 75 | * @result pointer to struct tm variable to receive broken-down time |
| 76 | */ | 76 | */ |
| 77 | void time_to_tm(time_t totalsecs, int offset, struct tm *result) | 77 | void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) |
| 78 | { | 78 | { |
| 79 | long days, rem, y; | 79 | long days, rem, y; |
| 80 | int remainder; | ||
| 80 | const unsigned short *ip; | 81 | const unsigned short *ip; |
| 81 | 82 | ||
| 82 | days = totalsecs / SECS_PER_DAY; | 83 | days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder); |
| 83 | rem = totalsecs % SECS_PER_DAY; | 84 | rem = remainder; |
| 84 | rem += offset; | 85 | rem += offset; |
| 85 | while (rem < 0) { | 86 | while (rem < 0) { |
| 86 | rem += SECS_PER_DAY; | 87 | rem += SECS_PER_DAY; |
| @@ -124,4 +125,4 @@ void time_to_tm(time_t totalsecs, int offset, struct tm *result) | |||
| 124 | result->tm_mon = y; | 125 | result->tm_mon = y; |
| 125 | result->tm_mday = days + 1; | 126 | result->tm_mday = days + 1; |
| 126 | } | 127 | } |
| 127 | EXPORT_SYMBOL(time_to_tm); | 128 | EXPORT_SYMBOL(time64_to_tm); |
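time64_to_tm() above switches the day split from plain '/' and '%' to div_s64_rem() because time64_t stays 64 bits wide on 32-bit kernels, where an open-coded 64-bit division would need libgcc helpers. A short sketch of the same split, including the negative-remainder normalisation the function performs (split_days() is an invented helper name, not from this patch):

    #include <linux/math64.h>
    #include <linux/time64.h>

    #define SECS_PER_DAY (24 * 60 * 60)

    /* Split @totalsecs into whole days and a positive seconds-of-day value,
     * mirroring the div_s64_rem() use in time64_to_tm(). */
    static s64 split_days(time64_t totalsecs, int *secs_of_day)
    {
            s32 rem;
            s64 days = div_s64_rem(totalsecs, SECS_PER_DAY, &rem);

            /* div_s64_rem() truncates toward zero, so a negative input leaves
             * a negative remainder; borrow a day to normalise it. */
            while (rem < 0) {
                    rem += SECS_PER_DAY;
                    days--;
            }
            *secs_of_day = rem;
            return days;
    }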
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 479d25cd3d4f..3b65746c7f15 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -480,10 +480,12 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) | |||
| 480 | * users are removed, this can be killed. | 480 | * users are removed, this can be killed. |
| 481 | */ | 481 | */ |
| 482 | remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); | 482 | remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); |
| 483 | tk->tkr_mono.xtime_nsec -= remainder; | 483 | if (remainder != 0) { |
| 484 | tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; | 484 | tk->tkr_mono.xtime_nsec -= remainder; |
| 485 | tk->ntp_error += remainder << tk->ntp_error_shift; | 485 | tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; |
| 486 | tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; | 486 | tk->ntp_error += remainder << tk->ntp_error_shift; |
| 487 | tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; | ||
| 488 | } | ||
| 487 | } | 489 | } |
| 488 | #else | 490 | #else |
| 489 | #define old_vsyscall_fixup(tk) | 491 | #define old_vsyscall_fixup(tk) |
| @@ -2186,6 +2188,7 @@ struct timespec64 get_monotonic_coarse64(void) | |||
| 2186 | 2188 | ||
| 2187 | return now; | 2189 | return now; |
| 2188 | } | 2190 | } |
| 2191 | EXPORT_SYMBOL(get_monotonic_coarse64); | ||
| 2189 | 2192 | ||
| 2190 | /* | 2193 | /* |
| 2191 | * Must hold jiffies_lock | 2194 | * Must hold jiffies_lock |
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3a95f9728778..555670a5143c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
| @@ -59,43 +59,153 @@ __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; | |||
| 59 | EXPORT_SYMBOL(jiffies_64); | 59 | EXPORT_SYMBOL(jiffies_64); |
| 60 | 60 | ||
| 61 | /* | 61 | /* |
| 62 | * per-CPU timer vector definitions: | 62 | * The timer wheel has LVL_DEPTH array levels. Each level provides an array of |
| 63 | * LVL_SIZE buckets. Each level is driven by its own clock and therefore each | ||
| 64 | * level has a different granularity. | ||
| 65 | * | ||
| 66 | * The level granularity is: LVL_CLK_DIV ^ lvl | ||
| 67 | * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) | ||
| 68 | * | ||
| 69 | * The array level of a newly armed timer depends on the relative expiry | ||
| 70 | * time. The farther away the expiry time is, the higher the array level and | ||
| 71 | * therefore the coarser the granularity becomes. | ||
| 72 | * | ||
| 73 | * Contrary to the original timer wheel implementation, which aims for 'exact' | ||
| 74 | * expiry of the timers, this implementation removes the need for recascading | ||
| 75 | * the timers into the lower array levels. The previous 'classic' timer wheel | ||
| 76 | * implementation of the kernel already violated the 'exact' expiry by adding | ||
| 77 | * slack to the expiry time to provide batched expiration. The granularity | ||
| 78 | * levels provide implicit batching. | ||
| 79 | * | ||
| 80 | * This is an optimization of the original timer wheel implementation for the | ||
| 81 | * majority of the timer wheel use cases: timeouts. The vast majority of | ||
| 82 | * timeout timers (networking, disk I/O ...) are canceled before expiry. If | ||
| 83 | * the timeout expires it indicates that normal operation is disturbed, so it | ||
| 84 | * does not matter much whether the timeout comes with a slight delay. | ||
| 85 | * | ||
| 86 | * The only exceptions to this are networking timers with a small expiry | ||
| 87 | * time. They rely on the granularity. Those fit into the first wheel level, | ||
| 88 | * which has HZ granularity. | ||
| 89 | * | ||
| 90 | * We don't have cascading anymore. Timers with an expiry time above the | ||
| 91 | * capacity of the last wheel level are force expired at the maximum timeout | ||
| 92 | * value of the last wheel level. From data sampling we know that the maximum | ||
| 93 | * value observed is 5 days (network connection tracking), so this should not | ||
| 94 | * be an issue. | ||
| 95 | * | ||
| 96 | * The currently chosen array constants are a good compromise between | ||
| 97 | * array size and granularity. | ||
| 98 | * | ||
| 99 | * This results in the following granularity and range levels: | ||
| 100 | * | ||
| 101 | * HZ 1000 steps | ||
| 102 | * Level Offset Granularity Range | ||
| 103 | * 0 0 1 ms 0 ms - 63 ms | ||
| 104 | * 1 64 8 ms 64 ms - 511 ms | ||
| 105 | * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s) | ||
| 106 | * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s) | ||
| 107 | * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m) | ||
| 108 | * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m) | ||
| 109 | * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h) | ||
| 110 | * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d) | ||
| 111 | * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d) | ||
| 112 | * | ||
| 113 | * HZ 300 | ||
| 114 | * Level Offset Granularity Range | ||
| 115 | * 0 0 3 ms 0 ms - 210 ms | ||
| 116 | * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s) | ||
| 117 | * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s) | ||
| 118 | * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m) | ||
| 119 | * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m) | ||
| 120 | * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h) | ||
| 121 | * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h) | ||
| 122 | * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d) | ||
| 123 | * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) | ||
| 124 | * | ||
| 125 | * HZ 250 | ||
| 126 | * Level Offset Granularity Range | ||
| 127 | * 0 0 4 ms 0 ms - 255 ms | ||
| 128 | * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s) | ||
| 129 | * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s) | ||
| 130 | * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m) | ||
| 131 | * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m) | ||
| 132 | * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h) | ||
| 133 | * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) | ||
| 134 | * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) | ||
| 135 | * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) | ||
| 136 | * | ||
| 137 | * HZ 100 | ||
| 138 | * Level Offset Granularity Range | ||
| 139 | * 0 0 10 ms 0 ms - 630 ms | ||
| 140 | * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s) | ||
| 141 | * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s) | ||
| 142 | * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m) | ||
| 143 | * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m) | ||
| 144 | * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h) | ||
| 145 | * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d) | ||
| 146 | * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d) | ||
| 63 | */ | 147 | */ |
| 64 | #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) | ||
| 65 | #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) | ||
| 66 | #define TVN_SIZE (1 << TVN_BITS) | ||
| 67 | #define TVR_SIZE (1 << TVR_BITS) | ||
| 68 | #define TVN_MASK (TVN_SIZE - 1) | ||
| 69 | #define TVR_MASK (TVR_SIZE - 1) | ||
| 70 | #define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) | ||
| 71 | |||
| 72 | struct tvec { | ||
| 73 | struct hlist_head vec[TVN_SIZE]; | ||
| 74 | }; | ||
| 75 | 148 | ||
| 76 | struct tvec_root { | 149 | /* Clock divisor for the next level */ |
| 77 | struct hlist_head vec[TVR_SIZE]; | 150 | #define LVL_CLK_SHIFT 3 |
| 78 | }; | 151 | #define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT) |
| 152 | #define LVL_CLK_MASK (LVL_CLK_DIV - 1) | ||
| 153 | #define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT) | ||
| 154 | #define LVL_GRAN(n) (1UL << LVL_SHIFT(n)) | ||
| 79 | 155 | ||
| 80 | struct tvec_base { | 156 | /* |
| 81 | spinlock_t lock; | 157 | * The time start value for each level to select the bucket at enqueue |
| 82 | struct timer_list *running_timer; | 158 | * time. |
| 83 | unsigned long timer_jiffies; | 159 | */ |
| 84 | unsigned long next_timer; | 160 | #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) |
| 85 | unsigned long active_timers; | 161 | |
| 86 | unsigned long all_timers; | 162 | /* Size of each clock level */ |
| 87 | int cpu; | 163 | #define LVL_BITS 6 |
| 88 | bool migration_enabled; | 164 | #define LVL_SIZE (1UL << LVL_BITS) |
| 89 | bool nohz_active; | 165 | #define LVL_MASK (LVL_SIZE - 1) |
| 90 | struct tvec_root tv1; | 166 | #define LVL_OFFS(n) ((n) * LVL_SIZE) |
| 91 | struct tvec tv2; | 167 | |
| 92 | struct tvec tv3; | 168 | /* Level depth */ |
| 93 | struct tvec tv4; | 169 | #if HZ > 100 |
| 94 | struct tvec tv5; | 170 | # define LVL_DEPTH 9 |
| 95 | } ____cacheline_aligned; | 171 | # else |
| 172 | # define LVL_DEPTH 8 | ||
| 173 | #endif | ||
| 174 | |||
| 175 | /* The cutoff (max. capacity of the wheel) */ | ||
| 176 | #define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH)) | ||
| 177 | #define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) | ||
| 178 | |||
| 179 | /* | ||
| 180 | * The resulting wheel size. If NOHZ is configured we allocate two | ||
| 181 | * wheels so we have a separate storage for the deferrable timers. | ||
| 182 | */ | ||
| 183 | #define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) | ||
| 184 | |||
| 185 | #ifdef CONFIG_NO_HZ_COMMON | ||
| 186 | # define NR_BASES 2 | ||
| 187 | # define BASE_STD 0 | ||
| 188 | # define BASE_DEF 1 | ||
| 189 | #else | ||
| 190 | # define NR_BASES 1 | ||
| 191 | # define BASE_STD 0 | ||
| 192 | # define BASE_DEF 0 | ||
| 193 | #endif | ||
| 96 | 194 | ||
| 195 | struct timer_base { | ||
| 196 | spinlock_t lock; | ||
| 197 | struct timer_list *running_timer; | ||
| 198 | unsigned long clk; | ||
| 199 | unsigned long next_expiry; | ||
| 200 | unsigned int cpu; | ||
| 201 | bool migration_enabled; | ||
| 202 | bool nohz_active; | ||
| 203 | bool is_idle; | ||
| 204 | DECLARE_BITMAP(pending_map, WHEEL_SIZE); | ||
| 205 | struct hlist_head vectors[WHEEL_SIZE]; | ||
| 206 | } ____cacheline_aligned; | ||
| 97 | 207 | ||
| 98 | static DEFINE_PER_CPU(struct tvec_base, tvec_bases); | 208 | static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); |
| 99 | 209 | ||
| 100 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) | 210 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) |
| 101 | unsigned int sysctl_timer_migration = 1; | 211 | unsigned int sysctl_timer_migration = 1; |
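To make the LVL_* arithmetic in the comment and macros above concrete, the standalone userspace sketch below recomputes, for HZ=1000 (LVL_CLK_SHIFT=3, LVL_BITS=6, LVL_DEPTH=9), each level's bucket offset, granularity and the LVL_START() threshold the index calculation compares the relative expiry against; the millisecond columns of the HZ 1000 table are these jiffies values, lightly rounded.

    #include <stdio.h>

    #define LVL_CLK_SHIFT   3
    #define LVL_BITS        6
    #define LVL_SIZE        (1UL << LVL_BITS)
    #define LVL_SHIFT(n)    ((n) * LVL_CLK_SHIFT)
    #define LVL_GRAN(n)     (1UL << LVL_SHIFT(n))
    #define LVL_OFFS(n)     ((n) * LVL_SIZE)
    #define LVL_START(n)    ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
    #define LVL_DEPTH       9                       /* HZ > 100 */
    #define WHEEL_TIMEOUT_CUTOFF    (LVL_START(LVL_DEPTH))

    int main(void)
    {
            unsigned long lvl;

            for (lvl = 0; lvl < LVL_DEPTH; lvl++) {
                    /* A timer is queued at level n while its relative expiry
                     * (delta) is below LVL_START(n + 1); the last level is
                     * capped at WHEEL_TIMEOUT_CUTOFF. */
                    unsigned long limit = (lvl < LVL_DEPTH - 1) ?
                                          LVL_START(lvl + 1) : WHEEL_TIMEOUT_CUTOFF;

                    printf("level %lu: offset %3lu, granularity %6lu jiffies, delta < %lu\n",
                           lvl, LVL_OFFS(lvl), LVL_GRAN(lvl), limit);
            }
            return 0;
    }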
| @@ -106,15 +216,17 @@ void timers_update_migration(bool update_nohz) | |||
| 106 | unsigned int cpu; | 216 | unsigned int cpu; |
| 107 | 217 | ||
| 108 | /* Avoid the loop, if nothing to update */ | 218 | /* Avoid the loop, if nothing to update */ |
| 109 | if (this_cpu_read(tvec_bases.migration_enabled) == on) | 219 | if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on) |
| 110 | return; | 220 | return; |
| 111 | 221 | ||
| 112 | for_each_possible_cpu(cpu) { | 222 | for_each_possible_cpu(cpu) { |
| 113 | per_cpu(tvec_bases.migration_enabled, cpu) = on; | 223 | per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on; |
| 224 | per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on; | ||
| 114 | per_cpu(hrtimer_bases.migration_enabled, cpu) = on; | 225 | per_cpu(hrtimer_bases.migration_enabled, cpu) = on; |
| 115 | if (!update_nohz) | 226 | if (!update_nohz) |
| 116 | continue; | 227 | continue; |
| 117 | per_cpu(tvec_bases.nohz_active, cpu) = true; | 228 | per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true; |
| 229 | per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true; | ||
| 118 | per_cpu(hrtimer_bases.nohz_active, cpu) = true; | 230 | per_cpu(hrtimer_bases.nohz_active, cpu) = true; |
| 119 | } | 231 | } |
| 120 | } | 232 | } |
| @@ -133,20 +245,6 @@ int timer_migration_handler(struct ctl_table *table, int write, | |||
| 133 | mutex_unlock(&mutex); | 245 | mutex_unlock(&mutex); |
| 134 | return ret; | 246 | return ret; |
| 135 | } | 247 | } |
| 136 | |||
| 137 | static inline struct tvec_base *get_target_base(struct tvec_base *base, | ||
| 138 | int pinned) | ||
| 139 | { | ||
| 140 | if (pinned || !base->migration_enabled) | ||
| 141 | return this_cpu_ptr(&tvec_bases); | ||
| 142 | return per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); | ||
| 143 | } | ||
| 144 | #else | ||
| 145 | static inline struct tvec_base *get_target_base(struct tvec_base *base, | ||
| 146 | int pinned) | ||
| 147 | { | ||
| 148 | return this_cpu_ptr(&tvec_bases); | ||
| 149 | } | ||
| 150 | #endif | 248 | #endif |
| 151 | 249 | ||
| 152 | static unsigned long round_jiffies_common(unsigned long j, int cpu, | 250 | static unsigned long round_jiffies_common(unsigned long j, int cpu, |
| @@ -351,101 +449,126 @@ unsigned long round_jiffies_up_relative(unsigned long j) | |||
| 351 | } | 449 | } |
| 352 | EXPORT_SYMBOL_GPL(round_jiffies_up_relative); | 450 | EXPORT_SYMBOL_GPL(round_jiffies_up_relative); |
| 353 | 451 | ||
| 354 | /** | 452 | |
| 355 | * set_timer_slack - set the allowed slack for a timer | 453 | static inline unsigned int timer_get_idx(struct timer_list *timer) |
| 356 | * @timer: the timer to be modified | ||
| 357 | * @slack_hz: the amount of time (in jiffies) allowed for rounding | ||
| 358 | * | ||
| 359 | * Set the amount of time, in jiffies, that a certain timer has | ||
| 360 | * in terms of slack. By setting this value, the timer subsystem | ||
| 361 | * will schedule the actual timer somewhere between | ||
| 362 | * the time mod_timer() asks for, and that time plus the slack. | ||
| 363 | * | ||
| 364 | * By setting the slack to -1, a percentage of the delay is used | ||
| 365 | * instead. | ||
| 366 | */ | ||
| 367 | void set_timer_slack(struct timer_list *timer, int slack_hz) | ||
| 368 | { | 454 | { |
| 369 | timer->slack = slack_hz; | 455 | return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT; |
| 370 | } | 456 | } |
| 371 | EXPORT_SYMBOL_GPL(set_timer_slack); | ||
| 372 | 457 | ||
| 373 | static void | 458 | static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) |
| 374 | __internal_add_timer(struct tvec_base *base, struct timer_list *timer) | ||
| 375 | { | 459 | { |
| 376 | unsigned long expires = timer->expires; | 460 | timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | |
| 377 | unsigned long idx = expires - base->timer_jiffies; | 461 | idx << TIMER_ARRAYSHIFT; |
| 378 | struct hlist_head *vec; | 462 | } |
| 379 | 463 | ||
| 380 | if (idx < TVR_SIZE) { | 464 | /* |
| 381 | int i = expires & TVR_MASK; | 465 | * Helper function to calculate the array index for a given expiry |
| 382 | vec = base->tv1.vec + i; | 466 | * time. |
| 383 | } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { | 467 | */ |
| 384 | int i = (expires >> TVR_BITS) & TVN_MASK; | 468 | static inline unsigned calc_index(unsigned expires, unsigned lvl) |
| 385 | vec = base->tv2.vec + i; | 469 | { |
| 386 | } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { | 470 | expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); |
| 387 | int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; | 471 | return LVL_OFFS(lvl) + (expires & LVL_MASK); |
| 388 | vec = base->tv3.vec + i; | 472 | } |
| 389 | } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { | 473 | |
| 390 | int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; | 474 | static int calc_wheel_index(unsigned long expires, unsigned long clk) |
| 391 | vec = base->tv4.vec + i; | 475 | { |
| 392 | } else if ((signed long) idx < 0) { | 476 | unsigned long delta = expires - clk; |
| 393 | /* | 477 | unsigned int idx; |
| 394 | * Can happen if you add a timer with expires == jiffies, | 478 | |
| 395 | * or you set a timer to go off in the past | 479 | if (delta < LVL_START(1)) { |
| 396 | */ | 480 | idx = calc_index(expires, 0); |
| 397 | vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); | 481 | } else if (delta < LVL_START(2)) { |
| 482 | idx = calc_index(expires, 1); | ||
| 483 | } else if (delta < LVL_START(3)) { | ||
| 484 | idx = calc_index(expires, 2); | ||
| 485 | } else if (delta < LVL_START(4)) { | ||
| 486 | idx = calc_index(expires, 3); | ||
| 487 | } else if (delta < LVL_START(5)) { | ||
| 488 | idx = calc_index(expires, 4); | ||
| 489 | } else if (delta < LVL_START(6)) { | ||
| 490 | idx = calc_index(expires, 5); | ||
| 491 | } else if (delta < LVL_START(7)) { | ||
| 492 | idx = calc_index(expires, 6); | ||
| 493 | } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { | ||
| 494 | idx = calc_index(expires, 7); | ||
| 495 | } else if ((long) delta < 0) { | ||
| 496 | idx = clk & LVL_MASK; | ||
| 398 | } else { | 497 | } else { |
| 399 | int i; | 498 | /* |
| 400 | /* If the timeout is larger than MAX_TVAL (on 64-bit | 499 | * Force expire obscene large timeouts to expire at the |
| 401 | * architectures or with CONFIG_BASE_SMALL=1) then we | 500 | * capacity limit of the wheel. |
| 402 | * use the maximum timeout. | ||
| 403 | */ | 501 | */ |
| 404 | if (idx > MAX_TVAL) { | 502 | if (expires >= WHEEL_TIMEOUT_CUTOFF) |
| 405 | idx = MAX_TVAL; | 503 | expires = WHEEL_TIMEOUT_MAX; |
| 406 | expires = idx + base->timer_jiffies; | 504 | |
| 407 | } | 505 | idx = calc_index(expires, LVL_DEPTH - 1); |
| 408 | i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; | ||
| 409 | vec = base->tv5.vec + i; | ||
| 410 | } | 506 | } |
| 507 | return idx; | ||
| 508 | } | ||
| 509 | |||
| 510 | /* | ||
| 511 | * Enqueue the timer into the hash bucket, mark it pending in | ||
| 512 | * the bitmap and store the index in the timer flags. | ||
| 513 | */ | ||
| 514 | static void enqueue_timer(struct timer_base *base, struct timer_list *timer, | ||
| 515 | unsigned int idx) | ||
| 516 | { | ||
| 517 | hlist_add_head(&timer->entry, base->vectors + idx); | ||
| 518 | __set_bit(idx, base->pending_map); | ||
| 519 | timer_set_idx(timer, idx); | ||
| 520 | } | ||
| 521 | |||
| 522 | static void | ||
| 523 | __internal_add_timer(struct timer_base *base, struct timer_list *timer) | ||
| 524 | { | ||
| 525 | unsigned int idx; | ||
| 411 | 526 | ||
| 412 | hlist_add_head(&timer->entry, vec); | 527 | idx = calc_wheel_index(timer->expires, base->clk); |
| 528 | enqueue_timer(base, timer, idx); | ||
| 413 | } | 529 | } |
| 414 | 530 | ||
| 415 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) | 531 | static void |
| 532 | trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) | ||
| 416 | { | 533 | { |
| 417 | /* Advance base->jiffies, if the base is empty */ | 534 | if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) |
| 418 | if (!base->all_timers++) | 535 | return; |
| 419 | base->timer_jiffies = jiffies; | ||
| 420 | 536 | ||
| 421 | __internal_add_timer(base, timer); | ||
| 422 | /* | 537 | /* |
| 423 | * Update base->active_timers and base->next_timer | 538 | * TODO: This wants some optimizing similar to the code below, but we |
| 539 | * will do that when we switch from push to pull for deferrable timers. | ||
| 424 | */ | 540 | */ |
| 425 | if (!(timer->flags & TIMER_DEFERRABLE)) { | 541 | if (timer->flags & TIMER_DEFERRABLE) { |
| 426 | if (!base->active_timers++ || | 542 | if (tick_nohz_full_cpu(base->cpu)) |
| 427 | time_before(timer->expires, base->next_timer)) | 543 | wake_up_nohz_cpu(base->cpu); |
| 428 | base->next_timer = timer->expires; | 544 | return; |
| 429 | } | 545 | } |
| 430 | 546 | ||
| 431 | /* | 547 | /* |
| 432 | * Check whether the other CPU is in dynticks mode and needs | 548 | * We might have to IPI the remote CPU if the base is idle and the |
| 433 | * to be triggered to reevaluate the timer wheel. | 549 | * timer is not deferrable. If the other CPU is on the way to idle |
| 434 | * We are protected against the other CPU fiddling | 550 | * then it can't set base->is_idle as we hold the base lock: |
| 435 | * with the timer by holding the timer base lock. This also | ||
| 436 | * makes sure that a CPU on the way to stop its tick can not | ||
| 437 | * evaluate the timer wheel. | ||
| 438 | * | ||
| 439 | * Spare the IPI for deferrable timers on idle targets though. | ||
| 440 | * The next busy ticks will take care of it. Except full dynticks | ||
| 441 | * require special care against races with idle_cpu(), lets deal | ||
| 442 | * with that later. | ||
| 443 | */ | 551 | */ |
| 444 | if (base->nohz_active) { | 552 | if (!base->is_idle) |
| 445 | if (!(timer->flags & TIMER_DEFERRABLE) || | 553 | return; |
| 446 | tick_nohz_full_cpu(base->cpu)) | 554 | |
| 447 | wake_up_nohz_cpu(base->cpu); | 555 | /* Check whether this is the new first expiring timer: */ |
| 448 | } | 556 | if (time_after_eq(timer->expires, base->next_expiry)) |
| 557 | return; | ||
| 558 | |||
| 559 | /* | ||
| 560 | * Set the next expiry time and kick the CPU so it can reevaluate the | ||
| 561 | * wheel: | ||
| 562 | */ | ||
| 563 | base->next_expiry = timer->expires; | ||
| 564 | wake_up_nohz_cpu(base->cpu); | ||
| 565 | } | ||
| 566 | |||
| 567 | static void | ||
| 568 | internal_add_timer(struct timer_base *base, struct timer_list *timer) | ||
| 569 | { | ||
| 570 | __internal_add_timer(base, timer); | ||
| 571 | trigger_dyntick_cpu(base, timer); | ||
| 449 | } | 572 | } |
| 450 | 573 | ||
| 451 | #ifdef CONFIG_TIMER_STATS | 574 | #ifdef CONFIG_TIMER_STATS |
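The bucket selection in calc_wheel_index()/calc_index() above can be exercised directly in userspace. The sketch below is self-contained (it repeats the HZ=1000 constants from the sketch earlier) and folds the explicit if/else ladder into a loop for brevity; passing clk = 0 makes the absolute expiry equal to the relative delta.

    #include <stdio.h>

    #define LVL_CLK_SHIFT   3
    #define LVL_BITS        6
    #define LVL_SIZE        (1UL << LVL_BITS)
    #define LVL_MASK        (LVL_SIZE - 1)
    #define LVL_SHIFT(n)    ((n) * LVL_CLK_SHIFT)
    #define LVL_GRAN(n)     (1UL << LVL_SHIFT(n))
    #define LVL_OFFS(n)     ((n) * LVL_SIZE)
    #define LVL_START(n)    ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
    #define LVL_DEPTH       9
    #define WHEEL_TIMEOUT_CUTOFF    (LVL_START(LVL_DEPTH))
    #define WHEEL_TIMEOUT_MAX       (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))

    static unsigned calc_index(unsigned expires, unsigned lvl)
    {
            expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
            return LVL_OFFS(lvl) + (expires & LVL_MASK);
    }

    static unsigned calc_wheel_index(unsigned long expires, unsigned long clk)
    {
            unsigned long delta = expires - clk;
            unsigned int lvl;

            /* Same selection as the if/else chain above, written as a loop. */
            for (lvl = 0; lvl < LVL_DEPTH - 1; lvl++) {
                    if (delta < LVL_START(lvl + 1))
                            return calc_index(expires, lvl);
            }
            if ((long) delta < 0)
                    return clk & LVL_MASK;
            if (expires >= WHEEL_TIMEOUT_CUTOFF)
                    expires = WHEEL_TIMEOUT_MAX;
            return calc_index(expires, LVL_DEPTH - 1);
    }

    int main(void)
    {
            unsigned long deltas[] = { 10, 100, 1000, 60 * 1000, 3600 * 1000 };
            unsigned int i;

            for (i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++) {
                    unsigned int idx = calc_wheel_index(deltas[i], 0);

                    printf("delta %8lu jiffies -> level %u, bucket %u\n",
                           deltas[i], idx >> LVL_BITS, idx);
            }
            return 0;
    }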
| @@ -666,7 +789,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, | |||
| 666 | { | 789 | { |
| 667 | timer->entry.pprev = NULL; | 790 | timer->entry.pprev = NULL; |
| 668 | timer->flags = flags | raw_smp_processor_id(); | 791 | timer->flags = flags | raw_smp_processor_id(); |
| 669 | timer->slack = -1; | ||
| 670 | #ifdef CONFIG_TIMER_STATS | 792 | #ifdef CONFIG_TIMER_STATS |
| 671 | timer->start_site = NULL; | 793 | timer->start_site = NULL; |
| 672 | timer->start_pid = -1; | 794 | timer->start_pid = -1; |
| @@ -706,54 +828,125 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending) | |||
| 706 | entry->next = LIST_POISON2; | 828 | entry->next = LIST_POISON2; |
| 707 | } | 829 | } |
| 708 | 830 | ||
| 709 | static inline void | 831 | static int detach_if_pending(struct timer_list *timer, struct timer_base *base, |
| 710 | detach_expired_timer(struct timer_list *timer, struct tvec_base *base) | ||
| 711 | { | ||
| 712 | detach_timer(timer, true); | ||
| 713 | if (!(timer->flags & TIMER_DEFERRABLE)) | ||
| 714 | base->active_timers--; | ||
| 715 | base->all_timers--; | ||
| 716 | } | ||
| 717 | |||
| 718 | static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, | ||
| 719 | bool clear_pending) | 832 | bool clear_pending) |
| 720 | { | 833 | { |
| 834 | unsigned idx = timer_get_idx(timer); | ||
| 835 | |||
| 721 | if (!timer_pending(timer)) | 836 | if (!timer_pending(timer)) |
| 722 | return 0; | 837 | return 0; |
| 723 | 838 | ||
| 839 | if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) | ||
| 840 | __clear_bit(idx, base->pending_map); | ||
| 841 | |||
| 724 | detach_timer(timer, clear_pending); | 842 | detach_timer(timer, clear_pending); |
| 725 | if (!(timer->flags & TIMER_DEFERRABLE)) { | ||
| 726 | base->active_timers--; | ||
| 727 | if (timer->expires == base->next_timer) | ||
| 728 | base->next_timer = base->timer_jiffies; | ||
| 729 | } | ||
| 730 | /* If this was the last timer, advance base->jiffies */ | ||
| 731 | if (!--base->all_timers) | ||
| 732 | base->timer_jiffies = jiffies; | ||
| 733 | return 1; | 843 | return 1; |
| 734 | } | 844 | } |
| 735 | 845 | ||
| 846 | static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) | ||
| 847 | { | ||
| 848 | struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); | ||
| 849 | |||
| 850 | /* | ||
| 851 | * If the timer is deferrable and nohz is active then we need to use | ||
| 852 | * the deferrable base. | ||
| 853 | */ | ||
| 854 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && | ||
| 855 | (tflags & TIMER_DEFERRABLE)) | ||
| 856 | base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); | ||
| 857 | return base; | ||
| 858 | } | ||
| 859 | |||
| 860 | static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) | ||
| 861 | { | ||
| 862 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); | ||
| 863 | |||
| 864 | /* | ||
| 865 | * If the timer is deferrable and nohz is active then we need to use | ||
| 866 | * the deferrable base. | ||
| 867 | */ | ||
| 868 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && | ||
| 869 | (tflags & TIMER_DEFERRABLE)) | ||
| 870 | base = this_cpu_ptr(&timer_bases[BASE_DEF]); | ||
| 871 | return base; | ||
| 872 | } | ||
| 873 | |||
| 874 | static inline struct timer_base *get_timer_base(u32 tflags) | ||
| 875 | { | ||
| 876 | return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); | ||
| 877 | } | ||
| 878 | |||
| 879 | #ifdef CONFIG_NO_HZ_COMMON | ||
| 880 | static inline struct timer_base * | ||
| 881 | __get_target_base(struct timer_base *base, unsigned tflags) | ||
| 882 | { | ||
| 883 | #ifdef CONFIG_SMP | ||
| 884 | if ((tflags & TIMER_PINNED) || !base->migration_enabled) | ||
| 885 | return get_timer_this_cpu_base(tflags); | ||
| 886 | return get_timer_cpu_base(tflags, get_nohz_timer_target()); | ||
| 887 | #else | ||
| 888 | return get_timer_this_cpu_base(tflags); | ||
| 889 | #endif | ||
| 890 | } | ||
| 891 | |||
| 892 | static inline void forward_timer_base(struct timer_base *base) | ||
| 893 | { | ||
| 894 | /* | ||
| 895 | * We only forward the base when it's idle and we have a delta between | ||
| 896 | * base clock and jiffies. | ||
| 897 | */ | ||
| 898 | if (!base->is_idle || (long) (jiffies - base->clk) < 2) | ||
| 899 | return; | ||
| 900 | |||
| 901 | /* | ||
| 902 | * If the next expiry value is > jiffies, then we fast forward to | ||
| 903 | * jiffies otherwise we forward to the next expiry value. | ||
| 904 | */ | ||
| 905 | if (time_after(base->next_expiry, jiffies)) | ||
| 906 | base->clk = jiffies; | ||
| 907 | else | ||
| 908 | base->clk = base->next_expiry; | ||
| 909 | } | ||
| 910 | #else | ||
| 911 | static inline struct timer_base * | ||
| 912 | __get_target_base(struct timer_base *base, unsigned tflags) | ||
| 913 | { | ||
| 914 | return get_timer_this_cpu_base(tflags); | ||
| 915 | } | ||
| 916 | |||
| 917 | static inline void forward_timer_base(struct timer_base *base) { } | ||
| 918 | #endif | ||
| 919 | |||
| 920 | static inline struct timer_base * | ||
| 921 | get_target_base(struct timer_base *base, unsigned tflags) | ||
| 922 | { | ||
| 923 | struct timer_base *target = __get_target_base(base, tflags); | ||
| 924 | |||
| 925 | forward_timer_base(target); | ||
| 926 | return target; | ||
| 927 | } | ||
| 928 | |||
| 736 | /* | 929 | /* |
| 737 | * We are using hashed locking: holding per_cpu(tvec_bases).lock | 930 | * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means |
| 738 | * means that all timers which are tied to this base via timer->base are | 931 | * that all timers which are tied to this base are locked, and the base itself |
| 739 | * locked, and the base itself is locked too. | 932 | * is locked too. |
| 740 | * | 933 | * |
| 741 | * So __run_timers/migrate_timers can safely modify all timers which could | 934 | * So __run_timers/migrate_timers can safely modify all timers which could |
| 742 | * be found on ->tvX lists. | 935 | * be found in the base->vectors array. |
| 743 | * | 936 | * |
| 744 | * When the timer's base is locked and removed from the list, the | 937 | * When a timer is migrating then the TIMER_MIGRATING flag is set and we need |
| 745 | * TIMER_MIGRATING flag is set, FIXME | 938 | * to wait until the migration is done. |
| 746 | */ | 939 | */ |
| 747 | static struct tvec_base *lock_timer_base(struct timer_list *timer, | 940 | static struct timer_base *lock_timer_base(struct timer_list *timer, |
| 748 | unsigned long *flags) | 941 | unsigned long *flags) |
| 749 | __acquires(timer->base->lock) | 942 | __acquires(timer->base->lock) |
| 750 | { | 943 | { |
| 751 | for (;;) { | 944 | for (;;) { |
| 945 | struct timer_base *base; | ||
| 752 | u32 tf = timer->flags; | 946 | u32 tf = timer->flags; |
| 753 | struct tvec_base *base; | ||
| 754 | 947 | ||
| 755 | if (!(tf & TIMER_MIGRATING)) { | 948 | if (!(tf & TIMER_MIGRATING)) { |
| 756 | base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); | 949 | base = get_timer_base(tf); |
| 757 | spin_lock_irqsave(&base->lock, *flags); | 950 | spin_lock_irqsave(&base->lock, *flags); |
| 758 | if (timer->flags == tf) | 951 | if (timer->flags == tf) |
| 759 | return base; | 952 | return base; |
| @@ -764,13 +957,41 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, | |||
| 764 | } | 957 | } |
| 765 | 958 | ||
| 766 | static inline int | 959 | static inline int |
| 767 | __mod_timer(struct timer_list *timer, unsigned long expires, | 960 | __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) |
| 768 | bool pending_only, int pinned) | ||
| 769 | { | 961 | { |
| 770 | struct tvec_base *base, *new_base; | 962 | struct timer_base *base, *new_base; |
| 771 | unsigned long flags; | 963 | unsigned int idx = UINT_MAX; |
| 964 | unsigned long clk = 0, flags; | ||
| 772 | int ret = 0; | 965 | int ret = 0; |
| 773 | 966 | ||
| 967 | /* | ||
| 968 | * This is a common optimization triggered by the networking code - if | ||
| 969 | * the timer is re-modified to have the same timeout or ends up in the | ||
| 970 | * same array bucket then just return: | ||
| 971 | */ | ||
| 972 | if (timer_pending(timer)) { | ||
| 973 | if (timer->expires == expires) | ||
| 974 | return 1; | ||
| 975 | /* | ||
| 976 | * Take the current timer_jiffies of base, but without holding | ||
| 977 | * the lock! | ||
| 978 | */ | ||
| 979 | base = get_timer_base(timer->flags); | ||
| 980 | clk = base->clk; | ||
| 981 | |||
| 982 | idx = calc_wheel_index(expires, clk); | ||
| 983 | |||
| 984 | /* | ||
| 985 | * Retrieve and compare the array index of the pending | ||
| 986 | * timer. If it matches, set the expiry to the new value so a | ||
| 987 | * subsequent call will exit in the expires check above. | ||
| 988 | */ | ||
| 989 | if (idx == timer_get_idx(timer)) { | ||
| 990 | timer->expires = expires; | ||
| 991 | return 1; | ||
| 992 | } | ||
| 993 | } | ||
| 994 | |||
| 774 | timer_stats_timer_set_start_info(timer); | 995 | timer_stats_timer_set_start_info(timer); |
| 775 | BUG_ON(!timer->function); | 996 | BUG_ON(!timer->function); |
| 776 | 997 | ||
| @@ -782,15 +1003,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires, | |||
| 782 | 1003 | ||
| 783 | debug_activate(timer, expires); | 1004 | debug_activate(timer, expires); |
| 784 | 1005 | ||
| 785 | new_base = get_target_base(base, pinned); | 1006 | new_base = get_target_base(base, timer->flags); |
| 786 | 1007 | ||
| 787 | if (base != new_base) { | 1008 | if (base != new_base) { |
| 788 | /* | 1009 | /* |
| 789 | * We are trying to schedule the timer on the local CPU. | 1010 | * We are trying to schedule the timer on the new base. |
| 790 | * However we can't change timer's base while it is running, | 1011 | * However we can't change timer's base while it is running, |
| 791 | * otherwise del_timer_sync() can't detect that the timer's | 1012 | * otherwise del_timer_sync() can't detect that the timer's |
| 792 | * handler has not finished yet. This also guarantees that | 1013 | * handler has not finished yet. This also guarantees that the |
| 793 | * the timer is serialized wrt itself. | 1014 | * timer is serialized wrt itself. |
| 794 | */ | 1015 | */ |
| 795 | if (likely(base->running_timer != timer)) { | 1016 | if (likely(base->running_timer != timer)) { |
| 796 | /* See the comment in lock_timer_base() */ | 1017 | /* See the comment in lock_timer_base() */ |
| @@ -805,7 +1026,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, | |||
| 805 | } | 1026 | } |
| 806 | 1027 | ||
| 807 | timer->expires = expires; | 1028 | timer->expires = expires; |
| 808 | internal_add_timer(base, timer); | 1029 | /* |
| 1030 | * If 'idx' was calculated above and the base time did not advance | ||
| 1031 | * between calculating 'idx' and taking the lock, only enqueue_timer() | ||
| 1032 | * and trigger_dyntick_cpu() are required. Otherwise we need to | ||
| 1033 | * (re)calculate the wheel index via internal_add_timer(). | ||
| 1034 | */ | ||
| 1035 | if (idx != UINT_MAX && clk == base->clk) { | ||
| 1036 | enqueue_timer(base, timer, idx); | ||
| 1037 | trigger_dyntick_cpu(base, timer); | ||
| 1038 | } else { | ||
| 1039 | internal_add_timer(base, timer); | ||
| 1040 | } | ||
| 809 | 1041 | ||
| 810 | out_unlock: | 1042 | out_unlock: |
| 811 | spin_unlock_irqrestore(&base->lock, flags); | 1043 | spin_unlock_irqrestore(&base->lock, flags); |
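The timer_pending() fast path added at the top of __mod_timer() is easiest to see from a caller's point of view. A minimal, hypothetical sketch of the networking-style pattern it optimizes for (the timer name and callback are illustrative, not part of this patch):

        /* A keepalive timer re-armed on every received packet.  Most re-arms
         * either keep the same expiry or land in the same wheel bucket, so
         * __mod_timer() returns 1 early without taking the base lock. */
        static void keepalive_fn(unsigned long data)
        {
                /* tear down or refresh connection state here */
        }
        static DEFINE_TIMER(keepalive_timer, keepalive_fn, 0, 0);

        static void packet_received(void)
        {
                mod_timer(&keepalive_timer, jiffies + 10 * HZ);
        }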
| @@ -825,49 +1057,10 @@ out_unlock: | |||
| 825 | */ | 1057 | */ |
| 826 | int mod_timer_pending(struct timer_list *timer, unsigned long expires) | 1058 | int mod_timer_pending(struct timer_list *timer, unsigned long expires) |
| 827 | { | 1059 | { |
| 828 | return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); | 1060 | return __mod_timer(timer, expires, true); |
| 829 | } | 1061 | } |
| 830 | EXPORT_SYMBOL(mod_timer_pending); | 1062 | EXPORT_SYMBOL(mod_timer_pending); |
| 831 | 1063 | ||
| 832 | /* | ||
| 833 | * Decide where to put the timer while taking the slack into account | ||
| 834 | * | ||
| 835 | * Algorithm: | ||
| 836 | * 1) calculate the maximum (absolute) time | ||
| 837 | * 2) calculate the highest bit where the expires and new max are different | ||
| 838 | * 3) use this bit to make a mask | ||
| 839 | * 4) use the bitmask to round down the maximum time, so that all last | ||
| 840 | * bits are zeros | ||
| 841 | */ | ||
| 842 | static inline | ||
| 843 | unsigned long apply_slack(struct timer_list *timer, unsigned long expires) | ||
| 844 | { | ||
| 845 | unsigned long expires_limit, mask; | ||
| 846 | int bit; | ||
| 847 | |||
| 848 | if (timer->slack >= 0) { | ||
| 849 | expires_limit = expires + timer->slack; | ||
| 850 | } else { | ||
| 851 | long delta = expires - jiffies; | ||
| 852 | |||
| 853 | if (delta < 256) | ||
| 854 | return expires; | ||
| 855 | |||
| 856 | expires_limit = expires + delta / 256; | ||
| 857 | } | ||
| 858 | mask = expires ^ expires_limit; | ||
| 859 | if (mask == 0) | ||
| 860 | return expires; | ||
| 861 | |||
| 862 | bit = __fls(mask); | ||
| 863 | |||
| 864 | mask = (1UL << bit) - 1; | ||
| 865 | |||
| 866 | expires_limit = expires_limit & ~(mask); | ||
| 867 | |||
| 868 | return expires_limit; | ||
| 869 | } | ||
| 870 | |||
| 871 | /** | 1064 | /** |
| 872 | * mod_timer - modify a timer's timeout | 1065 | * mod_timer - modify a timer's timeout |
| 873 | * @timer: the timer to be modified | 1066 | * @timer: the timer to be modified |
| @@ -890,49 +1083,11 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) | |||
| 890 | */ | 1083 | */ |
| 891 | int mod_timer(struct timer_list *timer, unsigned long expires) | 1084 | int mod_timer(struct timer_list *timer, unsigned long expires) |
| 892 | { | 1085 | { |
| 893 | expires = apply_slack(timer, expires); | 1086 | return __mod_timer(timer, expires, false); |
| 894 | |||
| 895 | /* | ||
| 896 | * This is a common optimization triggered by the | ||
| 897 | * networking code - if the timer is re-modified | ||
| 898 | * to be the same thing then just return: | ||
| 899 | */ | ||
| 900 | if (timer_pending(timer) && timer->expires == expires) | ||
| 901 | return 1; | ||
| 902 | |||
| 903 | return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); | ||
| 904 | } | 1087 | } |
| 905 | EXPORT_SYMBOL(mod_timer); | 1088 | EXPORT_SYMBOL(mod_timer); |
| 906 | 1089 | ||
| 907 | /** | 1090 | /** |
| 908 | * mod_timer_pinned - modify a timer's timeout | ||
| 909 | * @timer: the timer to be modified | ||
| 910 | * @expires: new timeout in jiffies | ||
| 911 | * | ||
| 912 | * mod_timer_pinned() is a way to update the expire field of an | ||
| 913 | * active timer (if the timer is inactive it will be activated) | ||
| 914 | * and to ensure that the timer is scheduled on the current CPU. | ||
| 915 | * | ||
| 916 | * Note that this does not prevent the timer from being migrated | ||
| 917 | * when the current CPU goes offline. If this is a problem for | ||
| 918 | * you, use CPU-hotplug notifiers to handle it correctly, for | ||
| 919 | * example, cancelling the timer when the corresponding CPU goes | ||
| 920 | * offline. | ||
| 921 | * | ||
| 922 | * mod_timer_pinned(timer, expires) is equivalent to: | ||
| 923 | * | ||
| 924 | * del_timer(timer); timer->expires = expires; add_timer(timer); | ||
| 925 | */ | ||
| 926 | int mod_timer_pinned(struct timer_list *timer, unsigned long expires) | ||
| 927 | { | ||
| 928 | if (timer->expires == expires && timer_pending(timer)) | ||
| 929 | return 1; | ||
| 930 | |||
| 931 | return __mod_timer(timer, expires, false, TIMER_PINNED); | ||
| 932 | } | ||
| 933 | EXPORT_SYMBOL(mod_timer_pinned); | ||
| 934 | |||
| 935 | /** | ||
| 936 | * add_timer - start a timer | 1091 | * add_timer - start a timer |
| 937 | * @timer: the timer to be added | 1092 | * @timer: the timer to be added |
| 938 | * | 1093 | * |
| @@ -962,13 +1117,14 @@ EXPORT_SYMBOL(add_timer); | |||
| 962 | */ | 1117 | */ |
| 963 | void add_timer_on(struct timer_list *timer, int cpu) | 1118 | void add_timer_on(struct timer_list *timer, int cpu) |
| 964 | { | 1119 | { |
| 965 | struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu); | 1120 | struct timer_base *new_base, *base; |
| 966 | struct tvec_base *base; | ||
| 967 | unsigned long flags; | 1121 | unsigned long flags; |
| 968 | 1122 | ||
| 969 | timer_stats_timer_set_start_info(timer); | 1123 | timer_stats_timer_set_start_info(timer); |
| 970 | BUG_ON(timer_pending(timer) || !timer->function); | 1124 | BUG_ON(timer_pending(timer) || !timer->function); |
| 971 | 1125 | ||
| 1126 | new_base = get_timer_cpu_base(timer->flags, cpu); | ||
| 1127 | |||
| 972 | /* | 1128 | /* |
| 973 | * If @timer was on a different CPU, it should be migrated with the | 1129 | * If @timer was on a different CPU, it should be migrated with the |
| 974 | * old base locked to prevent other operations proceeding with the | 1130 | * old base locked to prevent other operations proceeding with the |
| @@ -1004,7 +1160,7 @@ EXPORT_SYMBOL_GPL(add_timer_on); | |||
| 1004 | */ | 1160 | */ |
| 1005 | int del_timer(struct timer_list *timer) | 1161 | int del_timer(struct timer_list *timer) |
| 1006 | { | 1162 | { |
| 1007 | struct tvec_base *base; | 1163 | struct timer_base *base; |
| 1008 | unsigned long flags; | 1164 | unsigned long flags; |
| 1009 | int ret = 0; | 1165 | int ret = 0; |
| 1010 | 1166 | ||
| @@ -1030,7 +1186,7 @@ EXPORT_SYMBOL(del_timer); | |||
| 1030 | */ | 1186 | */ |
| 1031 | int try_to_del_timer_sync(struct timer_list *timer) | 1187 | int try_to_del_timer_sync(struct timer_list *timer) |
| 1032 | { | 1188 | { |
| 1033 | struct tvec_base *base; | 1189 | struct timer_base *base; |
| 1034 | unsigned long flags; | 1190 | unsigned long flags; |
| 1035 | int ret = -1; | 1191 | int ret = -1; |
| 1036 | 1192 | ||
| @@ -1114,27 +1270,6 @@ int del_timer_sync(struct timer_list *timer) | |||
| 1114 | EXPORT_SYMBOL(del_timer_sync); | 1270 | EXPORT_SYMBOL(del_timer_sync); |
| 1115 | #endif | 1271 | #endif |
| 1116 | 1272 | ||
| 1117 | static int cascade(struct tvec_base *base, struct tvec *tv, int index) | ||
| 1118 | { | ||
| 1119 | /* cascade all the timers from tv up one level */ | ||
| 1120 | struct timer_list *timer; | ||
| 1121 | struct hlist_node *tmp; | ||
| 1122 | struct hlist_head tv_list; | ||
| 1123 | |||
| 1124 | hlist_move_list(tv->vec + index, &tv_list); | ||
| 1125 | |||
| 1126 | /* | ||
| 1127 | * We are removing _all_ timers from the list, so we | ||
| 1128 | * don't have to detach them individually. | ||
| 1129 | */ | ||
| 1130 | hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) { | ||
| 1131 | /* No accounting, while moving them */ | ||
| 1132 | __internal_add_timer(base, timer); | ||
| 1133 | } | ||
| 1134 | |||
| 1135 | return index; | ||
| 1136 | } | ||
| 1137 | |||
| 1138 | static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | 1273 | static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), |
| 1139 | unsigned long data) | 1274 | unsigned long data) |
| 1140 | { | 1275 | { |
| @@ -1178,147 +1313,141 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | |||
| 1178 | } | 1313 | } |
| 1179 | } | 1314 | } |
| 1180 | 1315 | ||
| 1181 | #define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) | 1316 | static void expire_timers(struct timer_base *base, struct hlist_head *head) |
| 1182 | |||
| 1183 | /** | ||
| 1184 | * __run_timers - run all expired timers (if any) on this CPU. | ||
| 1185 | * @base: the timer vector to be processed. | ||
| 1186 | * | ||
| 1187 | * This function cascades all vectors and executes all expired timer | ||
| 1188 | * vectors. | ||
| 1189 | */ | ||
| 1190 | static inline void __run_timers(struct tvec_base *base) | ||
| 1191 | { | 1317 | { |
| 1192 | struct timer_list *timer; | 1318 | while (!hlist_empty(head)) { |
| 1319 | struct timer_list *timer; | ||
| 1320 | void (*fn)(unsigned long); | ||
| 1321 | unsigned long data; | ||
| 1193 | 1322 | ||
| 1194 | spin_lock_irq(&base->lock); | 1323 | timer = hlist_entry(head->first, struct timer_list, entry); |
| 1324 | timer_stats_account_timer(timer); | ||
| 1195 | 1325 | ||
| 1196 | while (time_after_eq(jiffies, base->timer_jiffies)) { | 1326 | base->running_timer = timer; |
| 1197 | struct hlist_head work_list; | 1327 | detach_timer(timer, true); |
| 1198 | struct hlist_head *head = &work_list; | ||
| 1199 | int index; | ||
| 1200 | 1328 | ||
| 1201 | if (!base->all_timers) { | 1329 | fn = timer->function; |
| 1202 | base->timer_jiffies = jiffies; | 1330 | data = timer->data; |
| 1203 | break; | 1331 | |
| 1332 | if (timer->flags & TIMER_IRQSAFE) { | ||
| 1333 | spin_unlock(&base->lock); | ||
| 1334 | call_timer_fn(timer, fn, data); | ||
| 1335 | spin_lock(&base->lock); | ||
| 1336 | } else { | ||
| 1337 | spin_unlock_irq(&base->lock); | ||
| 1338 | call_timer_fn(timer, fn, data); | ||
| 1339 | spin_lock_irq(&base->lock); | ||
| 1204 | } | 1340 | } |
| 1341 | } | ||
| 1342 | } | ||
| 1205 | 1343 | ||
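expire_timers() above drops the base lock in two different ways: for a TIMER_IRQSAFE timer only the spinlock is released and interrupts stay disabled across the callback, which is what allows del_timer_sync() to be used on such a timer from interrupt context. A hedged sketch of how a timer opts in, assuming the __setup_timer() helper from include/linux/timer.h (the names are illustrative):

        static struct timer_list poll_timer;

        static void poll_fn(unsigned long data)
        {
                /* runs with interrupts disabled because of TIMER_IRQSAFE */
        }

        static void poll_init(void)
        {
                __setup_timer(&poll_timer, poll_fn, 0, TIMER_IRQSAFE);
                mod_timer(&poll_timer, jiffies + 1);
        }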
| 1206 | index = base->timer_jiffies & TVR_MASK; | 1344 | static int __collect_expired_timers(struct timer_base *base, |
| 1345 | struct hlist_head *heads) | ||
| 1346 | { | ||
| 1347 | unsigned long clk = base->clk; | ||
| 1348 | struct hlist_head *vec; | ||
| 1349 | int i, levels = 0; | ||
| 1350 | unsigned int idx; | ||
| 1207 | 1351 | ||
| 1208 | /* | 1352 | for (i = 0; i < LVL_DEPTH; i++) { |
| 1209 | * Cascade timers: | 1353 | idx = (clk & LVL_MASK) + i * LVL_SIZE; |
| 1210 | */ | 1354 | |
| 1211 | if (!index && | 1355 | if (__test_and_clear_bit(idx, base->pending_map)) { |
| 1212 | (!cascade(base, &base->tv2, INDEX(0))) && | 1356 | vec = base->vectors + idx; |
| 1213 | (!cascade(base, &base->tv3, INDEX(1))) && | 1357 | hlist_move_list(vec, heads++); |
| 1214 | !cascade(base, &base->tv4, INDEX(2))) | 1358 | levels++; |
| 1215 | cascade(base, &base->tv5, INDEX(3)); | ||
| 1216 | ++base->timer_jiffies; | ||
| 1217 | hlist_move_list(base->tv1.vec + index, head); | ||
| 1218 | while (!hlist_empty(head)) { | ||
| 1219 | void (*fn)(unsigned long); | ||
| 1220 | unsigned long data; | ||
| 1221 | bool irqsafe; | ||
| 1222 | |||
| 1223 | timer = hlist_entry(head->first, struct timer_list, entry); | ||
| 1224 | fn = timer->function; | ||
| 1225 | data = timer->data; | ||
| 1226 | irqsafe = timer->flags & TIMER_IRQSAFE; | ||
| 1227 | |||
| 1228 | timer_stats_account_timer(timer); | ||
| 1229 | |||
| 1230 | base->running_timer = timer; | ||
| 1231 | detach_expired_timer(timer, base); | ||
| 1232 | |||
| 1233 | if (irqsafe) { | ||
| 1234 | spin_unlock(&base->lock); | ||
| 1235 | call_timer_fn(timer, fn, data); | ||
| 1236 | spin_lock(&base->lock); | ||
| 1237 | } else { | ||
| 1238 | spin_unlock_irq(&base->lock); | ||
| 1239 | call_timer_fn(timer, fn, data); | ||
| 1240 | spin_lock_irq(&base->lock); | ||
| 1241 | } | ||
| 1242 | } | 1359 | } |
| 1360 | /* Is it time to look at the next level? */ | ||
| 1361 | if (clk & LVL_CLK_MASK) | ||
| 1362 | break; | ||
| 1363 | /* Shift clock for the next level granularity */ | ||
| 1364 | clk >>= LVL_CLK_SHIFT; | ||
| 1243 | } | 1365 | } |
| 1244 | base->running_timer = NULL; | 1366 | return levels; |
| 1245 | spin_unlock_irq(&base->lock); | ||
| 1246 | } | 1367 | } |
| 1247 | 1368 | ||
| 1248 | #ifdef CONFIG_NO_HZ_COMMON | 1369 | #ifdef CONFIG_NO_HZ_COMMON |
| 1249 | /* | 1370 | /* |
| 1250 | * Find out when the next timer event is due to happen. This | 1371 | * Find the next pending bucket of a level. Search from level start (@offset) |
| 1251 | * is used on S/390 to stop all activity when a CPU is idle. | 1372 | * + @clk upwards and if nothing there, search from start of the level |
| 1252 | * This function needs to be called with interrupts disabled. | 1373 | * (@offset) up to @offset + clk. |
| 1374 | */ | ||
| 1375 | static int next_pending_bucket(struct timer_base *base, unsigned offset, | ||
| 1376 | unsigned clk) | ||
| 1377 | { | ||
| 1378 | unsigned pos, start = offset + clk; | ||
| 1379 | unsigned end = offset + LVL_SIZE; | ||
| 1380 | |||
| 1381 | pos = find_next_bit(base->pending_map, end, start); | ||
| 1382 | if (pos < end) | ||
| 1383 | return pos - start; | ||
| 1384 | |||
| 1385 | pos = find_next_bit(base->pending_map, start, offset); | ||
| 1386 | return pos < start ? pos + LVL_SIZE - start : -1; | ||
| 1387 | } | ||
| 1388 | |||
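next_pending_bucket() scans one level of the wheel in two passes: first from the current position up to the end of the level, then wrapping around from the start of the level, and returns the distance in buckets (or -1). A small user-space analogue of the same search, assuming LVL_SIZE is 64 as in the new wheel code (the real function uses find_next_bit() on base->pending_map):

        #include <stdio.h>

        #define LVL_SIZE 64

        static int toy_next_pending_bucket(const unsigned char *pending,
                                           unsigned int offset, unsigned int clk)
        {
                unsigned int pos, start = offset + clk, end = offset + LVL_SIZE;

                for (pos = start; pos < end; pos++)     /* search upwards first */
                        if (pending[pos])
                                return pos - start;
                for (pos = offset; pos < start; pos++)  /* then wrap to level start */
                        if (pending[pos])
                                return pos + LVL_SIZE - start;
                return -1;
        }

        int main(void)
        {
                unsigned char pending[LVL_SIZE] = { 0 };

                pending[5] = 1;         /* a timer queued in bucket 5 */
                /* With clk = 60 the bucket lies behind the clock, so the wrap
                 * path reports it as 5 + 64 - 60 = 9 buckets ahead. */
                printf("%d\n", toy_next_pending_bucket(pending, 0, 60));
                return 0;
        }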
| 1389 | /* | ||
| 1390 | * Search the first expiring timer in the various clock levels. Caller must | ||
| 1391 | * hold base->lock. | ||
| 1253 | */ | 1392 | */ |
| 1254 | static unsigned long __next_timer_interrupt(struct tvec_base *base) | 1393 | static unsigned long __next_timer_interrupt(struct timer_base *base) |
| 1255 | { | 1394 | { |
| 1256 | unsigned long timer_jiffies = base->timer_jiffies; | 1395 | unsigned long clk, next, adj; |
| 1257 | unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; | 1396 | unsigned lvl, offset = 0; |
| 1258 | int index, slot, array, found = 0; | 1397 | |
| 1259 | struct timer_list *nte; | 1398 | next = base->clk + NEXT_TIMER_MAX_DELTA; |
| 1260 | struct tvec *varray[4]; | 1399 | clk = base->clk; |
| 1261 | 1400 | for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { | |
| 1262 | /* Look for timer events in tv1. */ | 1401 | int pos = next_pending_bucket(base, offset, clk & LVL_MASK); |
| 1263 | index = slot = timer_jiffies & TVR_MASK; | 1402 | |
| 1264 | do { | 1403 | if (pos >= 0) { |
| 1265 | hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { | 1404 | unsigned long tmp = clk + (unsigned long) pos; |
| 1266 | if (nte->flags & TIMER_DEFERRABLE) | 1405 | |
| 1267 | continue; | 1406 | tmp <<= LVL_SHIFT(lvl); |
| 1268 | 1407 | if (time_before(tmp, next)) | |
| 1269 | found = 1; | 1408 | next = tmp; |
| 1270 | expires = nte->expires; | ||
| 1271 | /* Look at the cascade bucket(s)? */ | ||
| 1272 | if (!index || slot < index) | ||
| 1273 | goto cascade; | ||
| 1274 | return expires; | ||
| 1275 | } | 1409 | } |
| 1276 | slot = (slot + 1) & TVR_MASK; | 1410 | /* |
| 1277 | } while (slot != index); | 1411 | * Clock for the next level. If the current level clock lower |
| 1278 | 1412 | * bits are zero, we look at the next level as is. If not we | |
| 1279 | cascade: | 1413 | * need to advance it by one because that's going to be the |
| 1280 | /* Calculate the next cascade event */ | 1414 | * next expiring bucket in that level. base->clk is the next |
| 1281 | if (index) | 1415 | * expiring jiffie. So in case of: |
| 1282 | timer_jiffies += TVR_SIZE - index; | 1416 | * |
| 1283 | timer_jiffies >>= TVR_BITS; | 1417 | * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 |
| 1284 | 1418 | * 0 0 0 0 0 0 | |
| 1285 | /* Check tv2-tv5. */ | 1419 | * |
| 1286 | varray[0] = &base->tv2; | 1420 | * we have to look at all levels @index 0. With |
| 1287 | varray[1] = &base->tv3; | 1421 | * |
| 1288 | varray[2] = &base->tv4; | 1422 | * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 |
| 1289 | varray[3] = &base->tv5; | 1423 | * 0 0 0 0 0 2 |
| 1290 | 1424 | * | |
| 1291 | for (array = 0; array < 4; array++) { | 1425 | * LVL0 has the next expiring bucket @index 2. The upper |
| 1292 | struct tvec *varp = varray[array]; | 1426 | * levels have the next expiring bucket @index 1. |
| 1293 | 1427 | * | |
| 1294 | index = slot = timer_jiffies & TVN_MASK; | 1428 | * In case that the propagation wraps the next level the same |
| 1295 | do { | 1429 | * rules apply: |
| 1296 | hlist_for_each_entry(nte, varp->vec + slot, entry) { | 1430 | * |
| 1297 | if (nte->flags & TIMER_DEFERRABLE) | 1431 | * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 |
| 1298 | continue; | 1432 | * 0 0 0 0 F 2 |
| 1299 | 1433 | * | |
| 1300 | found = 1; | 1434 | * So after looking at LVL0 we get: |
| 1301 | if (time_before(nte->expires, expires)) | 1435 | * |
| 1302 | expires = nte->expires; | 1436 | * LVL5 LVL4 LVL3 LVL2 LVL1 |
| 1303 | } | 1437 | * 0 0 0 1 0 |
| 1304 | /* | 1438 | * |
| 1305 | * Do we still search for the first timer or are | 1439 | * So no propagation from LVL1 to LVL2 because that happened |
| 1306 | * we looking up the cascade buckets ? | 1440 | * with the add already, but then we need to propagate further |
| 1307 | */ | 1441 | * from LVL2 to LVL3. |
| 1308 | if (found) { | 1442 | * |
| 1309 | /* Look at the cascade bucket(s)? */ | 1443 | * So the simple check whether the lower bits of the current |
| 1310 | if (!index || slot < index) | 1444 | * level are 0 or not is sufficient for all cases. |
| 1311 | break; | 1445 | */ |
| 1312 | return expires; | 1446 | adj = clk & LVL_CLK_MASK ? 1 : 0; |
| 1313 | } | 1447 | clk >>= LVL_CLK_SHIFT; |
| 1314 | slot = (slot + 1) & TVN_MASK; | 1448 | clk += adj; |
| 1315 | } while (slot != index); | ||
| 1316 | |||
| 1317 | if (index) | ||
| 1318 | timer_jiffies += TVN_SIZE - index; | ||
| 1319 | timer_jiffies >>= TVN_BITS; | ||
| 1320 | } | 1449 | } |
| 1321 | return expires; | 1450 | return next; |
| 1322 | } | 1451 | } |
| 1323 | 1452 | ||
| 1324 | /* | 1453 | /* |
| @@ -1364,7 +1493,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) | |||
| 1364 | */ | 1493 | */ |
| 1365 | u64 get_next_timer_interrupt(unsigned long basej, u64 basem) | 1494 | u64 get_next_timer_interrupt(unsigned long basej, u64 basem) |
| 1366 | { | 1495 | { |
| 1367 | struct tvec_base *base = this_cpu_ptr(&tvec_bases); | 1496 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); |
| 1368 | u64 expires = KTIME_MAX; | 1497 | u64 expires = KTIME_MAX; |
| 1369 | unsigned long nextevt; | 1498 | unsigned long nextevt; |
| 1370 | 1499 | ||
| @@ -1376,19 +1505,80 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) | |||
| 1376 | return expires; | 1505 | return expires; |
| 1377 | 1506 | ||
| 1378 | spin_lock(&base->lock); | 1507 | spin_lock(&base->lock); |
| 1379 | if (base->active_timers) { | 1508 | nextevt = __next_timer_interrupt(base); |
| 1380 | if (time_before_eq(base->next_timer, base->timer_jiffies)) | 1509 | base->next_expiry = nextevt; |
| 1381 | base->next_timer = __next_timer_interrupt(base); | 1510 | /* |
| 1382 | nextevt = base->next_timer; | 1511 | * We have a fresh next event. Check whether we can forward the base: |
| 1383 | if (time_before_eq(nextevt, basej)) | 1512 | */ |
| 1384 | expires = basem; | 1513 | if (time_after(nextevt, jiffies)) |
| 1385 | else | 1514 | base->clk = jiffies; |
| 1386 | expires = basem + (nextevt - basej) * TICK_NSEC; | 1515 | else if (time_after(nextevt, base->clk)) |
| 1516 | base->clk = nextevt; | ||
| 1517 | |||
| 1518 | if (time_before_eq(nextevt, basej)) { | ||
| 1519 | expires = basem; | ||
| 1520 | base->is_idle = false; | ||
| 1521 | } else { | ||
| 1522 | expires = basem + (nextevt - basej) * TICK_NSEC; | ||
| 1523 | /* | ||
| 1524 | * If we expect to sleep more than a tick, mark the base idle: | ||
| 1525 | */ | ||
| 1526 | if ((expires - basem) > TICK_NSEC) | ||
| 1527 | base->is_idle = true; | ||
| 1387 | } | 1528 | } |
| 1388 | spin_unlock(&base->lock); | 1529 | spin_unlock(&base->lock); |
| 1389 | 1530 | ||
| 1390 | return cmp_next_hrtimer_event(basem, expires); | 1531 | return cmp_next_hrtimer_event(basem, expires); |
| 1391 | } | 1532 | } |
| 1533 | |||
| 1534 | /** | ||
| 1535 | * timer_clear_idle - Clear the idle state of the timer base | ||
| 1536 | * | ||
| 1537 | * Called with interrupts disabled | ||
| 1538 | */ | ||
| 1539 | void timer_clear_idle(void) | ||
| 1540 | { | ||
| 1541 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); | ||
| 1542 | |||
| 1543 | /* | ||
| 1544 | * We do this unlocked. The worst outcome is a remote enqueue sending | ||
| 1545 | * a pointless IPI, but taking the lock would just make the window for | ||
| 1546 | * sending the IPI a few instructions smaller for the cost of taking | ||
| 1547 | * the lock in the exit from idle path. | ||
| 1548 | */ | ||
| 1549 | base->is_idle = false; | ||
| 1550 | } | ||
| 1551 | |||
| 1552 | static int collect_expired_timers(struct timer_base *base, | ||
| 1553 | struct hlist_head *heads) | ||
| 1554 | { | ||
| 1555 | /* | ||
| 1556 | * NOHZ optimization. After a long idle sleep we need to forward the | ||
| 1557 | * base to current jiffies. Avoid a loop by searching the bitfield for | ||
| 1558 | * the next expiring timer. | ||
| 1559 | */ | ||
| 1560 | if ((long)(jiffies - base->clk) > 2) { | ||
| 1561 | unsigned long next = __next_timer_interrupt(base); | ||
| 1562 | |||
| 1563 | /* | ||
| 1564 | * If the next timer is ahead of time, forward to current | ||
| 1565 | * jiffies, otherwise forward to the next expiry time: | ||
| 1566 | */ | ||
| 1567 | if (time_after(next, jiffies)) { | ||
| 1568 | /* The call site will increment clock! */ | ||
| 1569 | base->clk = jiffies - 1; | ||
| 1570 | return 0; | ||
| 1571 | } | ||
| 1572 | base->clk = next; | ||
| 1573 | } | ||
| 1574 | return __collect_expired_timers(base, heads); | ||
| 1575 | } | ||
| 1576 | #else | ||
| 1577 | static inline int collect_expired_timers(struct timer_base *base, | ||
| 1578 | struct hlist_head *heads) | ||
| 1579 | { | ||
| 1580 | return __collect_expired_timers(base, heads); | ||
| 1581 | } | ||
| 1392 | #endif | 1582 | #endif |
| 1393 | 1583 | ||
| 1394 | /* | 1584 | /* |
| @@ -1411,15 +1601,42 @@ void update_process_times(int user_tick) | |||
| 1411 | run_posix_cpu_timers(p); | 1601 | run_posix_cpu_timers(p); |
| 1412 | } | 1602 | } |
| 1413 | 1603 | ||
| 1604 | /** | ||
| 1605 | * __run_timers - run all expired timers (if any) on this CPU. | ||
| 1606 | * @base: the timer vector to be processed. | ||
| 1607 | */ | ||
| 1608 | static inline void __run_timers(struct timer_base *base) | ||
| 1609 | { | ||
| 1610 | struct hlist_head heads[LVL_DEPTH]; | ||
| 1611 | int levels; | ||
| 1612 | |||
| 1613 | if (!time_after_eq(jiffies, base->clk)) | ||
| 1614 | return; | ||
| 1615 | |||
| 1616 | spin_lock_irq(&base->lock); | ||
| 1617 | |||
| 1618 | while (time_after_eq(jiffies, base->clk)) { | ||
| 1619 | |||
| 1620 | levels = collect_expired_timers(base, heads); | ||
| 1621 | base->clk++; | ||
| 1622 | |||
| 1623 | while (levels--) | ||
| 1624 | expire_timers(base, heads + levels); | ||
| 1625 | } | ||
| 1626 | base->running_timer = NULL; | ||
| 1627 | spin_unlock_irq(&base->lock); | ||
| 1628 | } | ||
| 1629 | |||
| 1414 | /* | 1630 | /* |
| 1415 | * This function runs timers and the timer-tq in bottom half context. | 1631 | * This function runs timers and the timer-tq in bottom half context. |
| 1416 | */ | 1632 | */ |
| 1417 | static void run_timer_softirq(struct softirq_action *h) | 1633 | static void run_timer_softirq(struct softirq_action *h) |
| 1418 | { | 1634 | { |
| 1419 | struct tvec_base *base = this_cpu_ptr(&tvec_bases); | 1635 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); |
| 1420 | 1636 | ||
| 1421 | if (time_after_eq(jiffies, base->timer_jiffies)) | 1637 | __run_timers(base); |
| 1422 | __run_timers(base); | 1638 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) |
| 1639 | __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); | ||
| 1423 | } | 1640 | } |
| 1424 | 1641 | ||
| 1425 | /* | 1642 | /* |
| @@ -1427,7 +1644,18 @@ static void run_timer_softirq(struct softirq_action *h) | |||
| 1427 | */ | 1644 | */ |
| 1428 | void run_local_timers(void) | 1645 | void run_local_timers(void) |
| 1429 | { | 1646 | { |
| 1647 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); | ||
| 1648 | |||
| 1430 | hrtimer_run_queues(); | 1649 | hrtimer_run_queues(); |
| 1650 | /* Raise the softirq only if required. */ | ||
| 1651 | if (time_before(jiffies, base->clk)) { | ||
| 1652 | if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) | ||
| 1653 | return; | ||
| 1654 | /* CPU is awake, so check the deferrable base. */ | ||
| 1655 | base++; | ||
| 1656 | if (time_before(jiffies, base->clk)) | ||
| 1657 | return; | ||
| 1658 | } | ||
| 1431 | raise_softirq(TIMER_SOFTIRQ); | 1659 | raise_softirq(TIMER_SOFTIRQ); |
| 1432 | } | 1660 | } |
| 1433 | 1661 | ||
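The base++ step in run_local_timers() works only because the deferrable base sits directly after the standard base in the per-cpu timer_bases[] array. A sketch of the layout this relies on; the actual BASE_STD/BASE_DEF/NR_BASES definitions live earlier in timer.c and are assumed here:

        #ifdef CONFIG_NO_HZ_COMMON
        # define NR_BASES       2
        # define BASE_STD       0
        # define BASE_DEF       1       /* deferrable base == standard base + 1 */
        #else
        # define NR_BASES       1
        # define BASE_STD       0
        # define BASE_DEF       0
        #endif

        static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);

With that layout, this_cpu_ptr(&timer_bases[BASE_STD]) + 1 is the deferrable base whenever nohz_active is set.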
| @@ -1512,7 +1740,7 @@ signed long __sched schedule_timeout(signed long timeout) | |||
| 1512 | expire = timeout + jiffies; | 1740 | expire = timeout + jiffies; |
| 1513 | 1741 | ||
| 1514 | setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); | 1742 | setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); |
| 1515 | __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); | 1743 | __mod_timer(&timer, expire, false); |
| 1516 | schedule(); | 1744 | schedule(); |
| 1517 | del_singleshot_timer_sync(&timer); | 1745 | del_singleshot_timer_sync(&timer); |
| 1518 | 1746 | ||
| @@ -1563,87 +1791,62 @@ signed long __sched schedule_timeout_idle(signed long timeout) | |||
| 1563 | EXPORT_SYMBOL(schedule_timeout_idle); | 1791 | EXPORT_SYMBOL(schedule_timeout_idle); |
| 1564 | 1792 | ||
| 1565 | #ifdef CONFIG_HOTPLUG_CPU | 1793 | #ifdef CONFIG_HOTPLUG_CPU |
| 1566 | static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) | 1794 | static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) |
| 1567 | { | 1795 | { |
| 1568 | struct timer_list *timer; | 1796 | struct timer_list *timer; |
| 1569 | int cpu = new_base->cpu; | 1797 | int cpu = new_base->cpu; |
| 1570 | 1798 | ||
| 1571 | while (!hlist_empty(head)) { | 1799 | while (!hlist_empty(head)) { |
| 1572 | timer = hlist_entry(head->first, struct timer_list, entry); | 1800 | timer = hlist_entry(head->first, struct timer_list, entry); |
| 1573 | /* We ignore the accounting on the dying cpu */ | ||
| 1574 | detach_timer(timer, false); | 1801 | detach_timer(timer, false); |
| 1575 | timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; | 1802 | timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; |
| 1576 | internal_add_timer(new_base, timer); | 1803 | internal_add_timer(new_base, timer); |
| 1577 | } | 1804 | } |
| 1578 | } | 1805 | } |
| 1579 | 1806 | ||
| 1580 | static void migrate_timers(int cpu) | 1807 | int timers_dead_cpu(unsigned int cpu) |
| 1581 | { | 1808 | { |
| 1582 | struct tvec_base *old_base; | 1809 | struct timer_base *old_base; |
| 1583 | struct tvec_base *new_base; | 1810 | struct timer_base *new_base; |
| 1584 | int i; | 1811 | int b, i; |
| 1585 | 1812 | ||
| 1586 | BUG_ON(cpu_online(cpu)); | 1813 | BUG_ON(cpu_online(cpu)); |
| 1587 | old_base = per_cpu_ptr(&tvec_bases, cpu); | ||
| 1588 | new_base = get_cpu_ptr(&tvec_bases); | ||
| 1589 | /* | ||
| 1590 | * The caller is globally serialized and nobody else | ||
| 1591 | * takes two locks at once, deadlock is not possible. | ||
| 1592 | */ | ||
| 1593 | spin_lock_irq(&new_base->lock); | ||
| 1594 | spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); | ||
| 1595 | |||
| 1596 | BUG_ON(old_base->running_timer); | ||
| 1597 | |||
| 1598 | for (i = 0; i < TVR_SIZE; i++) | ||
| 1599 | migrate_timer_list(new_base, old_base->tv1.vec + i); | ||
| 1600 | for (i = 0; i < TVN_SIZE; i++) { | ||
| 1601 | migrate_timer_list(new_base, old_base->tv2.vec + i); | ||
| 1602 | migrate_timer_list(new_base, old_base->tv3.vec + i); | ||
| 1603 | migrate_timer_list(new_base, old_base->tv4.vec + i); | ||
| 1604 | migrate_timer_list(new_base, old_base->tv5.vec + i); | ||
| 1605 | } | ||
| 1606 | 1814 | ||
| 1607 | old_base->active_timers = 0; | 1815 | for (b = 0; b < NR_BASES; b++) { |
| 1608 | old_base->all_timers = 0; | 1816 | old_base = per_cpu_ptr(&timer_bases[b], cpu); |
| 1817 | new_base = get_cpu_ptr(&timer_bases[b]); | ||
| 1818 | /* | ||
| 1819 | * The caller is globally serialized and nobody else | ||
| 1820 | * takes two locks at once, deadlock is not possible. | ||
| 1821 | */ | ||
| 1822 | spin_lock_irq(&new_base->lock); | ||
| 1823 | spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); | ||
| 1609 | 1824 | ||
| 1610 | spin_unlock(&old_base->lock); | 1825 | BUG_ON(old_base->running_timer); |
| 1611 | spin_unlock_irq(&new_base->lock); | ||
| 1612 | put_cpu_ptr(&tvec_bases); | ||
| 1613 | } | ||
| 1614 | 1826 | ||
| 1615 | static int timer_cpu_notify(struct notifier_block *self, | 1827 | for (i = 0; i < WHEEL_SIZE; i++) |
| 1616 | unsigned long action, void *hcpu) | 1828 | migrate_timer_list(new_base, old_base->vectors + i); |
| 1617 | { | ||
| 1618 | switch (action) { | ||
| 1619 | case CPU_DEAD: | ||
| 1620 | case CPU_DEAD_FROZEN: | ||
| 1621 | migrate_timers((long)hcpu); | ||
| 1622 | break; | ||
| 1623 | default: | ||
| 1624 | break; | ||
| 1625 | } | ||
| 1626 | 1829 | ||
| 1627 | return NOTIFY_OK; | 1830 | spin_unlock(&old_base->lock); |
| 1831 | spin_unlock_irq(&new_base->lock); | ||
| 1832 | put_cpu_ptr(&timer_bases); | ||
| 1833 | } | ||
| 1834 | return 0; | ||
| 1628 | } | 1835 | } |
| 1629 | 1836 | ||
| 1630 | static inline void timer_register_cpu_notifier(void) | ||
| 1631 | { | ||
| 1632 | cpu_notifier(timer_cpu_notify, 0); | ||
| 1633 | } | ||
| 1634 | #else | ||
| 1635 | static inline void timer_register_cpu_notifier(void) { } | ||
| 1636 | #endif /* CONFIG_HOTPLUG_CPU */ | 1837 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 1637 | 1838 | ||
| 1638 | static void __init init_timer_cpu(int cpu) | 1839 | static void __init init_timer_cpu(int cpu) |
| 1639 | { | 1840 | { |
| 1640 | struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); | 1841 | struct timer_base *base; |
| 1641 | 1842 | int i; | |
| 1642 | base->cpu = cpu; | ||
| 1643 | spin_lock_init(&base->lock); | ||
| 1644 | 1843 | ||
| 1645 | base->timer_jiffies = jiffies; | 1844 | for (i = 0; i < NR_BASES; i++) { |
| 1646 | base->next_timer = base->timer_jiffies; | 1845 | base = per_cpu_ptr(&timer_bases[i], cpu); |
| 1846 | base->cpu = cpu; | ||
| 1847 | spin_lock_init(&base->lock); | ||
| 1848 | base->clk = jiffies; | ||
| 1849 | } | ||
| 1647 | } | 1850 | } |
| 1648 | 1851 | ||
| 1649 | static void __init init_timer_cpus(void) | 1852 | static void __init init_timer_cpus(void) |
| @@ -1658,7 +1861,6 @@ void __init init_timers(void) | |||
| 1658 | { | 1861 | { |
| 1659 | init_timer_cpus(); | 1862 | init_timer_cpus(); |
| 1660 | init_timer_stats(); | 1863 | init_timer_stats(); |
| 1661 | timer_register_cpu_notifier(); | ||
| 1662 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq); | 1864 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq); |
| 1663 | } | 1865 | } |
| 1664 | 1866 | ||
| @@ -1702,9 +1904,15 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max) | |||
| 1702 | } | 1904 | } |
| 1703 | 1905 | ||
| 1704 | /** | 1906 | /** |
| 1705 | * usleep_range - Drop in replacement for udelay where wakeup is flexible | 1907 | * usleep_range - Sleep for an approximate time |
| 1706 | * @min: Minimum time in usecs to sleep | 1908 | * @min: Minimum time in usecs to sleep |
| 1707 | * @max: Maximum time in usecs to sleep | 1909 | * @max: Maximum time in usecs to sleep |
| 1910 | * | ||
| 1911 | * In non-atomic context where the exact wakeup time is flexible, use | ||
| 1912 | * usleep_range() instead of udelay(). The sleep improves responsiveness | ||
| 1913 | * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces | ||
| 1914 | * power usage by allowing hrtimers to take advantage of an already- | ||
| 1915 | * scheduled interrupt instead of scheduling a new one just for this sleep. | ||
| 1708 | */ | 1916 | */ |
| 1709 | void __sched usleep_range(unsigned long min, unsigned long max) | 1917 | void __sched usleep_range(unsigned long min, unsigned long max) |
| 1710 | { | 1918 | { |
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1adecb4b87c8..087204c733eb 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
| @@ -279,7 +279,7 @@ static void print_name_offset(struct seq_file *m, unsigned long addr) | |||
| 279 | 279 | ||
| 280 | static int tstats_show(struct seq_file *m, void *v) | 280 | static int tstats_show(struct seq_file *m, void *v) |
| 281 | { | 281 | { |
| 282 | struct timespec period; | 282 | struct timespec64 period; |
| 283 | struct entry *entry; | 283 | struct entry *entry; |
| 284 | unsigned long ms; | 284 | unsigned long ms; |
| 285 | long events = 0; | 285 | long events = 0; |
| @@ -295,11 +295,11 @@ static int tstats_show(struct seq_file *m, void *v) | |||
| 295 | 295 | ||
| 296 | time = ktime_sub(time_stop, time_start); | 296 | time = ktime_sub(time_stop, time_start); |
| 297 | 297 | ||
| 298 | period = ktime_to_timespec(time); | 298 | period = ktime_to_timespec64(time); |
| 299 | ms = period.tv_nsec / 1000000; | 299 | ms = period.tv_nsec / 1000000; |
| 300 | 300 | ||
| 301 | seq_puts(m, "Timer Stats Version: v0.3\n"); | 301 | seq_puts(m, "Timer Stats Version: v0.3\n"); |
| 302 | seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); | 302 | seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms); |
| 303 | if (atomic_read(&overflow_count)) | 303 | if (atomic_read(&overflow_count)) |
| 304 | seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); | 304 | seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); |
| 305 | seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); | 305 | seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); |
diff --git a/kernel/torture.c b/kernel/torture.c index fa0bdeee17ac..75961b3decfe 100644 --- a/kernel/torture.c +++ b/kernel/torture.c | |||
| @@ -82,6 +82,104 @@ static int min_online = -1; | |||
| 82 | static int max_online; | 82 | static int max_online; |
| 83 | 83 | ||
| 84 | /* | 84 | /* |
| 85 | * Attempt to take a CPU offline. Return false if the CPU is already | ||
| 86 | * offline or if it is not subject to CPU-hotplug operations. The | ||
| 87 | * caller can detect other failures by looking at the statistics. | ||
| 88 | */ | ||
| 89 | bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, | ||
| 90 | unsigned long *sum_offl, int *min_offl, int *max_offl) | ||
| 91 | { | ||
| 92 | unsigned long delta; | ||
| 93 | int ret; | ||
| 94 | unsigned long starttime; | ||
| 95 | |||
| 96 | if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) | ||
| 97 | return false; | ||
| 98 | |||
| 99 | if (verbose) | ||
| 100 | pr_alert("%s" TORTURE_FLAG | ||
| 101 | "torture_onoff task: offlining %d\n", | ||
| 102 | torture_type, cpu); | ||
| 103 | starttime = jiffies; | ||
| 104 | (*n_offl_attempts)++; | ||
| 105 | ret = cpu_down(cpu); | ||
| 106 | if (ret) { | ||
| 107 | if (verbose) | ||
| 108 | pr_alert("%s" TORTURE_FLAG | ||
| 109 | "torture_onoff task: offline %d failed: errno %d\n", | ||
| 110 | torture_type, cpu, ret); | ||
| 111 | } else { | ||
| 112 | if (verbose) | ||
| 113 | pr_alert("%s" TORTURE_FLAG | ||
| 114 | "torture_onoff task: offlined %d\n", | ||
| 115 | torture_type, cpu); | ||
| 116 | (*n_offl_successes)++; | ||
| 117 | delta = jiffies - starttime; | ||
| 118 | *sum_offl += delta; | ||
| 119 | if (*min_offl < 0) { | ||
| 120 | *min_offl = delta; | ||
| 121 | *max_offl = delta; | ||
| 122 | } | ||
| 123 | if (*min_offl > delta) | ||
| 124 | *min_offl = delta; | ||
| 125 | if (*max_offl < delta) | ||
| 126 | *max_offl = delta; | ||
| 127 | } | ||
| 128 | |||
| 129 | return true; | ||
| 130 | } | ||
| 131 | EXPORT_SYMBOL_GPL(torture_offline); | ||
| 132 | |||
| 133 | /* | ||
| 134 | * Attempt to bring a CPU online. Return false if the CPU is already | ||
| 135 | * online or if it is not subject to CPU-hotplug operations. The | ||
| 136 | * caller can detect other failures by looking at the statistics. | ||
| 137 | */ | ||
| 138 | bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, | ||
| 139 | unsigned long *sum_onl, int *min_onl, int *max_onl) | ||
| 140 | { | ||
| 141 | unsigned long delta; | ||
| 142 | int ret; | ||
| 143 | unsigned long starttime; | ||
| 144 | |||
| 145 | if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) | ||
| 146 | return false; | ||
| 147 | |||
| 148 | if (verbose) | ||
| 149 | pr_alert("%s" TORTURE_FLAG | ||
| 150 | "torture_onoff task: onlining %d\n", | ||
| 151 | torture_type, cpu); | ||
| 152 | starttime = jiffies; | ||
| 153 | (*n_onl_attempts)++; | ||
| 154 | ret = cpu_up(cpu); | ||
| 155 | if (ret) { | ||
| 156 | if (verbose) | ||
| 157 | pr_alert("%s" TORTURE_FLAG | ||
| 158 | "torture_onoff task: online %d failed: errno %d\n", | ||
| 159 | torture_type, cpu, ret); | ||
| 160 | } else { | ||
| 161 | if (verbose) | ||
| 162 | pr_alert("%s" TORTURE_FLAG | ||
| 163 | "torture_onoff task: onlined %d\n", | ||
| 164 | torture_type, cpu); | ||
| 165 | (*n_onl_successes)++; | ||
| 166 | delta = jiffies - starttime; | ||
| 167 | *sum_onl += delta; | ||
| 168 | if (*min_onl < 0) { | ||
| 169 | *min_onl = delta; | ||
| 170 | *max_onl = delta; | ||
| 171 | } | ||
| 172 | if (*min_onl > delta) | ||
| 173 | *min_onl = delta; | ||
| 174 | if (*max_onl < delta) | ||
| 175 | *max_onl = delta; | ||
| 176 | } | ||
| 177 | |||
| 178 | return true; | ||
| 179 | } | ||
| 180 | EXPORT_SYMBOL_GPL(torture_online); | ||
| 181 | |||
| 182 | /* | ||
| 85 | * Execute random CPU-hotplug operations at the interval specified | 183 | * Execute random CPU-hotplug operations at the interval specified |
| 86 | * by the onoff_interval. | 184 | * by the onoff_interval. |
| 87 | */ | 185 | */ |
| @@ -89,16 +187,19 @@ static int | |||
| 89 | torture_onoff(void *arg) | 187 | torture_onoff(void *arg) |
| 90 | { | 188 | { |
| 91 | int cpu; | 189 | int cpu; |
| 92 | unsigned long delta; | ||
| 93 | int maxcpu = -1; | 190 | int maxcpu = -1; |
| 94 | DEFINE_TORTURE_RANDOM(rand); | 191 | DEFINE_TORTURE_RANDOM(rand); |
| 95 | int ret; | ||
| 96 | unsigned long starttime; | ||
| 97 | 192 | ||
| 98 | VERBOSE_TOROUT_STRING("torture_onoff task started"); | 193 | VERBOSE_TOROUT_STRING("torture_onoff task started"); |
| 99 | for_each_online_cpu(cpu) | 194 | for_each_online_cpu(cpu) |
| 100 | maxcpu = cpu; | 195 | maxcpu = cpu; |
| 101 | WARN_ON(maxcpu < 0); | 196 | WARN_ON(maxcpu < 0); |
| 197 | |||
| 198 | if (maxcpu == 0) { | ||
| 199 | VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); | ||
| 200 | goto stop; | ||
| 201 | } | ||
| 202 | |||
| 102 | if (onoff_holdoff > 0) { | 203 | if (onoff_holdoff > 0) { |
| 103 | VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); | 204 | VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); |
| 104 | schedule_timeout_interruptible(onoff_holdoff); | 205 | schedule_timeout_interruptible(onoff_holdoff); |
| @@ -106,69 +207,16 @@ torture_onoff(void *arg) | |||
| 106 | } | 207 | } |
| 107 | while (!torture_must_stop()) { | 208 | while (!torture_must_stop()) { |
| 108 | cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); | 209 | cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); |
| 109 | if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { | 210 | if (!torture_offline(cpu, |
| 110 | if (verbose) | 211 | &n_offline_attempts, &n_offline_successes, |
| 111 | pr_alert("%s" TORTURE_FLAG | 212 | &sum_offline, &min_offline, &max_offline)) |
| 112 | "torture_onoff task: offlining %d\n", | 213 | torture_online(cpu, |
| 113 | torture_type, cpu); | 214 | &n_online_attempts, &n_online_successes, |
| 114 | starttime = jiffies; | 215 | &sum_online, &min_online, &max_online); |
| 115 | n_offline_attempts++; | ||
| 116 | ret = cpu_down(cpu); | ||
| 117 | if (ret) { | ||
| 118 | if (verbose) | ||
| 119 | pr_alert("%s" TORTURE_FLAG | ||
| 120 | "torture_onoff task: offline %d failed: errno %d\n", | ||
| 121 | torture_type, cpu, ret); | ||
| 122 | } else { | ||
| 123 | if (verbose) | ||
| 124 | pr_alert("%s" TORTURE_FLAG | ||
| 125 | "torture_onoff task: offlined %d\n", | ||
| 126 | torture_type, cpu); | ||
| 127 | n_offline_successes++; | ||
| 128 | delta = jiffies - starttime; | ||
| 129 | sum_offline += delta; | ||
| 130 | if (min_offline < 0) { | ||
| 131 | min_offline = delta; | ||
| 132 | max_offline = delta; | ||
| 133 | } | ||
| 134 | if (min_offline > delta) | ||
| 135 | min_offline = delta; | ||
| 136 | if (max_offline < delta) | ||
| 137 | max_offline = delta; | ||
| 138 | } | ||
| 139 | } else if (cpu_is_hotpluggable(cpu)) { | ||
| 140 | if (verbose) | ||
| 141 | pr_alert("%s" TORTURE_FLAG | ||
| 142 | "torture_onoff task: onlining %d\n", | ||
| 143 | torture_type, cpu); | ||
| 144 | starttime = jiffies; | ||
| 145 | n_online_attempts++; | ||
| 146 | ret = cpu_up(cpu); | ||
| 147 | if (ret) { | ||
| 148 | if (verbose) | ||
| 149 | pr_alert("%s" TORTURE_FLAG | ||
| 150 | "torture_onoff task: online %d failed: errno %d\n", | ||
| 151 | torture_type, cpu, ret); | ||
| 152 | } else { | ||
| 153 | if (verbose) | ||
| 154 | pr_alert("%s" TORTURE_FLAG | ||
| 155 | "torture_onoff task: onlined %d\n", | ||
| 156 | torture_type, cpu); | ||
| 157 | n_online_successes++; | ||
| 158 | delta = jiffies - starttime; | ||
| 159 | sum_online += delta; | ||
| 160 | if (min_online < 0) { | ||
| 161 | min_online = delta; | ||
| 162 | max_online = delta; | ||
| 163 | } | ||
| 164 | if (min_online > delta) | ||
| 165 | min_online = delta; | ||
| 166 | if (max_online < delta) | ||
| 167 | max_online = delta; | ||
| 168 | } | ||
| 169 | } | ||
| 170 | schedule_timeout_interruptible(onoff_interval); | 216 | schedule_timeout_interruptible(onoff_interval); |
| 171 | } | 217 | } |
| 218 | |||
| 219 | stop: | ||
| 172 | torture_kthread_stopping("torture_onoff"); | 220 | torture_kthread_stopping("torture_onoff"); |
| 173 | return 0; | 221 | return 0; |
| 174 | } | 222 | } |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fafeaf803bd0..f4b86e8ca1e7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -542,6 +542,7 @@ config HIST_TRIGGERS | |||
| 542 | bool "Histogram triggers" | 542 | bool "Histogram triggers" |
| 543 | depends on ARCH_HAVE_NMI_SAFE_CMPXCHG | 543 | depends on ARCH_HAVE_NMI_SAFE_CMPXCHG |
| 544 | select TRACING_MAP | 544 | select TRACING_MAP |
| 545 | select TRACING | ||
| 545 | default n | 546 | default n |
| 546 | help | 547 | help |
| 547 | Hist triggers allow one or more arbitrary trace event fields | 548 | Hist triggers allow one or more arbitrary trace event fields |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9aef8654e90d..fb345cd11883 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
| @@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk) | |||
| 127 | 127 | ||
| 128 | static void trace_note_time(struct blk_trace *bt) | 128 | static void trace_note_time(struct blk_trace *bt) |
| 129 | { | 129 | { |
| 130 | struct timespec now; | 130 | struct timespec64 now; |
| 131 | unsigned long flags; | 131 | unsigned long flags; |
| 132 | u32 words[2]; | 132 | u32 words[2]; |
| 133 | 133 | ||
| 134 | getnstimeofday(&now); | 134 | /* need to check user space to see if this breaks in y2038 or y2106 */ |
| 135 | words[0] = now.tv_sec; | 135 | ktime_get_real_ts64(&now); |
| 136 | words[0] = (u32)now.tv_sec; | ||
| 136 | words[1] = now.tv_nsec; | 137 | words[1] = now.tv_nsec; |
| 137 | 138 | ||
| 138 | local_irq_save(flags); | 139 | local_irq_save(flags); |
| @@ -189,6 +190,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), | |||
| 189 | BLK_TC_ACT(BLK_TC_WRITE) }; | 190 | BLK_TC_ACT(BLK_TC_WRITE) }; |
| 190 | 191 | ||
| 191 | #define BLK_TC_RAHEAD BLK_TC_AHEAD | 192 | #define BLK_TC_RAHEAD BLK_TC_AHEAD |
| 193 | #define BLK_TC_PREFLUSH BLK_TC_FLUSH | ||
| 192 | 194 | ||
| 193 | /* The ilog2() calls fall out because they're constant */ | 195 | /* The ilog2() calls fall out because they're constant */ |
| 194 | #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ | 196 | #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ |
| @@ -199,7 +201,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), | |||
| 199 | * blk_io_trace structure and places it in a per-cpu subbuffer. | 201 | * blk_io_trace structure and places it in a per-cpu subbuffer. |
| 200 | */ | 202 | */ |
| 201 | static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, | 203 | static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, |
| 202 | int rw, u32 what, int error, int pdu_len, void *pdu_data) | 204 | int op, int op_flags, u32 what, int error, int pdu_len, |
| 205 | void *pdu_data) | ||
| 203 | { | 206 | { |
| 204 | struct task_struct *tsk = current; | 207 | struct task_struct *tsk = current; |
| 205 | struct ring_buffer_event *event = NULL; | 208 | struct ring_buffer_event *event = NULL; |
| @@ -214,13 +217,16 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, | |||
| 214 | if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) | 217 | if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) |
| 215 | return; | 218 | return; |
| 216 | 219 | ||
| 217 | what |= ddir_act[rw & WRITE]; | 220 | what |= ddir_act[op_is_write(op) ? WRITE : READ]; |
| 218 | what |= MASK_TC_BIT(rw, SYNC); | 221 | what |= MASK_TC_BIT(op_flags, SYNC); |
| 219 | what |= MASK_TC_BIT(rw, RAHEAD); | 222 | what |= MASK_TC_BIT(op_flags, RAHEAD); |
| 220 | what |= MASK_TC_BIT(rw, META); | 223 | what |= MASK_TC_BIT(op_flags, META); |
| 221 | what |= MASK_TC_BIT(rw, DISCARD); | 224 | what |= MASK_TC_BIT(op_flags, PREFLUSH); |
| 222 | what |= MASK_TC_BIT(rw, FLUSH); | 225 | what |= MASK_TC_BIT(op_flags, FUA); |
| 223 | what |= MASK_TC_BIT(rw, FUA); | 226 | if (op == REQ_OP_DISCARD) |
| 227 | what |= BLK_TC_ACT(BLK_TC_DISCARD); | ||
| 228 | if (op == REQ_OP_FLUSH) | ||
| 229 | what |= BLK_TC_ACT(BLK_TC_FLUSH); | ||
| 224 | 230 | ||
| 225 | pid = tsk->pid; | 231 | pid = tsk->pid; |
| 226 | if (act_log_check(bt, what, sector, pid)) | 232 | if (act_log_check(bt, what, sector, pid)) |
| @@ -708,11 +714,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, | |||
| 708 | 714 | ||
| 709 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { | 715 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { |
| 710 | what |= BLK_TC_ACT(BLK_TC_PC); | 716 | what |= BLK_TC_ACT(BLK_TC_PC); |
| 711 | __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, | 717 | __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags, |
| 712 | what, rq->errors, rq->cmd_len, rq->cmd); | 718 | what, rq->errors, rq->cmd_len, rq->cmd); |
| 713 | } else { | 719 | } else { |
| 714 | what |= BLK_TC_ACT(BLK_TC_FS); | 720 | what |= BLK_TC_ACT(BLK_TC_FS); |
| 715 | __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, | 721 | __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq), |
| 716 | rq->cmd_flags, what, rq->errors, 0, NULL); | 722 | rq->cmd_flags, what, rq->errors, 0, NULL); |
| 717 | } | 723 | } |
| 718 | } | 724 | } |
| @@ -770,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, | |||
| 770 | return; | 776 | return; |
| 771 | 777 | ||
| 772 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, | 778 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, |
| 773 | bio->bi_rw, what, error, 0, NULL); | 779 | bio_op(bio), bio->bi_rw, what, error, 0, NULL); |
| 774 | } | 780 | } |
| 775 | 781 | ||
| 776 | static void blk_add_trace_bio_bounce(void *ignore, | 782 | static void blk_add_trace_bio_bounce(void *ignore, |
| @@ -818,7 +824,8 @@ static void blk_add_trace_getrq(void *ignore, | |||
| 818 | struct blk_trace *bt = q->blk_trace; | 824 | struct blk_trace *bt = q->blk_trace; |
| 819 | 825 | ||
| 820 | if (bt) | 826 | if (bt) |
| 821 | __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); | 827 | __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, |
| 828 | NULL); | ||
| 822 | } | 829 | } |
| 823 | } | 830 | } |
| 824 | 831 | ||
| @@ -833,7 +840,7 @@ static void blk_add_trace_sleeprq(void *ignore, | |||
| 833 | struct blk_trace *bt = q->blk_trace; | 840 | struct blk_trace *bt = q->blk_trace; |
| 834 | 841 | ||
| 835 | if (bt) | 842 | if (bt) |
| 836 | __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, | 843 | __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, |
| 837 | 0, 0, NULL); | 844 | 0, 0, NULL); |
| 838 | } | 845 | } |
| 839 | } | 846 | } |
| @@ -843,7 +850,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) | |||
| 843 | struct blk_trace *bt = q->blk_trace; | 850 | struct blk_trace *bt = q->blk_trace; |
| 844 | 851 | ||
| 845 | if (bt) | 852 | if (bt) |
| 846 | __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); | 853 | __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); |
| 847 | } | 854 | } |
| 848 | 855 | ||
| 849 | static void blk_add_trace_unplug(void *ignore, struct request_queue *q, | 856 | static void blk_add_trace_unplug(void *ignore, struct request_queue *q, |
| @@ -860,7 +867,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, | |||
| 860 | else | 867 | else |
| 861 | what = BLK_TA_UNPLUG_TIMER; | 868 | what = BLK_TA_UNPLUG_TIMER; |
| 862 | 869 | ||
| 863 | __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); | 870 | __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); |
| 864 | } | 871 | } |
| 865 | } | 872 | } |
| 866 | 873 | ||
| @@ -874,8 +881,9 @@ static void blk_add_trace_split(void *ignore, | |||
| 874 | __be64 rpdu = cpu_to_be64(pdu); | 881 | __be64 rpdu = cpu_to_be64(pdu); |
| 875 | 882 | ||
| 876 | __blk_add_trace(bt, bio->bi_iter.bi_sector, | 883 | __blk_add_trace(bt, bio->bi_iter.bi_sector, |
| 877 | bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, | 884 | bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw, |
| 878 | bio->bi_error, sizeof(rpdu), &rpdu); | 885 | BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), |
| 886 | &rpdu); | ||
| 879 | } | 887 | } |
| 880 | } | 888 | } |
| 881 | 889 | ||
| @@ -907,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore, | |||
| 907 | r.sector_from = cpu_to_be64(from); | 915 | r.sector_from = cpu_to_be64(from); |
| 908 | 916 | ||
| 909 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, | 917 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, |
| 910 | bio->bi_rw, BLK_TA_REMAP, bio->bi_error, | 918 | bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error, |
| 911 | sizeof(r), &r); | 919 | sizeof(r), &r); |
| 912 | } | 920 | } |
| 913 | 921 | ||
| @@ -940,7 +948,7 @@ static void blk_add_trace_rq_remap(void *ignore, | |||
| 940 | r.sector_from = cpu_to_be64(from); | 948 | r.sector_from = cpu_to_be64(from); |
| 941 | 949 | ||
| 942 | __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), | 950 | __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), |
| 943 | rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, | 951 | rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors, |
| 944 | sizeof(r), &r); | 952 | sizeof(r), &r); |
| 945 | } | 953 | } |
| 946 | 954 | ||
| @@ -965,10 +973,10 @@ void blk_add_driver_data(struct request_queue *q, | |||
| 965 | return; | 973 | return; |
| 966 | 974 | ||
| 967 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) | 975 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) |
| 968 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, | 976 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0, |
| 969 | BLK_TA_DRV_DATA, rq->errors, len, data); | 977 | BLK_TA_DRV_DATA, rq->errors, len, data); |
| 970 | else | 978 | else |
| 971 | __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, | 979 | __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0, |
| 972 | BLK_TA_DRV_DATA, rq->errors, len, data); | 980 | BLK_TA_DRV_DATA, rq->errors, len, data); |
| 973 | } | 981 | } |
| 974 | EXPORT_SYMBOL_GPL(blk_add_driver_data); | 982 | EXPORT_SYMBOL_GPL(blk_add_driver_data); |
| @@ -1769,21 +1777,34 @@ void blk_dump_cmd(char *buf, struct request *rq) | |||
| 1769 | } | 1777 | } |
| 1770 | } | 1778 | } |
| 1771 | 1779 | ||
| 1772 | void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) | 1780 | void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) |
| 1773 | { | 1781 | { |
| 1774 | int i = 0; | 1782 | int i = 0; |
| 1775 | 1783 | ||
| 1776 | if (rw & REQ_FLUSH) | 1784 | if (rw & REQ_PREFLUSH) |
| 1777 | rwbs[i++] = 'F'; | 1785 | rwbs[i++] = 'F'; |
| 1778 | 1786 | ||
| 1779 | if (rw & WRITE) | 1787 | switch (op) { |
| 1788 | case REQ_OP_WRITE: | ||
| 1789 | case REQ_OP_WRITE_SAME: | ||
| 1780 | rwbs[i++] = 'W'; | 1790 | rwbs[i++] = 'W'; |
| 1781 | else if (rw & REQ_DISCARD) | 1791 | break; |
| 1792 | case REQ_OP_DISCARD: | ||
| 1793 | rwbs[i++] = 'D'; | ||
| 1794 | break; | ||
| 1795 | case REQ_OP_SECURE_ERASE: | ||
| 1782 | rwbs[i++] = 'D'; | 1796 | rwbs[i++] = 'D'; |
| 1783 | else if (bytes) | 1797 | rwbs[i++] = 'E'; |
| 1798 | break; | ||
| 1799 | case REQ_OP_FLUSH: | ||
| 1800 | rwbs[i++] = 'F'; | ||
| 1801 | break; | ||
| 1802 | case REQ_OP_READ: | ||
| 1784 | rwbs[i++] = 'R'; | 1803 | rwbs[i++] = 'R'; |
| 1785 | else | 1804 | break; |
| 1805 | default: | ||
| 1786 | rwbs[i++] = 'N'; | 1806 | rwbs[i++] = 'N'; |
| 1807 | } | ||
| 1787 | 1808 | ||
| 1788 | if (rw & REQ_FUA) | 1809 | if (rw & REQ_FUA) |
| 1789 | rwbs[i++] = 'F'; | 1810 | rwbs[i++] = 'F'; |
| @@ -1793,8 +1814,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) | |||
| 1793 | rwbs[i++] = 'S'; | 1814 | rwbs[i++] = 'S'; |
| 1794 | if (rw & REQ_META) | 1815 | if (rw & REQ_META) |
| 1795 | rwbs[i++] = 'M'; | 1816 | rwbs[i++] = 'M'; |
| 1796 | if (rw & REQ_SECURE) | ||
| 1797 | rwbs[i++] = 'E'; | ||
| 1798 | 1817 | ||
| 1799 | rwbs[i] = '\0'; | 1818 | rwbs[i] = '\0'; |
| 1800 | } | 1819 | } |
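For readers following the blk_fill_rwbs() rework above, the op/flags split can be exercised in plain userspace C. This is a minimal sketch, not the kernel function: the REQ_OP_*/REQ_* names are replaced by local stand-in constants, and the unused bytes argument that the kernel version still carries is dropped. The point is the decode order: preflush flag first, then one letter chosen by the operation, then the FUA/SYNC/META modifiers.

    #include <stdio.h>

    /* Local stand-ins for the kernel op codes and request flags (assumed values). */
    enum op { OP_READ, OP_WRITE, OP_DISCARD, OP_SECURE_ERASE, OP_WRITE_SAME, OP_FLUSH };
    #define RQF_PREFLUSH (1u << 0)
    #define RQF_FUA      (1u << 1)
    #define RQF_SYNC     (1u << 2)
    #define RQF_META     (1u << 3)

    /* Mirrors the shape of blk_fill_rwbs() after the patch: the op selects the
     * main letter, the flags only add modifiers. */
    static void fill_rwbs(char *rwbs, enum op op, unsigned int rw)
    {
        int i = 0;

        if (rw & RQF_PREFLUSH)
            rwbs[i++] = 'F';

        switch (op) {
        case OP_WRITE:
        case OP_WRITE_SAME:
            rwbs[i++] = 'W';
            break;
        case OP_DISCARD:
            rwbs[i++] = 'D';
            break;
        case OP_SECURE_ERASE:
            rwbs[i++] = 'D';
            rwbs[i++] = 'E';
            break;
        case OP_FLUSH:
            rwbs[i++] = 'F';
            break;
        case OP_READ:
            rwbs[i++] = 'R';
            break;
        default:
            rwbs[i++] = 'N';
        }

        if (rw & RQF_FUA)
            rwbs[i++] = 'F';
        if (rw & RQF_SYNC)
            rwbs[i++] = 'S';
        if (rw & RQF_META)
            rwbs[i++] = 'M';
        rwbs[i] = '\0';
    }

    int main(void)
    {
        char rwbs[8];

        fill_rwbs(rwbs, OP_WRITE, RQF_SYNC | RQF_FUA);
        printf("%s\n", rwbs);           /* prints "WFS" */
        fill_rwbs(rwbs, OP_SECURE_ERASE, 0);
        printf("%s\n", rwbs);           /* prints "DE" */
        return 0;
    }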
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 26f603da7e26..b20438fdb029 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
| @@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = { | |||
| 81 | .arg3_type = ARG_ANYTHING, | 81 | .arg3_type = ARG_ANYTHING, |
| 82 | }; | 82 | }; |
| 83 | 83 | ||
| 84 | static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 85 | { | ||
| 86 | void *unsafe_ptr = (void *) (long) r1; | ||
| 87 | void *src = (void *) (long) r2; | ||
| 88 | int size = (int) r3; | ||
| 89 | |||
| 90 | /* | ||
| 91 | * Ensure we're in user context which is safe for the helper to | ||
| 92 | * run. This helper has no business in a kthread. | ||
| 93 | * | ||
| 94 | * access_ok() should prevent writing to non-user memory, but in | ||
| 95 | * some situations (nommu, temporary switch, etc) access_ok() does | ||
| 96 | * not provide enough validation, hence the check on KERNEL_DS. | ||
| 97 | */ | ||
| 98 | |||
| 99 | if (unlikely(in_interrupt() || | ||
| 100 | current->flags & (PF_KTHREAD | PF_EXITING))) | ||
| 101 | return -EPERM; | ||
| 102 | if (unlikely(segment_eq(get_fs(), KERNEL_DS))) | ||
| 103 | return -EPERM; | ||
| 104 | if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) | ||
| 105 | return -EPERM; | ||
| 106 | |||
| 107 | return probe_kernel_write(unsafe_ptr, src, size); | ||
| 108 | } | ||
| 109 | |||
| 110 | static const struct bpf_func_proto bpf_probe_write_user_proto = { | ||
| 111 | .func = bpf_probe_write_user, | ||
| 112 | .gpl_only = true, | ||
| 113 | .ret_type = RET_INTEGER, | ||
| 114 | .arg1_type = ARG_ANYTHING, | ||
| 115 | .arg2_type = ARG_PTR_TO_STACK, | ||
| 116 | .arg3_type = ARG_CONST_STACK_SIZE, | ||
| 117 | }; | ||
| 118 | |||
| 119 | static const struct bpf_func_proto *bpf_get_probe_write_proto(void) | ||
| 120 | { | ||
| 121 | pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", | ||
| 122 | current->comm, task_pid_nr(current)); | ||
| 123 | |||
| 124 | return &bpf_probe_write_user_proto; | ||
| 125 | } | ||
| 126 | |||
| 84 | /* | 127 | /* |
| 85 | * limited trace_printk() | 128 | * limited trace_printk() |
| 86 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed | 129 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed |
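The interesting part of bpf_probe_write_user() above is the order of its guards: context checks first, the KERNEL_DS backstop second, access_ok() last, and only then the actual write. A minimal userspace model of that decision ladder follows; the struct and field names are hypothetical stand-ins for the kernel predicates named in the comments, not real APIs.

    #include <stdio.h>
    #include <errno.h>
    #include <stdbool.h>

    /* Hypothetical snapshot of the calling context; in the kernel these come from
     * in_interrupt(), current->flags, get_fs() and access_ok(). */
    struct ctx {
        bool in_interrupt;
        bool is_kthread_or_exiting;
        bool addr_limit_is_kernel;      /* segment_eq(get_fs(), KERNEL_DS) */
        bool dst_range_is_user;         /* access_ok(VERIFY_WRITE, ...) */
    };

    /* Same decision ladder as the helper: refuse early, write only as a last step. */
    static int may_probe_write_user(const struct ctx *c)
    {
        if (c->in_interrupt || c->is_kthread_or_exiting)
            return -EPERM;              /* not a user-task context */
        if (c->addr_limit_is_kernel)
            return -EPERM;              /* access_ok() would be meaningless */
        if (!c->dst_range_is_user)
            return -EPERM;              /* destination is not a valid user range */
        return 0;                       /* the kernel would now do the write */
    }

    int main(void)
    {
        struct ctx ok  = { false, false, false, true };
        struct ctx kds = { false, false, true,  true };

        printf("%d %d\n", may_probe_write_user(&ok), may_probe_write_user(&kds));
        return 0;                       /* prints "0 -1" */
    }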
| @@ -188,31 +231,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) | |||
| 188 | return &bpf_trace_printk_proto; | 231 | return &bpf_trace_printk_proto; |
| 189 | } | 232 | } |
| 190 | 233 | ||
| 191 | static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) | 234 | static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) |
| 192 | { | 235 | { |
| 193 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | 236 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; |
| 194 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 237 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
| 238 | unsigned int cpu = smp_processor_id(); | ||
| 239 | u64 index = flags & BPF_F_INDEX_MASK; | ||
| 240 | struct bpf_event_entry *ee; | ||
| 195 | struct perf_event *event; | 241 | struct perf_event *event; |
| 196 | struct file *file; | ||
| 197 | 242 | ||
| 243 | if (unlikely(flags & ~(BPF_F_INDEX_MASK))) | ||
| 244 | return -EINVAL; | ||
| 245 | if (index == BPF_F_CURRENT_CPU) | ||
| 246 | index = cpu; | ||
| 198 | if (unlikely(index >= array->map.max_entries)) | 247 | if (unlikely(index >= array->map.max_entries)) |
| 199 | return -E2BIG; | 248 | return -E2BIG; |
| 200 | 249 | ||
| 201 | file = READ_ONCE(array->ptrs[index]); | 250 | ee = READ_ONCE(array->ptrs[index]); |
| 202 | if (unlikely(!file)) | 251 | if (!ee) |
| 203 | return -ENOENT; | 252 | return -ENOENT; |
| 204 | 253 | ||
| 205 | event = file->private_data; | 254 | event = ee->event; |
| 206 | |||
| 207 | /* make sure event is local and doesn't have pmu::count */ | ||
| 208 | if (event->oncpu != smp_processor_id() || | ||
| 209 | event->pmu->count) | ||
| 210 | return -EINVAL; | ||
| 211 | |||
| 212 | if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && | 255 | if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && |
| 213 | event->attr.type != PERF_TYPE_RAW)) | 256 | event->attr.type != PERF_TYPE_RAW)) |
| 214 | return -EINVAL; | 257 | return -EINVAL; |
| 215 | 258 | ||
| 259 | /* make sure event is local and doesn't have pmu::count */ | ||
| 260 | if (unlikely(event->oncpu != cpu || event->pmu->count)) | ||
| 261 | return -EINVAL; | ||
| 262 | |||
| 216 | /* | 263 | /* |
| 217 | * we don't know if the function is run successfully by the | 264 | * we don't know if the function is run successfully by the |
| 218 | * return value. It can be judged in other places, such as | 265 | * return value. It can be judged in other places, such as |
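Both bpf_perf_event_read() above and the output path below resolve the map slot the same way: the low 32 bits of the flags word pick an index, and BPF_F_CURRENT_CPU means "use this CPU's slot". A small sketch of that resolution, assuming the usual uapi values for the two constants (0xffffffffULL for both):

    #include <stdio.h>
    #include <stdint.h>
    #include <errno.h>

    #define BPF_F_INDEX_MASK    0xffffffffULL
    #define BPF_F_CURRENT_CPU   BPF_F_INDEX_MASK

    /* Returns the array slot to use, or a negative errno, mirroring the checks in
     * bpf_perf_event_read() and __bpf_perf_event_output(). */
    static int64_t resolve_event_index(uint64_t flags, unsigned int this_cpu,
                                       unsigned int max_entries)
    {
        uint64_t index = flags & BPF_F_INDEX_MASK;

        if (flags & ~BPF_F_INDEX_MASK)
            return -EINVAL;             /* unknown flag bits */
        if (index == BPF_F_CURRENT_CPU)
            index = this_cpu;
        if (index >= max_entries)
            return -E2BIG;
        return (int64_t)index;
    }

    int main(void)
    {
        printf("%lld\n", (long long)resolve_event_index(BPF_F_CURRENT_CPU, 3, 8)); /* 3 */
        printf("%lld\n", (long long)resolve_event_index(12, 3, 8));                /* -E2BIG */
        return 0;
    }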
| @@ -229,47 +276,58 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { | |||
| 229 | .arg2_type = ARG_ANYTHING, | 276 | .arg2_type = ARG_ANYTHING, |
| 230 | }; | 277 | }; |
| 231 | 278 | ||
| 232 | static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) | 279 | static __always_inline u64 |
| 280 | __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, | ||
| 281 | u64 flags, struct perf_raw_record *raw) | ||
| 233 | { | 282 | { |
| 234 | struct pt_regs *regs = (struct pt_regs *) (long) r1; | ||
| 235 | struct bpf_map *map = (struct bpf_map *) (long) r2; | ||
| 236 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 283 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
| 284 | unsigned int cpu = smp_processor_id(); | ||
| 237 | u64 index = flags & BPF_F_INDEX_MASK; | 285 | u64 index = flags & BPF_F_INDEX_MASK; |
| 238 | void *data = (void *) (long) r4; | ||
| 239 | struct perf_sample_data sample_data; | 286 | struct perf_sample_data sample_data; |
| 287 | struct bpf_event_entry *ee; | ||
| 240 | struct perf_event *event; | 288 | struct perf_event *event; |
| 241 | struct file *file; | ||
| 242 | struct perf_raw_record raw = { | ||
| 243 | .size = size, | ||
| 244 | .data = data, | ||
| 245 | }; | ||
| 246 | 289 | ||
| 247 | if (unlikely(flags & ~(BPF_F_INDEX_MASK))) | ||
| 248 | return -EINVAL; | ||
| 249 | if (index == BPF_F_CURRENT_CPU) | 290 | if (index == BPF_F_CURRENT_CPU) |
| 250 | index = raw_smp_processor_id(); | 291 | index = cpu; |
| 251 | if (unlikely(index >= array->map.max_entries)) | 292 | if (unlikely(index >= array->map.max_entries)) |
| 252 | return -E2BIG; | 293 | return -E2BIG; |
| 253 | 294 | ||
| 254 | file = READ_ONCE(array->ptrs[index]); | 295 | ee = READ_ONCE(array->ptrs[index]); |
| 255 | if (unlikely(!file)) | 296 | if (!ee) |
| 256 | return -ENOENT; | 297 | return -ENOENT; |
| 257 | 298 | ||
| 258 | event = file->private_data; | 299 | event = ee->event; |
| 259 | |||
| 260 | if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || | 300 | if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || |
| 261 | event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) | 301 | event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) |
| 262 | return -EINVAL; | 302 | return -EINVAL; |
| 263 | 303 | ||
| 264 | if (unlikely(event->oncpu != smp_processor_id())) | 304 | if (unlikely(event->oncpu != cpu)) |
| 265 | return -EOPNOTSUPP; | 305 | return -EOPNOTSUPP; |
| 266 | 306 | ||
| 267 | perf_sample_data_init(&sample_data, 0, 0); | 307 | perf_sample_data_init(&sample_data, 0, 0); |
| 268 | sample_data.raw = &raw; | 308 | sample_data.raw = raw; |
| 269 | perf_event_output(event, &sample_data, regs); | 309 | perf_event_output(event, &sample_data, regs); |
| 270 | return 0; | 310 | return 0; |
| 271 | } | 311 | } |
| 272 | 312 | ||
| 313 | static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) | ||
| 314 | { | ||
| 315 | struct pt_regs *regs = (struct pt_regs *)(long) r1; | ||
| 316 | struct bpf_map *map = (struct bpf_map *)(long) r2; | ||
| 317 | void *data = (void *)(long) r4; | ||
| 318 | struct perf_raw_record raw = { | ||
| 319 | .frag = { | ||
| 320 | .size = size, | ||
| 321 | .data = data, | ||
| 322 | }, | ||
| 323 | }; | ||
| 324 | |||
| 325 | if (unlikely(flags & ~(BPF_F_INDEX_MASK))) | ||
| 326 | return -EINVAL; | ||
| 327 | |||
| 328 | return __bpf_perf_event_output(regs, map, flags, &raw); | ||
| 329 | } | ||
| 330 | |||
| 273 | static const struct bpf_func_proto bpf_perf_event_output_proto = { | 331 | static const struct bpf_func_proto bpf_perf_event_output_proto = { |
| 274 | .func = bpf_perf_event_output, | 332 | .func = bpf_perf_event_output, |
| 275 | .gpl_only = true, | 333 | .gpl_only = true, |
| @@ -283,31 +341,41 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { | |||
| 283 | 341 | ||
| 284 | static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); | 342 | static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); |
| 285 | 343 | ||
| 286 | static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) | 344 | u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, |
| 345 | void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) | ||
| 287 | { | 346 | { |
| 288 | struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); | 347 | struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); |
| 348 | struct perf_raw_frag frag = { | ||
| 349 | .copy = ctx_copy, | ||
| 350 | .size = ctx_size, | ||
| 351 | .data = ctx, | ||
| 352 | }; | ||
| 353 | struct perf_raw_record raw = { | ||
| 354 | .frag = { | ||
| 355 | { | ||
| 356 | .next = ctx_size ? &frag : NULL, | ||
| 357 | }, | ||
| 358 | .size = meta_size, | ||
| 359 | .data = meta, | ||
| 360 | }, | ||
| 361 | }; | ||
| 289 | 362 | ||
| 290 | perf_fetch_caller_regs(regs); | 363 | perf_fetch_caller_regs(regs); |
| 291 | 364 | ||
| 292 | return bpf_perf_event_output((long)regs, r2, flags, r4, size); | 365 | return __bpf_perf_event_output(regs, map, flags, &raw); |
| 366 | } | ||
| 367 | |||
| 368 | static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 369 | { | ||
| 370 | return (long) current; | ||
| 293 | } | 371 | } |
| 294 | 372 | ||
| 295 | static const struct bpf_func_proto bpf_event_output_proto = { | 373 | static const struct bpf_func_proto bpf_get_current_task_proto = { |
| 296 | .func = bpf_event_output, | 374 | .func = bpf_get_current_task, |
| 297 | .gpl_only = true, | 375 | .gpl_only = true, |
| 298 | .ret_type = RET_INTEGER, | 376 | .ret_type = RET_INTEGER, |
| 299 | .arg1_type = ARG_PTR_TO_CTX, | ||
| 300 | .arg2_type = ARG_CONST_MAP_PTR, | ||
| 301 | .arg3_type = ARG_ANYTHING, | ||
| 302 | .arg4_type = ARG_PTR_TO_STACK, | ||
| 303 | .arg5_type = ARG_CONST_STACK_SIZE, | ||
| 304 | }; | 377 | }; |
| 305 | 378 | ||
| 306 | const struct bpf_func_proto *bpf_get_event_output_proto(void) | ||
| 307 | { | ||
| 308 | return &bpf_event_output_proto; | ||
| 309 | } | ||
| 310 | |||
| 311 | static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) | 379 | static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) |
| 312 | { | 380 | { |
| 313 | switch (func_id) { | 381 | switch (func_id) { |
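The refactor above replaces the flat perf_raw_record with a fragment chain: bpf_event_output() always emits a metadata fragment and, when ctx_size is non-zero, links a second fragment whose payload is produced by ctx_copy. A userspace sketch of that two-fragment layout; the struct below is a simplified stand-in for perf_raw_frag, not its real definition.

    #include <stdio.h>
    #include <stddef.h>
    #include <string.h>

    /* Simplified stand-in for perf_raw_frag: a singly linked list of buffers. */
    struct frag {
        struct frag *next;
        size_t size;
        const void *data;
    };

    static size_t total_size(const struct frag *f)
    {
        size_t sum = 0;

        for (; f; f = f->next)
            sum += f->size;
        return sum;
    }

    int main(void)
    {
        char meta[16] = "meta";
        char ctx[64]  = "bigger-context";
        size_t ctx_size = strlen(ctx);

        /* Optional second fragment, linked only when there is context data,
         * just like the .next = ctx_size ? &frag : NULL in the hunk above. */
        struct frag ctx_frag  = { .next = NULL, .size = ctx_size, .data = ctx };
        struct frag meta_frag = {
            .next = ctx_size ? &ctx_frag : NULL,
            .size = strlen(meta),
            .data = meta,
        };

        printf("record size = %zu\n", total_size(&meta_frag));
        return 0;
    }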
| @@ -325,6 +393,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) | |||
| 325 | return &bpf_tail_call_proto; | 393 | return &bpf_tail_call_proto; |
| 326 | case BPF_FUNC_get_current_pid_tgid: | 394 | case BPF_FUNC_get_current_pid_tgid: |
| 327 | return &bpf_get_current_pid_tgid_proto; | 395 | return &bpf_get_current_pid_tgid_proto; |
| 396 | case BPF_FUNC_get_current_task: | ||
| 397 | return &bpf_get_current_task_proto; | ||
| 328 | case BPF_FUNC_get_current_uid_gid: | 398 | case BPF_FUNC_get_current_uid_gid: |
| 329 | return &bpf_get_current_uid_gid_proto; | 399 | return &bpf_get_current_uid_gid_proto; |
| 330 | case BPF_FUNC_get_current_comm: | 400 | case BPF_FUNC_get_current_comm: |
| @@ -335,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) | |||
| 335 | return &bpf_get_smp_processor_id_proto; | 405 | return &bpf_get_smp_processor_id_proto; |
| 336 | case BPF_FUNC_perf_event_read: | 406 | case BPF_FUNC_perf_event_read: |
| 337 | return &bpf_perf_event_read_proto; | 407 | return &bpf_perf_event_read_proto; |
| 408 | case BPF_FUNC_probe_write_user: | ||
| 409 | return bpf_get_probe_write_proto(); | ||
| 338 | default: | 410 | default: |
| 339 | return NULL; | 411 | return NULL; |
| 340 | } | 412 | } |
| @@ -356,18 +428,12 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func | |||
| 356 | static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, | 428 | static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, |
| 357 | enum bpf_reg_type *reg_type) | 429 | enum bpf_reg_type *reg_type) |
| 358 | { | 430 | { |
| 359 | /* check bounds */ | ||
| 360 | if (off < 0 || off >= sizeof(struct pt_regs)) | 431 | if (off < 0 || off >= sizeof(struct pt_regs)) |
| 361 | return false; | 432 | return false; |
| 362 | |||
| 363 | /* only read is allowed */ | ||
| 364 | if (type != BPF_READ) | 433 | if (type != BPF_READ) |
| 365 | return false; | 434 | return false; |
| 366 | |||
| 367 | /* disallow misaligned access */ | ||
| 368 | if (off % size != 0) | 435 | if (off % size != 0) |
| 369 | return false; | 436 | return false; |
| 370 | |||
| 371 | return true; | 437 | return true; |
| 372 | } | 438 | } |
| 373 | 439 | ||
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 900dbb1efff2..84752c8e28b5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; | |||
| 89 | /* What to set function_trace_op to */ | 89 | /* What to set function_trace_op to */ |
| 90 | static struct ftrace_ops *set_function_trace_op; | 90 | static struct ftrace_ops *set_function_trace_op; |
| 91 | 91 | ||
| 92 | /* List for set_ftrace_pid's pids. */ | 92 | static bool ftrace_pids_enabled(struct ftrace_ops *ops) |
| 93 | LIST_HEAD(ftrace_pids); | ||
| 94 | struct ftrace_pid { | ||
| 95 | struct list_head list; | ||
| 96 | struct pid *pid; | ||
| 97 | }; | ||
| 98 | |||
| 99 | static bool ftrace_pids_enabled(void) | ||
| 100 | { | 93 | { |
| 101 | return !list_empty(&ftrace_pids); | 94 | struct trace_array *tr; |
| 95 | |||
| 96 | if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private) | ||
| 97 | return false; | ||
| 98 | |||
| 99 | tr = ops->private; | ||
| 100 | |||
| 101 | return tr->function_pids != NULL; | ||
| 102 | } | 102 | } |
| 103 | 103 | ||
| 104 | static void ftrace_update_trampoline(struct ftrace_ops *ops); | 104 | static void ftrace_update_trampoline(struct ftrace_ops *ops); |
| @@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void) | |||
| 179 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, | 179 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, |
| 180 | struct ftrace_ops *op, struct pt_regs *regs) | 180 | struct ftrace_ops *op, struct pt_regs *regs) |
| 181 | { | 181 | { |
| 182 | if (!test_tsk_trace_trace(current)) | 182 | struct trace_array *tr = op->private; |
| 183 | |||
| 184 | if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid)) | ||
| 183 | return; | 185 | return; |
| 184 | 186 | ||
| 185 | op->saved_func(ip, parent_ip, op, regs); | 187 | op->saved_func(ip, parent_ip, op, regs); |
| @@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
| 417 | /* Always save the function, and reset at unregistering */ | 419 | /* Always save the function, and reset at unregistering */ |
| 418 | ops->saved_func = ops->func; | 420 | ops->saved_func = ops->func; |
| 419 | 421 | ||
| 420 | if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled()) | 422 | if (ftrace_pids_enabled(ops)) |
| 421 | ops->func = ftrace_pid_func; | 423 | ops->func = ftrace_pid_func; |
| 422 | 424 | ||
| 423 | ftrace_update_trampoline(ops); | 425 | ftrace_update_trampoline(ops); |
| @@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
| 450 | 452 | ||
| 451 | static void ftrace_update_pid_func(void) | 453 | static void ftrace_update_pid_func(void) |
| 452 | { | 454 | { |
| 453 | bool enabled = ftrace_pids_enabled(); | ||
| 454 | struct ftrace_ops *op; | 455 | struct ftrace_ops *op; |
| 455 | 456 | ||
| 456 | /* Only do something if we are tracing something */ | 457 | /* Only do something if we are tracing something */ |
| @@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void) | |||
| 459 | 460 | ||
| 460 | do_for_each_ftrace_op(op, ftrace_ops_list) { | 461 | do_for_each_ftrace_op(op, ftrace_ops_list) { |
| 461 | if (op->flags & FTRACE_OPS_FL_PID) { | 462 | if (op->flags & FTRACE_OPS_FL_PID) { |
| 462 | op->func = enabled ? ftrace_pid_func : | 463 | op->func = ftrace_pids_enabled(op) ? |
| 463 | op->saved_func; | 464 | ftrace_pid_func : op->saved_func; |
| 464 | ftrace_update_trampoline(op); | 465 | ftrace_update_trampoline(op); |
| 465 | } | 466 | } |
| 466 | } while_for_each_ftrace_op(op); | 467 | } while_for_each_ftrace_op(op); |
| @@ -5324,179 +5325,99 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) | |||
| 5324 | return ops->func; | 5325 | return ops->func; |
| 5325 | } | 5326 | } |
| 5326 | 5327 | ||
| 5327 | static void clear_ftrace_swapper(void) | 5328 | static void |
| 5329 | ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, | ||
| 5330 | struct task_struct *prev, struct task_struct *next) | ||
| 5328 | { | 5331 | { |
| 5329 | struct task_struct *p; | 5332 | struct trace_array *tr = data; |
| 5330 | int cpu; | 5333 | struct trace_pid_list *pid_list; |
| 5331 | 5334 | ||
| 5332 | get_online_cpus(); | 5335 | pid_list = rcu_dereference_sched(tr->function_pids); |
| 5333 | for_each_online_cpu(cpu) { | ||
| 5334 | p = idle_task(cpu); | ||
| 5335 | clear_tsk_trace_trace(p); | ||
| 5336 | } | ||
| 5337 | put_online_cpus(); | ||
| 5338 | } | ||
| 5339 | |||
| 5340 | static void set_ftrace_swapper(void) | ||
| 5341 | { | ||
| 5342 | struct task_struct *p; | ||
| 5343 | int cpu; | ||
| 5344 | 5336 | ||
| 5345 | get_online_cpus(); | 5337 | this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, |
| 5346 | for_each_online_cpu(cpu) { | 5338 | trace_ignore_this_task(pid_list, next)); |
| 5347 | p = idle_task(cpu); | ||
| 5348 | set_tsk_trace_trace(p); | ||
| 5349 | } | ||
| 5350 | put_online_cpus(); | ||
| 5351 | } | 5339 | } |
| 5352 | 5340 | ||
| 5353 | static void clear_ftrace_pid(struct pid *pid) | 5341 | static void clear_ftrace_pids(struct trace_array *tr) |
| 5354 | { | 5342 | { |
| 5355 | struct task_struct *p; | 5343 | struct trace_pid_list *pid_list; |
| 5344 | int cpu; | ||
| 5356 | 5345 | ||
| 5357 | rcu_read_lock(); | 5346 | pid_list = rcu_dereference_protected(tr->function_pids, |
| 5358 | do_each_pid_task(pid, PIDTYPE_PID, p) { | 5347 | lockdep_is_held(&ftrace_lock)); |
| 5359 | clear_tsk_trace_trace(p); | 5348 | if (!pid_list) |
| 5360 | } while_each_pid_task(pid, PIDTYPE_PID, p); | 5349 | return; |
| 5361 | rcu_read_unlock(); | ||
| 5362 | 5350 | ||
| 5363 | put_pid(pid); | 5351 | unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); |
| 5364 | } | ||
| 5365 | 5352 | ||
| 5366 | static void set_ftrace_pid(struct pid *pid) | 5353 | for_each_possible_cpu(cpu) |
| 5367 | { | 5354 | per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false; |
| 5368 | struct task_struct *p; | ||
| 5369 | 5355 | ||
| 5370 | rcu_read_lock(); | 5356 | rcu_assign_pointer(tr->function_pids, NULL); |
| 5371 | do_each_pid_task(pid, PIDTYPE_PID, p) { | ||
| 5372 | set_tsk_trace_trace(p); | ||
| 5373 | } while_each_pid_task(pid, PIDTYPE_PID, p); | ||
| 5374 | rcu_read_unlock(); | ||
| 5375 | } | ||
| 5376 | 5357 | ||
| 5377 | static void clear_ftrace_pid_task(struct pid *pid) | 5358 | /* Wait till all users are no longer using pid filtering */ |
| 5378 | { | 5359 | synchronize_sched(); |
| 5379 | if (pid == ftrace_swapper_pid) | ||
| 5380 | clear_ftrace_swapper(); | ||
| 5381 | else | ||
| 5382 | clear_ftrace_pid(pid); | ||
| 5383 | } | ||
| 5384 | 5360 | ||
| 5385 | static void set_ftrace_pid_task(struct pid *pid) | 5361 | trace_free_pid_list(pid_list); |
| 5386 | { | ||
| 5387 | if (pid == ftrace_swapper_pid) | ||
| 5388 | set_ftrace_swapper(); | ||
| 5389 | else | ||
| 5390 | set_ftrace_pid(pid); | ||
| 5391 | } | 5362 | } |
| 5392 | 5363 | ||
| 5393 | static int ftrace_pid_add(int p) | 5364 | static void ftrace_pid_reset(struct trace_array *tr) |
| 5394 | { | 5365 | { |
| 5395 | struct pid *pid; | ||
| 5396 | struct ftrace_pid *fpid; | ||
| 5397 | int ret = -EINVAL; | ||
| 5398 | |||
| 5399 | mutex_lock(&ftrace_lock); | 5366 | mutex_lock(&ftrace_lock); |
| 5400 | 5367 | clear_ftrace_pids(tr); | |
| 5401 | if (!p) | ||
| 5402 | pid = ftrace_swapper_pid; | ||
| 5403 | else | ||
| 5404 | pid = find_get_pid(p); | ||
| 5405 | |||
| 5406 | if (!pid) | ||
| 5407 | goto out; | ||
| 5408 | |||
| 5409 | ret = 0; | ||
| 5410 | |||
| 5411 | list_for_each_entry(fpid, &ftrace_pids, list) | ||
| 5412 | if (fpid->pid == pid) | ||
| 5413 | goto out_put; | ||
| 5414 | |||
| 5415 | ret = -ENOMEM; | ||
| 5416 | |||
| 5417 | fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); | ||
| 5418 | if (!fpid) | ||
| 5419 | goto out_put; | ||
| 5420 | |||
| 5421 | list_add(&fpid->list, &ftrace_pids); | ||
| 5422 | fpid->pid = pid; | ||
| 5423 | |||
| 5424 | set_ftrace_pid_task(pid); | ||
| 5425 | 5368 | ||
| 5426 | ftrace_update_pid_func(); | 5369 | ftrace_update_pid_func(); |
| 5427 | |||
| 5428 | ftrace_startup_all(0); | 5370 | ftrace_startup_all(0); |
| 5429 | 5371 | ||
| 5430 | mutex_unlock(&ftrace_lock); | 5372 | mutex_unlock(&ftrace_lock); |
| 5431 | return 0; | ||
| 5432 | |||
| 5433 | out_put: | ||
| 5434 | if (pid != ftrace_swapper_pid) | ||
| 5435 | put_pid(pid); | ||
| 5436 | |||
| 5437 | out: | ||
| 5438 | mutex_unlock(&ftrace_lock); | ||
| 5439 | return ret; | ||
| 5440 | } | 5373 | } |
| 5441 | 5374 | ||
| 5442 | static void ftrace_pid_reset(void) | 5375 | /* Greater than any max PID */ |
| 5443 | { | 5376 | #define FTRACE_NO_PIDS (void *)(PID_MAX_LIMIT + 1) |
| 5444 | struct ftrace_pid *fpid, *safe; | ||
| 5445 | |||
| 5446 | mutex_lock(&ftrace_lock); | ||
| 5447 | list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { | ||
| 5448 | struct pid *pid = fpid->pid; | ||
| 5449 | |||
| 5450 | clear_ftrace_pid_task(pid); | ||
| 5451 | |||
| 5452 | list_del(&fpid->list); | ||
| 5453 | kfree(fpid); | ||
| 5454 | } | ||
| 5455 | |||
| 5456 | ftrace_update_pid_func(); | ||
| 5457 | ftrace_startup_all(0); | ||
| 5458 | |||
| 5459 | mutex_unlock(&ftrace_lock); | ||
| 5460 | } | ||
| 5461 | 5377 | ||
| 5462 | static void *fpid_start(struct seq_file *m, loff_t *pos) | 5378 | static void *fpid_start(struct seq_file *m, loff_t *pos) |
| 5379 | __acquires(RCU) | ||
| 5463 | { | 5380 | { |
| 5381 | struct trace_pid_list *pid_list; | ||
| 5382 | struct trace_array *tr = m->private; | ||
| 5383 | |||
| 5464 | mutex_lock(&ftrace_lock); | 5384 | mutex_lock(&ftrace_lock); |
| 5385 | rcu_read_lock_sched(); | ||
| 5465 | 5386 | ||
| 5466 | if (!ftrace_pids_enabled() && (!*pos)) | 5387 | pid_list = rcu_dereference_sched(tr->function_pids); |
| 5467 | return (void *) 1; | ||
| 5468 | 5388 | ||
| 5469 | return seq_list_start(&ftrace_pids, *pos); | 5389 | if (!pid_list) |
| 5390 | return !(*pos) ? FTRACE_NO_PIDS : NULL; | ||
| 5391 | |||
| 5392 | return trace_pid_start(pid_list, pos); | ||
| 5470 | } | 5393 | } |
| 5471 | 5394 | ||
| 5472 | static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) | 5395 | static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) |
| 5473 | { | 5396 | { |
| 5474 | if (v == (void *)1) | 5397 | struct trace_array *tr = m->private; |
| 5398 | struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids); | ||
| 5399 | |||
| 5400 | if (v == FTRACE_NO_PIDS) | ||
| 5475 | return NULL; | 5401 | return NULL; |
| 5476 | 5402 | ||
| 5477 | return seq_list_next(v, &ftrace_pids, pos); | 5403 | return trace_pid_next(pid_list, v, pos); |
| 5478 | } | 5404 | } |
| 5479 | 5405 | ||
| 5480 | static void fpid_stop(struct seq_file *m, void *p) | 5406 | static void fpid_stop(struct seq_file *m, void *p) |
| 5407 | __releases(RCU) | ||
| 5481 | { | 5408 | { |
| 5409 | rcu_read_unlock_sched(); | ||
| 5482 | mutex_unlock(&ftrace_lock); | 5410 | mutex_unlock(&ftrace_lock); |
| 5483 | } | 5411 | } |
| 5484 | 5412 | ||
| 5485 | static int fpid_show(struct seq_file *m, void *v) | 5413 | static int fpid_show(struct seq_file *m, void *v) |
| 5486 | { | 5414 | { |
| 5487 | const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); | 5415 | if (v == FTRACE_NO_PIDS) { |
| 5488 | |||
| 5489 | if (v == (void *)1) { | ||
| 5490 | seq_puts(m, "no pid\n"); | 5416 | seq_puts(m, "no pid\n"); |
| 5491 | return 0; | 5417 | return 0; |
| 5492 | } | 5418 | } |
| 5493 | 5419 | ||
| 5494 | if (fpid->pid == ftrace_swapper_pid) | 5420 | return trace_pid_show(m, v); |
| 5495 | seq_puts(m, "swapper tasks\n"); | ||
| 5496 | else | ||
| 5497 | seq_printf(m, "%u\n", pid_vnr(fpid->pid)); | ||
| 5498 | |||
| 5499 | return 0; | ||
| 5500 | } | 5421 | } |
| 5501 | 5422 | ||
| 5502 | static const struct seq_operations ftrace_pid_sops = { | 5423 | static const struct seq_operations ftrace_pid_sops = { |
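The pid filter above no longer marks individual task_structs. Instead, a sched_switch probe recomputes a per-CPU "ignore" flag for whichever task is about to run, so ftrace_pid_func() only has to test one cached bit. A single-threaded userspace model of that idea; the bitmap and per-CPU array are plain stand-ins with assumed sizes.

    #include <stdio.h>
    #include <stdbool.h>

    #define PID_MAX  4096
    #define NR_CPUS  4

    static unsigned long filtered_pids[PID_MAX / (8 * sizeof(unsigned long))];
    static bool ftrace_ignore_pid[NR_CPUS];

    static bool pid_is_filtered(int pid)
    {
        return filtered_pids[pid / (8 * sizeof(unsigned long))] &
               (1UL << (pid % (8 * sizeof(unsigned long))));
    }

    /* Models ftrace_filter_pid_sched_switch_probe(): on every context switch,
     * cache whether the incoming task should be ignored. */
    static void on_sched_switch(int cpu, int next_pid)
    {
        ftrace_ignore_pid[cpu] = !pid_is_filtered(next_pid);
    }

    /* Models ftrace_pid_func(): the hot path only reads the cached per-CPU bit. */
    static void function_hook(int cpu, const char *func)
    {
        if (ftrace_ignore_pid[cpu])
            return;
        printf("traced %s on cpu %d\n", func, cpu);
    }

    int main(void)
    {
        int pid = 42;

        filtered_pids[pid / (8 * sizeof(unsigned long))] |=
            1UL << (pid % (8 * sizeof(unsigned long)));

        on_sched_switch(0, 42);         /* filtered pid -> traced */
        function_hook(0, "vfs_read");
        on_sched_switch(0, 99);         /* not in the list -> ignored */
        function_hook(0, "vfs_read");
        return 0;
    }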
| @@ -5509,58 +5430,103 @@ static const struct seq_operations ftrace_pid_sops = { | |||
| 5509 | static int | 5430 | static int |
| 5510 | ftrace_pid_open(struct inode *inode, struct file *file) | 5431 | ftrace_pid_open(struct inode *inode, struct file *file) |
| 5511 | { | 5432 | { |
| 5433 | struct trace_array *tr = inode->i_private; | ||
| 5434 | struct seq_file *m; | ||
| 5512 | int ret = 0; | 5435 | int ret = 0; |
| 5513 | 5436 | ||
| 5437 | if (trace_array_get(tr) < 0) | ||
| 5438 | return -ENODEV; | ||
| 5439 | |||
| 5514 | if ((file->f_mode & FMODE_WRITE) && | 5440 | if ((file->f_mode & FMODE_WRITE) && |
| 5515 | (file->f_flags & O_TRUNC)) | 5441 | (file->f_flags & O_TRUNC)) |
| 5516 | ftrace_pid_reset(); | 5442 | ftrace_pid_reset(tr); |
| 5517 | 5443 | ||
| 5518 | if (file->f_mode & FMODE_READ) | 5444 | ret = seq_open(file, &ftrace_pid_sops); |
| 5519 | ret = seq_open(file, &ftrace_pid_sops); | 5445 | if (ret < 0) { |
| 5446 | trace_array_put(tr); | ||
| 5447 | } else { | ||
| 5448 | m = file->private_data; | ||
| 5449 | /* copy tr over to seq ops */ | ||
| 5450 | m->private = tr; | ||
| 5451 | } | ||
| 5520 | 5452 | ||
| 5521 | return ret; | 5453 | return ret; |
| 5522 | } | 5454 | } |
| 5523 | 5455 | ||
| 5456 | static void ignore_task_cpu(void *data) | ||
| 5457 | { | ||
| 5458 | struct trace_array *tr = data; | ||
| 5459 | struct trace_pid_list *pid_list; | ||
| 5460 | |||
| 5461 | /* | ||
| 5462 | * This function is called by on_each_cpu() while the | ||
| 5463 | * ftrace_lock mutex is held. | ||
| 5464 | */ | ||
| 5465 | pid_list = rcu_dereference_protected(tr->function_pids, | ||
| 5466 | mutex_is_locked(&ftrace_lock)); | ||
| 5467 | |||
| 5468 | this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, | ||
| 5469 | trace_ignore_this_task(pid_list, current)); | ||
| 5470 | } | ||
| 5471 | |||
| 5524 | static ssize_t | 5472 | static ssize_t |
| 5525 | ftrace_pid_write(struct file *filp, const char __user *ubuf, | 5473 | ftrace_pid_write(struct file *filp, const char __user *ubuf, |
| 5526 | size_t cnt, loff_t *ppos) | 5474 | size_t cnt, loff_t *ppos) |
| 5527 | { | 5475 | { |
| 5528 | char buf[64], *tmp; | 5476 | struct seq_file *m = filp->private_data; |
| 5529 | long val; | 5477 | struct trace_array *tr = m->private; |
| 5530 | int ret; | 5478 | struct trace_pid_list *filtered_pids = NULL; |
| 5479 | struct trace_pid_list *pid_list; | ||
| 5480 | ssize_t ret; | ||
| 5531 | 5481 | ||
| 5532 | if (cnt >= sizeof(buf)) | 5482 | if (!cnt) |
| 5533 | return -EINVAL; | 5483 | return 0; |
| 5484 | |||
| 5485 | mutex_lock(&ftrace_lock); | ||
| 5486 | |||
| 5487 | filtered_pids = rcu_dereference_protected(tr->function_pids, | ||
| 5488 | lockdep_is_held(&ftrace_lock)); | ||
| 5489 | |||
| 5490 | ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); | ||
| 5491 | if (ret < 0) | ||
| 5492 | goto out; | ||
| 5534 | 5493 | ||
| 5535 | if (copy_from_user(&buf, ubuf, cnt)) | 5494 | rcu_assign_pointer(tr->function_pids, pid_list); |
| 5536 | return -EFAULT; | ||
| 5537 | 5495 | ||
| 5538 | buf[cnt] = 0; | 5496 | if (filtered_pids) { |
| 5497 | synchronize_sched(); | ||
| 5498 | trace_free_pid_list(filtered_pids); | ||
| 5499 | } else if (pid_list) { | ||
| 5500 | /* Register a probe to set whether to ignore the tracing of a task */ | ||
| 5501 | register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); | ||
| 5502 | } | ||
| 5539 | 5503 | ||
| 5540 | /* | 5504 | /* |
| 5541 | * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" | 5505 | * Ignoring of pids is done at task switch. But we have to |
| 5542 | * to clean the filter quietly. | 5506 | * check for those tasks that are currently running. |
| 5507 | * Always do this in case a pid was appended or removed. | ||
| 5543 | */ | 5508 | */ |
| 5544 | tmp = strstrip(buf); | 5509 | on_each_cpu(ignore_task_cpu, tr, 1); |
| 5545 | if (strlen(tmp) == 0) | ||
| 5546 | return 1; | ||
| 5547 | 5510 | ||
| 5548 | ret = kstrtol(tmp, 10, &val); | 5511 | ftrace_update_pid_func(); |
| 5549 | if (ret < 0) | 5512 | ftrace_startup_all(0); |
| 5550 | return ret; | 5513 | out: |
| 5514 | mutex_unlock(&ftrace_lock); | ||
| 5551 | 5515 | ||
| 5552 | ret = ftrace_pid_add(val); | 5516 | if (ret > 0) |
| 5517 | *ppos += ret; | ||
| 5553 | 5518 | ||
| 5554 | return ret ? ret : cnt; | 5519 | return ret; |
| 5555 | } | 5520 | } |
| 5556 | 5521 | ||
| 5557 | static int | 5522 | static int |
| 5558 | ftrace_pid_release(struct inode *inode, struct file *file) | 5523 | ftrace_pid_release(struct inode *inode, struct file *file) |
| 5559 | { | 5524 | { |
| 5560 | if (file->f_mode & FMODE_READ) | 5525 | struct trace_array *tr = inode->i_private; |
| 5561 | seq_release(inode, file); | ||
| 5562 | 5526 | ||
| 5563 | return 0; | 5527 | trace_array_put(tr); |
| 5528 | |||
| 5529 | return seq_release(inode, file); | ||
| 5564 | } | 5530 | } |
| 5565 | 5531 | ||
| 5566 | static const struct file_operations ftrace_pid_fops = { | 5532 | static const struct file_operations ftrace_pid_fops = { |
| @@ -5571,24 +5537,21 @@ static const struct file_operations ftrace_pid_fops = { | |||
| 5571 | .release = ftrace_pid_release, | 5537 | .release = ftrace_pid_release, |
| 5572 | }; | 5538 | }; |
| 5573 | 5539 | ||
| 5574 | static __init int ftrace_init_tracefs(void) | 5540 | void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) |
| 5575 | { | 5541 | { |
| 5576 | struct dentry *d_tracer; | 5542 | trace_create_file("set_ftrace_pid", 0644, d_tracer, |
| 5543 | tr, &ftrace_pid_fops); | ||
| 5544 | } | ||
| 5577 | 5545 | ||
| 5578 | d_tracer = tracing_init_dentry(); | 5546 | void __init ftrace_init_tracefs_toplevel(struct trace_array *tr, |
| 5579 | if (IS_ERR(d_tracer)) | 5547 | struct dentry *d_tracer) |
| 5580 | return 0; | 5548 | { |
| 5549 | /* Only the top level directory has the dyn_tracefs and profile */ | ||
| 5550 | WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); | ||
| 5581 | 5551 | ||
| 5582 | ftrace_init_dyn_tracefs(d_tracer); | 5552 | ftrace_init_dyn_tracefs(d_tracer); |
| 5583 | |||
| 5584 | trace_create_file("set_ftrace_pid", 0644, d_tracer, | ||
| 5585 | NULL, &ftrace_pid_fops); | ||
| 5586 | |||
| 5587 | ftrace_profile_tracefs(d_tracer); | 5553 | ftrace_profile_tracefs(d_tracer); |
| 5588 | |||
| 5589 | return 0; | ||
| 5590 | } | 5554 | } |
| 5591 | fs_initcall(ftrace_init_tracefs); | ||
| 5592 | 5555 | ||
| 5593 | /** | 5556 | /** |
| 5594 | * ftrace_kill - kill ftrace | 5557 | * ftrace_kill - kill ftrace |
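The rewritten set_ftrace_pid write path above is all-or-nothing: parse the user buffer into a brand-new pid list, publish it, and free the old list only after a grace period. The sketch below models just the swap-and-free pattern; the RCU publish and synchronize steps cannot be reproduced in plain userspace C and are only marked by comments, and the fixed-size list is an assumption made for brevity.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct pid_list {
        int nr;
        int pids[64];
    };

    static struct pid_list *function_pids;  /* would be an __rcu pointer in the kernel */

    /* Build a fresh list from scratch; never modify the published one in place. */
    static int pid_list_write(const char *buf)
    {
        struct pid_list *new_list = calloc(1, sizeof(*new_list));
        struct pid_list *old_list = function_pids;
        char *copy, *tok, *save;

        if (!new_list)
            return -1;

        copy = strdup(buf);
        if (!copy) {
            free(new_list);
            return -1;
        }
        for (tok = strtok_r(copy, " \n", &save); tok && new_list->nr < 64;
             tok = strtok_r(NULL, " \n", &save))
            new_list->pids[new_list->nr++] = atoi(tok);
        free(copy);

        function_pids = new_list;       /* rcu_assign_pointer() in the kernel */
        /* synchronize_sched() would run here before the old list is freed */
        free(old_list);
        return new_list->nr;
    }

    int main(void)
    {
        pid_list_write("42 128 7");
        printf("filtering %d pids\n", function_pids->nr);
        pid_list_write("99");
        printf("filtering %d pids\n", function_pids->nr);
        return 0;
    }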
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a4bd6b68a0b..dade4c9559cc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -25,7 +25,7 @@ | |||
| 25 | #include <linux/hardirq.h> | 25 | #include <linux/hardirq.h> |
| 26 | #include <linux/linkage.h> | 26 | #include <linux/linkage.h> |
| 27 | #include <linux/uaccess.h> | 27 | #include <linux/uaccess.h> |
| 28 | #include <linux/kprobes.h> | 28 | #include <linux/vmalloc.h> |
| 29 | #include <linux/ftrace.h> | 29 | #include <linux/ftrace.h> |
| 30 | #include <linux/module.h> | 30 | #include <linux/module.h> |
| 31 | #include <linux/percpu.h> | 31 | #include <linux/percpu.h> |
| @@ -319,6 +319,258 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec, | |||
| 319 | return 0; | 319 | return 0; |
| 320 | } | 320 | } |
| 321 | 321 | ||
| 322 | void trace_free_pid_list(struct trace_pid_list *pid_list) | ||
| 323 | { | ||
| 324 | vfree(pid_list->pids); | ||
| 325 | kfree(pid_list); | ||
| 326 | } | ||
| 327 | |||
| 328 | /** | ||
| 329 | * trace_find_filtered_pid - check if a pid exists in a filtered_pid list | ||
| 330 | * @filtered_pids: The list of pids to check | ||
| 331 | * @search_pid: The PID to find in @filtered_pids | ||
| 332 | * | ||
| 333 | * Returns true if @search_pid is found in @filtered_pids, and false otherwise. | ||
| 334 | */ | ||
| 335 | bool | ||
| 336 | trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) | ||
| 337 | { | ||
| 338 | /* | ||
| 339 | * If pid_max changed after filtered_pids was created, we | ||
| 340 | * by default ignore all pids greater than the previous pid_max. | ||
| 341 | */ | ||
| 342 | if (search_pid >= filtered_pids->pid_max) | ||
| 343 | return false; | ||
| 344 | |||
| 345 | return test_bit(search_pid, filtered_pids->pids); | ||
| 346 | } | ||
| 347 | |||
| 348 | /** | ||
| 349 | * trace_ignore_this_task - should a task be ignored for tracing | ||
| 350 | * @filtered_pids: The list of pids to check | ||
| 351 | * @task: The task that should be ignored if not filtered | ||
| 352 | * | ||
| 353 | * Checks if @task should be traced or not from @filtered_pids. | ||
| 354 | * Returns true if @task should *NOT* be traced. | ||
| 355 | * Returns false if @task should be traced. | ||
| 356 | */ | ||
| 357 | bool | ||
| 358 | trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) | ||
| 359 | { | ||
| 360 | /* | ||
| 361 | * Return false, because if filtered_pids does not exist, | ||
| 362 | * all pids are good to trace. | ||
| 363 | */ | ||
| 364 | if (!filtered_pids) | ||
| 365 | return false; | ||
| 366 | |||
| 367 | return !trace_find_filtered_pid(filtered_pids, task->pid); | ||
| 368 | } | ||
| 369 | |||
| 370 | /** | ||
| 371 | * trace_filter_add_remove_task - Add or remove a task from a pid_list | ||
| 372 | * @pid_list: The list to modify | ||
| 373 | * @self: The current task for fork or NULL for exit | ||
| 374 | * @task: The task to add or remove | ||
| 375 | * | ||
| 376 | * When adding a task, if @self is defined, the task is only added if @self | ||
| 377 | * is also included in @pid_list. This happens on fork and tasks should | ||
| 378 | * only be added when the parent is listed. If @self is NULL, then the | ||
| 379 | * @task pid will be removed from the list, which would happen on exit | ||
| 380 | * of a task. | ||
| 381 | */ | ||
| 382 | void trace_filter_add_remove_task(struct trace_pid_list *pid_list, | ||
| 383 | struct task_struct *self, | ||
| 384 | struct task_struct *task) | ||
| 385 | { | ||
| 386 | if (!pid_list) | ||
| 387 | return; | ||
| 388 | |||
| 389 | /* For forks, we only add if the forking task is listed */ | ||
| 390 | if (self) { | ||
| 391 | if (!trace_find_filtered_pid(pid_list, self->pid)) | ||
| 392 | return; | ||
| 393 | } | ||
| 394 | |||
| 395 | /* Sorry, but we don't support pid_max changing after setting */ | ||
| 396 | if (task->pid >= pid_list->pid_max) | ||
| 397 | return; | ||
| 398 | |||
| 399 | /* "self" is set for forks, and NULL for exits */ | ||
| 400 | if (self) | ||
| 401 | set_bit(task->pid, pid_list->pids); | ||
| 402 | else | ||
| 403 | clear_bit(task->pid, pid_list->pids); | ||
| 404 | } | ||
| 405 | |||
| 406 | /** | ||
| 407 | * trace_pid_next - Used for seq_file to get to the next pid of a pid_list | ||
| 408 | * @pid_list: The pid list to show | ||
| 409 | * @v: The last pid that was shown (+1 of the actual pid so that zero can be displayed) | ||
| 410 | * @pos: The position of the file | ||
| 411 | * | ||
| 412 | * This is used by the seq_file "next" operation to iterate the pids | ||
| 413 | * listed in a trace_pid_list structure. | ||
| 414 | * | ||
| 415 | * Returns the pid+1 as we want to display pid of zero, but NULL would | ||
| 416 | * stop the iteration. | ||
| 417 | */ | ||
| 418 | void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) | ||
| 419 | { | ||
| 420 | unsigned long pid = (unsigned long)v; | ||
| 421 | |||
| 422 | (*pos)++; | ||
| 423 | |||
| 424 | /* pid is already +1 of the actual previous bit */ | ||
| 425 | pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); | ||
| 426 | |||
| 427 | /* Return pid + 1 to allow zero to be represented */ | ||
| 428 | if (pid < pid_list->pid_max) | ||
| 429 | return (void *)(pid + 1); | ||
| 430 | |||
| 431 | return NULL; | ||
| 432 | } | ||
| 433 | |||
| 434 | /** | ||
| 435 | * trace_pid_start - Used for seq_file to start reading pid lists | ||
| 436 | * @pid_list: The pid list to show | ||
| 437 | * @pos: The position of the file | ||
| 438 | * | ||
| 439 | * This is used by seq_file "start" operation to start the iteration | ||
| 440 | * of listing pids. | ||
| 441 | * | ||
| 442 | * Returns the pid+1 as we want to display pid of zero, but NULL would | ||
| 443 | * stop the iteration. | ||
| 444 | */ | ||
| 445 | void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) | ||
| 446 | { | ||
| 447 | unsigned long pid; | ||
| 448 | loff_t l = 0; | ||
| 449 | |||
| 450 | pid = find_first_bit(pid_list->pids, pid_list->pid_max); | ||
| 451 | if (pid >= pid_list->pid_max) | ||
| 452 | return NULL; | ||
| 453 | |||
| 454 | /* Return pid + 1 so that zero can be the exit value */ | ||
| 455 | for (pid++; pid && l < *pos; | ||
| 456 | pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) | ||
| 457 | ; | ||
| 458 | return (void *)pid; | ||
| 459 | } | ||
| 460 | |||
| 461 | /** | ||
| 462 | * trace_pid_show - show the current pid in seq_file processing | ||
| 463 | * @m: The seq_file structure to write into | ||
| 464 | * @v: A void pointer of the pid (+1) value to display | ||
| 465 | * | ||
| 466 | * Can be directly used by seq_file operations to display the current | ||
| 467 | * pid value. | ||
| 468 | */ | ||
| 469 | int trace_pid_show(struct seq_file *m, void *v) | ||
| 470 | { | ||
| 471 | unsigned long pid = (unsigned long)v - 1; | ||
| 472 | |||
| 473 | seq_printf(m, "%lu\n", pid); | ||
| 474 | return 0; | ||
| 475 | } | ||
| 476 | |||
| 477 | /* 128 should be much more than enough */ | ||
| 478 | #define PID_BUF_SIZE 127 | ||
| 479 | |||
| 480 | int trace_pid_write(struct trace_pid_list *filtered_pids, | ||
| 481 | struct trace_pid_list **new_pid_list, | ||
| 482 | const char __user *ubuf, size_t cnt) | ||
| 483 | { | ||
| 484 | struct trace_pid_list *pid_list; | ||
| 485 | struct trace_parser parser; | ||
| 486 | unsigned long val; | ||
| 487 | int nr_pids = 0; | ||
| 488 | ssize_t read = 0; | ||
| 489 | ssize_t ret = 0; | ||
| 490 | loff_t pos; | ||
| 491 | pid_t pid; | ||
| 492 | |||
| 493 | if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) | ||
| 494 | return -ENOMEM; | ||
| 495 | |||
| 496 | /* | ||
| 497 | * Always recreate a new array. The write is an all or nothing | ||
| 498 | * operation. Always create a new array when adding new pids by | ||
| 499 | * the user. If the operation fails, then the current list is | ||
| 500 | * not modified. | ||
| 501 | */ | ||
| 502 | pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); | ||
| 503 | if (!pid_list) | ||
| 504 | return -ENOMEM; | ||
| 505 | |||
| 506 | pid_list->pid_max = READ_ONCE(pid_max); | ||
| 507 | |||
| 508 | /* Only truncating will shrink pid_max */ | ||
| 509 | if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) | ||
| 510 | pid_list->pid_max = filtered_pids->pid_max; | ||
| 511 | |||
| 512 | pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); | ||
| 513 | if (!pid_list->pids) { | ||
| 514 | kfree(pid_list); | ||
| 515 | return -ENOMEM; | ||
| 516 | } | ||
| 517 | |||
| 518 | if (filtered_pids) { | ||
| 519 | /* copy the current bits to the new max */ | ||
| 520 | for_each_set_bit(pid, filtered_pids->pids, | ||
| 521 | filtered_pids->pid_max) { | ||
| 522 | set_bit(pid, pid_list->pids); | ||
| 523 | nr_pids++; | ||
| 524 | } | ||
| 525 | } | ||
| 526 | |||
| 527 | while (cnt > 0) { | ||
| 528 | |||
| 529 | pos = 0; | ||
| 530 | |||
| 531 | ret = trace_get_user(&parser, ubuf, cnt, &pos); | ||
| 532 | if (ret < 0 || !trace_parser_loaded(&parser)) | ||
| 533 | break; | ||
| 534 | |||
| 535 | read += ret; | ||
| 536 | ubuf += ret; | ||
| 537 | cnt -= ret; | ||
| 538 | |||
| 539 | parser.buffer[parser.idx] = 0; | ||
| 540 | |||
| 541 | ret = -EINVAL; | ||
| 542 | if (kstrtoul(parser.buffer, 0, &val)) | ||
| 543 | break; | ||
| 544 | if (val >= pid_list->pid_max) | ||
| 545 | break; | ||
| 546 | |||
| 547 | pid = (pid_t)val; | ||
| 548 | |||
| 549 | set_bit(pid, pid_list->pids); | ||
| 550 | nr_pids++; | ||
| 551 | |||
| 552 | trace_parser_clear(&parser); | ||
| 553 | ret = 0; | ||
| 554 | } | ||
| 555 | trace_parser_put(&parser); | ||
| 556 | |||
| 557 | if (ret < 0) { | ||
| 558 | trace_free_pid_list(pid_list); | ||
| 559 | return ret; | ||
| 560 | } | ||
| 561 | |||
| 562 | if (!nr_pids) { | ||
| 563 | /* Cleared the list of pids */ | ||
| 564 | trace_free_pid_list(pid_list); | ||
| 565 | read = ret; | ||
| 566 | pid_list = NULL; | ||
| 567 | } | ||
| 568 | |||
| 569 | *new_pid_list = pid_list; | ||
| 570 | |||
| 571 | return read; | ||
| 572 | } | ||
| 573 | |||
| 322 | static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) | 574 | static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) |
| 323 | { | 575 | { |
| 324 | u64 ts; | 576 | u64 ts; |
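The trace_pid_start()/trace_pid_next()/trace_pid_show() helpers above encode their seq_file position as pid + 1, so that pid 0 can still be returned through a void pointer (NULL would terminate the iteration). A userspace sketch of that encoding over a small bitmap, with a naive stand-in for find_next_bit():

    #include <stdio.h>
    #include <stdbool.h>

    #define PID_MAX 256

    static bool pids[PID_MAX];          /* stand-in for the kernel's bitmap */

    /* Naive replacement for find_next_bit(): first set bit at or after 'from'. */
    static unsigned long next_set(unsigned long from)
    {
        while (from < PID_MAX && !pids[from])
            from++;
        return from;
    }

    /* Mirrors trace_pid_start(): returns pid + 1, or NULL when the list is empty. */
    static void *pid_start(void)
    {
        unsigned long pid = next_set(0);

        return pid < PID_MAX ? (void *)(pid + 1) : NULL;
    }

    /* Mirrors trace_pid_next(): 'v' is already pid + 1, which is exactly the next
     * bit position to search from. */
    static void *pid_next(void *v)
    {
        unsigned long pid = next_set((unsigned long)v);

        return pid < PID_MAX ? (void *)(pid + 1) : NULL;
    }

    /* Mirrors trace_pid_show(): undo the +1 before printing. */
    static void pid_show(void *v)
    {
        printf("%lu\n", (unsigned long)v - 1);
    }

    int main(void)
    {
        void *v;

        pids[0] = pids[42] = pids[200] = true;
        for (v = pid_start(); v; v = pid_next(v))
            pid_show(v);                /* prints 0, 42, 200 */
        return 0;
    }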
| @@ -1862,7 +2114,17 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, | |||
| 1862 | { | 2114 | { |
| 1863 | __buffer_unlock_commit(buffer, event); | 2115 | __buffer_unlock_commit(buffer, event); |
| 1864 | 2116 | ||
| 1865 | ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); | 2117 | /* |
| 2118 | * If regs is not set, then skip the following callers: | ||
| 2119 | * trace_buffer_unlock_commit_regs | ||
| 2120 | * event_trigger_unlock_commit | ||
| 2121 | * trace_event_buffer_commit | ||
| 2122 | * trace_event_raw_event_sched_switch | ||
| 2123 | * Note, we can still get here via blktrace, wakeup tracer | ||
| 2124 | * and mmiotrace, but that's ok if they lose a function or | ||
| 2125 | * two. They are not that meaningful. | ||
| 2126 | */ | ||
| 2127 | ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); | ||
| 1866 | ftrace_trace_userstack(buffer, flags, pc); | 2128 | ftrace_trace_userstack(buffer, flags, pc); |
| 1867 | } | 2129 | } |
| 1868 | 2130 | ||
| @@ -1913,6 +2175,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, | |||
| 1913 | trace.skip = skip; | 2175 | trace.skip = skip; |
| 1914 | 2176 | ||
| 1915 | /* | 2177 | /* |
| 2178 | * Add two, for this function and the call to save_stack_trace() | ||
| 2179 | * If regs is set, then these functions will not be in the way. | ||
| 2180 | */ | ||
| 2181 | if (!regs) | ||
| 2182 | trace.skip += 2; | ||
| 2183 | |||
| 2184 | /* | ||
| 1916 | * Since events can happen in NMIs there's no safe way to | 2185 | * Since events can happen in NMIs there's no safe way to |
| 1917 | * use the per cpu ftrace_stacks. We reserve it and if an interrupt | 2186 | * use the per cpu ftrace_stacks. We reserve it and if an interrupt |
| 1918 | * or NMI comes in, it will just have to use the default | 2187 | * or NMI comes in, it will just have to use the default |
| @@ -2083,83 +2352,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) | |||
| 2083 | 2352 | ||
| 2084 | /* created for use with alloc_percpu */ | 2353 | /* created for use with alloc_percpu */ |
| 2085 | struct trace_buffer_struct { | 2354 | struct trace_buffer_struct { |
| 2086 | char buffer[TRACE_BUF_SIZE]; | 2355 | int nesting; |
| 2356 | char buffer[4][TRACE_BUF_SIZE]; | ||
| 2087 | }; | 2357 | }; |
| 2088 | 2358 | ||
| 2089 | static struct trace_buffer_struct *trace_percpu_buffer; | 2359 | static struct trace_buffer_struct *trace_percpu_buffer; |
| 2090 | static struct trace_buffer_struct *trace_percpu_sirq_buffer; | ||
| 2091 | static struct trace_buffer_struct *trace_percpu_irq_buffer; | ||
| 2092 | static struct trace_buffer_struct *trace_percpu_nmi_buffer; | ||
| 2093 | 2360 | ||
| 2094 | /* | 2361 | /* |
| 2095 | * The buffer used is dependent on the context. There is a per cpu | 2362 | * This allows for lockless recording. If we're nested too deeply, then |
| 2096 | * buffer for normal context, softirq context, hard irq context and | 2363 | * this returns NULL. |
| 2097 | * for NMI context. Thise allows for lockless recording. | ||
| 2098 | * | ||
| 2099 | * Note, if the buffers failed to be allocated, then this returns NULL | ||
| 2100 | */ | 2364 | */ |
| 2101 | static char *get_trace_buf(void) | 2365 | static char *get_trace_buf(void) |
| 2102 | { | 2366 | { |
| 2103 | struct trace_buffer_struct *percpu_buffer; | 2367 | struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); |
| 2104 | |||
| 2105 | /* | ||
| 2106 | * If we have allocated per cpu buffers, then we do not | ||
| 2107 | * need to do any locking. | ||
| 2108 | */ | ||
| 2109 | if (in_nmi()) | ||
| 2110 | percpu_buffer = trace_percpu_nmi_buffer; | ||
| 2111 | else if (in_irq()) | ||
| 2112 | percpu_buffer = trace_percpu_irq_buffer; | ||
| 2113 | else if (in_softirq()) | ||
| 2114 | percpu_buffer = trace_percpu_sirq_buffer; | ||
| 2115 | else | ||
| 2116 | percpu_buffer = trace_percpu_buffer; | ||
| 2117 | 2368 | ||
| 2118 | if (!percpu_buffer) | 2369 | if (!buffer || buffer->nesting >= 4) |
| 2119 | return NULL; | 2370 | return NULL; |
| 2120 | 2371 | ||
| 2121 | return this_cpu_ptr(&percpu_buffer->buffer[0]); | 2372 | return &buffer->buffer[buffer->nesting++][0]; |
| 2373 | } | ||
| 2374 | |||
| 2375 | static void put_trace_buf(void) | ||
| 2376 | { | ||
| 2377 | this_cpu_dec(trace_percpu_buffer->nesting); | ||
| 2122 | } | 2378 | } |
| 2123 | 2379 | ||
| 2124 | static int alloc_percpu_trace_buffer(void) | 2380 | static int alloc_percpu_trace_buffer(void) |
| 2125 | { | 2381 | { |
| 2126 | struct trace_buffer_struct *buffers; | 2382 | struct trace_buffer_struct *buffers; |
| 2127 | struct trace_buffer_struct *sirq_buffers; | ||
| 2128 | struct trace_buffer_struct *irq_buffers; | ||
| 2129 | struct trace_buffer_struct *nmi_buffers; | ||
| 2130 | 2383 | ||
| 2131 | buffers = alloc_percpu(struct trace_buffer_struct); | 2384 | buffers = alloc_percpu(struct trace_buffer_struct); |
| 2132 | if (!buffers) | 2385 | if (WARN(!buffers, "Could not allocate percpu trace_printk buffer")) |
| 2133 | goto err_warn; | 2386 | return -ENOMEM; |
| 2134 | |||
| 2135 | sirq_buffers = alloc_percpu(struct trace_buffer_struct); | ||
| 2136 | if (!sirq_buffers) | ||
| 2137 | goto err_sirq; | ||
| 2138 | |||
| 2139 | irq_buffers = alloc_percpu(struct trace_buffer_struct); | ||
| 2140 | if (!irq_buffers) | ||
| 2141 | goto err_irq; | ||
| 2142 | |||
| 2143 | nmi_buffers = alloc_percpu(struct trace_buffer_struct); | ||
| 2144 | if (!nmi_buffers) | ||
| 2145 | goto err_nmi; | ||
| 2146 | 2387 | ||
| 2147 | trace_percpu_buffer = buffers; | 2388 | trace_percpu_buffer = buffers; |
| 2148 | trace_percpu_sirq_buffer = sirq_buffers; | ||
| 2149 | trace_percpu_irq_buffer = irq_buffers; | ||
| 2150 | trace_percpu_nmi_buffer = nmi_buffers; | ||
| 2151 | |||
| 2152 | return 0; | 2389 | return 0; |
| 2153 | |||
| 2154 | err_nmi: | ||
| 2155 | free_percpu(irq_buffers); | ||
| 2156 | err_irq: | ||
| 2157 | free_percpu(sirq_buffers); | ||
| 2158 | err_sirq: | ||
| 2159 | free_percpu(buffers); | ||
| 2160 | err_warn: | ||
| 2161 | WARN(1, "Could not allocate percpu trace_printk buffer"); | ||
| 2162 | return -ENOMEM; | ||
| 2163 | } | 2390 | } |
| 2164 | 2391 | ||
| 2165 | static int buffers_allocated; | 2392 | static int buffers_allocated; |
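The old code kept four separate per-CPU buffers (task, softirq, irq, NMI); the new code keeps one per-CPU structure with a nesting counter, so up to four stacked contexts can share it without locking. A single-CPU userspace model of the get/put pair; preemption and per-CPU placement are deliberately not modeled here.

    #include <stdio.h>

    #define TRACE_BUF_SIZE 128

    /* One of these per CPU in the kernel; a single instance is enough to show
     * the nesting discipline. */
    static struct {
        int nesting;
        char buffer[4][TRACE_BUF_SIZE];
    } trace_buf;

    static char *get_trace_buf(void)
    {
        if (trace_buf.nesting >= 4)
            return NULL;                /* nested too deeply, drop the event */
        return trace_buf.buffer[trace_buf.nesting++];
    }

    static void put_trace_buf(void)
    {
        trace_buf.nesting--;
    }

    int main(void)
    {
        char *outer = get_trace_buf();  /* "normal" context */
        char *inner = get_trace_buf();  /* e.g. an interrupt nesting on top */

        snprintf(inner, TRACE_BUF_SIZE, "irq event");
        put_trace_buf();                /* inner done, outer still valid */
        snprintf(outer, TRACE_BUF_SIZE, "task event");
        put_trace_buf();

        printf("%s / %s\n", outer, trace_buf.buffer[1]);  /* "task event / irq event" */
        return 0;
    }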
| @@ -2250,7 +2477,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
| 2250 | tbuffer = get_trace_buf(); | 2477 | tbuffer = get_trace_buf(); |
| 2251 | if (!tbuffer) { | 2478 | if (!tbuffer) { |
| 2252 | len = 0; | 2479 | len = 0; |
| 2253 | goto out; | 2480 | goto out_nobuffer; |
| 2254 | } | 2481 | } |
| 2255 | 2482 | ||
| 2256 | len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); | 2483 | len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); |
| @@ -2276,6 +2503,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
| 2276 | } | 2503 | } |
| 2277 | 2504 | ||
| 2278 | out: | 2505 | out: |
| 2506 | put_trace_buf(); | ||
| 2507 | |||
| 2508 | out_nobuffer: | ||
| 2279 | preempt_enable_notrace(); | 2509 | preempt_enable_notrace(); |
| 2280 | unpause_graph_tracing(); | 2510 | unpause_graph_tracing(); |
| 2281 | 2511 | ||
| @@ -2307,7 +2537,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, | |||
| 2307 | tbuffer = get_trace_buf(); | 2537 | tbuffer = get_trace_buf(); |
| 2308 | if (!tbuffer) { | 2538 | if (!tbuffer) { |
| 2309 | len = 0; | 2539 | len = 0; |
| 2310 | goto out; | 2540 | goto out_nobuffer; |
| 2311 | } | 2541 | } |
| 2312 | 2542 | ||
| 2313 | len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); | 2543 | len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); |
| @@ -2326,7 +2556,11 @@ __trace_array_vprintk(struct ring_buffer *buffer, | |||
| 2326 | __buffer_unlock_commit(buffer, event); | 2556 | __buffer_unlock_commit(buffer, event); |
| 2327 | ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); | 2557 | ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); |
| 2328 | } | 2558 | } |
| 2329 | out: | 2559 | |
| 2560 | out: | ||
| 2561 | put_trace_buf(); | ||
| 2562 | |||
| 2563 | out_nobuffer: | ||
| 2330 | preempt_enable_notrace(); | 2564 | preempt_enable_notrace(); |
| 2331 | unpause_graph_tracing(); | 2565 | unpause_graph_tracing(); |
| 2332 | 2566 | ||
| @@ -6977,6 +7211,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) | |||
| 6977 | for_each_tracing_cpu(cpu) | 7211 | for_each_tracing_cpu(cpu) |
| 6978 | tracing_init_tracefs_percpu(tr, cpu); | 7212 | tracing_init_tracefs_percpu(tr, cpu); |
| 6979 | 7213 | ||
| 7214 | ftrace_init_tracefs(tr, d_tracer); | ||
| 6980 | } | 7215 | } |
| 6981 | 7216 | ||
| 6982 | static struct vfsmount *trace_automount(void *ingore) | 7217 | static struct vfsmount *trace_automount(void *ingore) |
| @@ -7130,6 +7365,7 @@ static __init int tracer_init_tracefs(void) | |||
| 7130 | return 0; | 7365 | return 0; |
| 7131 | 7366 | ||
| 7132 | init_tracer_tracefs(&global_trace, d_tracer); | 7367 | init_tracer_tracefs(&global_trace, d_tracer); |
| 7368 | ftrace_init_tracefs_toplevel(&global_trace, d_tracer); | ||
| 7133 | 7369 | ||
| 7134 | trace_create_file("tracing_thresh", 0644, d_tracer, | 7370 | trace_create_file("tracing_thresh", 0644, d_tracer, |
| 7135 | &global_trace, &tracing_thresh_fops); | 7371 | &global_trace, &tracing_thresh_fops); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5167c366d6b7..f783df416726 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -80,6 +80,12 @@ enum trace_type { | |||
| 80 | FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ | 80 | FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ |
| 81 | filter) | 81 | filter) |
| 82 | 82 | ||
| 83 | #undef FTRACE_ENTRY_PACKED | ||
| 84 | #define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \ | ||
| 85 | filter) \ | ||
| 86 | FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ | ||
| 87 | filter) __packed | ||
| 88 | |||
| 83 | #include "trace_entries.h" | 89 | #include "trace_entries.h" |
| 84 | 90 | ||
| 85 | /* | 91 | /* |
| @@ -156,6 +162,9 @@ struct trace_array_cpu { | |||
| 156 | char comm[TASK_COMM_LEN]; | 162 | char comm[TASK_COMM_LEN]; |
| 157 | 163 | ||
| 158 | bool ignore_pid; | 164 | bool ignore_pid; |
| 165 | #ifdef CONFIG_FUNCTION_TRACER | ||
| 166 | bool ftrace_ignore_pid; | ||
| 167 | #endif | ||
| 159 | }; | 168 | }; |
| 160 | 169 | ||
| 161 | struct tracer; | 170 | struct tracer; |
| @@ -247,6 +256,7 @@ struct trace_array { | |||
| 247 | int ref; | 256 | int ref; |
| 248 | #ifdef CONFIG_FUNCTION_TRACER | 257 | #ifdef CONFIG_FUNCTION_TRACER |
| 249 | struct ftrace_ops *ops; | 258 | struct ftrace_ops *ops; |
| 259 | struct trace_pid_list __rcu *function_pids; | ||
| 250 | /* function tracing enabled */ | 260 | /* function tracing enabled */ |
| 251 | int function_enabled; | 261 | int function_enabled; |
| 252 | #endif | 262 | #endif |
| @@ -628,6 +638,25 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); | |||
| 628 | 638 | ||
| 629 | extern unsigned long tracing_thresh; | 639 | extern unsigned long tracing_thresh; |
| 630 | 640 | ||
| 641 | /* PID filtering */ | ||
| 642 | |||
| 643 | extern int pid_max; | ||
| 644 | |||
| 645 | bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, | ||
| 646 | pid_t search_pid); | ||
| 647 | bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, | ||
| 648 | struct task_struct *task); | ||
| 649 | void trace_filter_add_remove_task(struct trace_pid_list *pid_list, | ||
| 650 | struct task_struct *self, | ||
| 651 | struct task_struct *task); | ||
| 652 | void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); | ||
| 653 | void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); | ||
| 654 | int trace_pid_show(struct seq_file *m, void *v); | ||
| 655 | void trace_free_pid_list(struct trace_pid_list *pid_list); | ||
| 656 | int trace_pid_write(struct trace_pid_list *filtered_pids, | ||
| 657 | struct trace_pid_list **new_pid_list, | ||
| 658 | const char __user *ubuf, size_t cnt); | ||
| 659 | |||
| 631 | #ifdef CONFIG_TRACER_MAX_TRACE | 660 | #ifdef CONFIG_TRACER_MAX_TRACE |
| 632 | void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); | 661 | void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); |
| 633 | void update_max_tr_single(struct trace_array *tr, | 662 | void update_max_tr_single(struct trace_array *tr, |
| @@ -821,12 +850,9 @@ extern struct list_head ftrace_pids; | |||
| 821 | 850 | ||
| 822 | #ifdef CONFIG_FUNCTION_TRACER | 851 | #ifdef CONFIG_FUNCTION_TRACER |
| 823 | extern bool ftrace_filter_param __initdata; | 852 | extern bool ftrace_filter_param __initdata; |
| 824 | static inline int ftrace_trace_task(struct task_struct *task) | 853 | static inline int ftrace_trace_task(struct trace_array *tr) |
| 825 | { | 854 | { |
| 826 | if (list_empty(&ftrace_pids)) | 855 | return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid); |
| 827 | return 1; | ||
| 828 | |||
| 829 | return test_tsk_trace_trace(task); | ||
| 830 | } | 856 | } |
| 831 | extern int ftrace_is_dead(void); | 857 | extern int ftrace_is_dead(void); |
| 832 | int ftrace_create_function_files(struct trace_array *tr, | 858 | int ftrace_create_function_files(struct trace_array *tr, |
| @@ -836,8 +862,11 @@ void ftrace_init_global_array_ops(struct trace_array *tr); | |||
| 836 | void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); | 862 | void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); |
| 837 | void ftrace_reset_array_ops(struct trace_array *tr); | 863 | void ftrace_reset_array_ops(struct trace_array *tr); |
| 838 | int using_ftrace_ops_list_func(void); | 864 | int using_ftrace_ops_list_func(void); |
| 865 | void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); | ||
| 866 | void ftrace_init_tracefs_toplevel(struct trace_array *tr, | ||
| 867 | struct dentry *d_tracer); | ||
| 839 | #else | 868 | #else |
| 840 | static inline int ftrace_trace_task(struct task_struct *task) | 869 | static inline int ftrace_trace_task(struct trace_array *tr) |
| 841 | { | 870 | { |
| 842 | return 1; | 871 | return 1; |
| 843 | } | 872 | } |
| @@ -852,6 +881,8 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { } | |||
| 852 | static inline __init void | 881 | static inline __init void |
| 853 | ftrace_init_global_array_ops(struct trace_array *tr) { } | 882 | ftrace_init_global_array_ops(struct trace_array *tr) { } |
| 854 | static inline void ftrace_reset_array_ops(struct trace_array *tr) { } | 883 | static inline void ftrace_reset_array_ops(struct trace_array *tr) { } |
| 884 | static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } | ||
| 885 | static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } | ||
| 855 | /* ftrace_func_t type is not defined, use macro instead of static inline */ | 886 | /* ftrace_func_t type is not defined, use macro instead of static inline */ |
| 856 | #define ftrace_init_array_ops(tr, func) do { } while (0) | 887 | #define ftrace_init_array_ops(tr, func) do { } while (0) |
| 857 | #endif /* CONFIG_FUNCTION_TRACER */ | 888 | #endif /* CONFIG_FUNCTION_TRACER */ |
| @@ -1600,6 +1631,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); | |||
| 1600 | #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ | 1631 | #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ |
| 1601 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ | 1632 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ |
| 1602 | filter) | 1633 | filter) |
| 1634 | #undef FTRACE_ENTRY_PACKED | ||
| 1635 | #define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \ | ||
| 1636 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ | ||
| 1637 | filter) | ||
| 1638 | |||
| 1603 | #include "trace_entries.h" | 1639 | #include "trace_entries.h" |
| 1604 | 1640 | ||
| 1605 | #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) | 1641 | #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) |
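The trace_pid_* helpers exported above centralize the pid-filter logic that trace_events.c previously kept as static functions, so the function tracer's new per-instance function_pids list can share it. The filter itself is just a bitmap sized to the pid_max that was current when the list was written; pids at or above that snapshot are treated as not in the set. A minimal, userspace-runnable sketch of that semantic (the struct and helpers below are hypothetical stand-ins for struct trace_pid_list and the kernel's test_bit()/set_bit(), not the kernel code itself):

  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>

  /* Hypothetical stand-in for struct trace_pid_list: a bitmap plus the
   * pid_max snapshot taken when the list was created. */
  struct pid_list {
      unsigned int pid_max;
      unsigned char *pids;    /* one bit per pid */
  };

  static bool pid_list_test(const struct pid_list *pl, unsigned int pid)
  {
      /* Pids beyond the snapshot are ignored, as in trace_find_filtered_pid(). */
      if (pid >= pl->pid_max)
          return false;
      return pl->pids[pid / 8] & (1u << (pid % 8));
  }

  static void pid_list_set(struct pid_list *pl, unsigned int pid)
  {
      if (pid < pl->pid_max)
          pl->pids[pid / 8] |= 1u << (pid % 8);
  }

  /* Mirrors trace_ignore_this_task(): no list at all means trace everything. */
  static bool ignore_this_pid(const struct pid_list *pl, unsigned int pid)
  {
      return pl ? !pid_list_test(pl, pid) : false;
  }

  int main(void)
  {
      struct pid_list pl = { .pid_max = 32768 };

      pl.pids = calloc((pl.pid_max + 7) / 8, 1);
      if (!pl.pids)
          return 1;
      pid_list_set(&pl, 1234);
      printf("pid 1234 ignored? %d\n", ignore_this_pid(&pl, 1234));  /* 0 */
      printf("pid 4321 ignored? %d\n", ignore_this_pid(&pl, 4321));  /* 1 */
      printf("no list:  ignored? %d\n", ignore_this_pid(NULL, 4321)); /* 0 */
      free(pl.pids);
      return 0;
  }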
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ee7b94a4810a..5c30efcda5e6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
| @@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry, | |||
| 72 | ); | 72 | ); |
| 73 | 73 | ||
| 74 | /* Function call entry */ | 74 | /* Function call entry */ |
| 75 | FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, | 75 | FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, |
| 76 | 76 | ||
| 77 | TRACE_GRAPH_ENT, | 77 | TRACE_GRAPH_ENT, |
| 78 | 78 | ||
| @@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, | |||
| 88 | ); | 88 | ); |
| 89 | 89 | ||
| 90 | /* Function return entry */ | 90 | /* Function return entry */ |
| 91 | FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, | 91 | FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, |
| 92 | 92 | ||
| 93 | TRACE_GRAPH_RET, | 93 | TRACE_GRAPH_RET, |
| 94 | 94 | ||
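Marking the funcgraph entry/exit records with FTRACE_ENTRY_PACKED applies __packed to the generated structs, so no alignment padding sits between the common trace_entry header and the payload and each event takes less ring-buffer space; where the macro set is re-included for other users (second trace.h hunk above), FTRACE_ENTRY_PACKED simply falls back to a plain FTRACE_ENTRY. A userspace-runnable illustration of the size effect (the field layout is only a rough approximation of ftrace_graph_ent_entry, not a copy):

  #include <stdio.h>

  /* Rough approximation of an 8-byte event header followed by a
   * graph-entry payload; the real definitions live in trace_entries.h. */
  struct ent_unpacked {
      unsigned short type;
      unsigned char  flags;
      unsigned char  preempt_count;
      int            pid;
      unsigned long  func;   /* 8-byte member forces struct alignment */
      int            depth;
  };

  struct ent_packed {
      unsigned short type;
      unsigned char  flags;
      unsigned char  preempt_count;
      int            pid;
      unsigned long  func;
      int            depth;
  } __attribute__((packed));

  int main(void)
  {
      /* On a typical LP64 build: 24 bytes unpacked vs. 20 bytes packed. */
      printf("unpacked: %zu\n", sizeof(struct ent_unpacked));
      printf("packed:   %zu\n", sizeof(struct ent_packed));
      return 0;
  }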
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3d4155892a1e..03c0a48c3ac4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -15,7 +15,6 @@ | |||
| 15 | #include <linux/kthread.h> | 15 | #include <linux/kthread.h> |
| 16 | #include <linux/tracefs.h> | 16 | #include <linux/tracefs.h> |
| 17 | #include <linux/uaccess.h> | 17 | #include <linux/uaccess.h> |
| 18 | #include <linux/vmalloc.h> | ||
| 19 | #include <linux/module.h> | 18 | #include <linux/module.h> |
| 20 | #include <linux/ctype.h> | 19 | #include <linux/ctype.h> |
| 21 | #include <linux/sort.h> | 20 | #include <linux/sort.h> |
| @@ -262,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, | |||
| 262 | 261 | ||
| 263 | local_save_flags(fbuffer->flags); | 262 | local_save_flags(fbuffer->flags); |
| 264 | fbuffer->pc = preempt_count(); | 263 | fbuffer->pc = preempt_count(); |
| 264 | /* | ||
| 265 | * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables | ||
| 266 | * preemption (adding one to the preempt_count). Since we are | ||
| 267 | * interested in the preempt_count at the time the tracepoint was | ||
| 268 | * hit, we need to subtract one to offset the increment. | ||
| 269 | */ | ||
| 270 | if (IS_ENABLED(CONFIG_PREEMPT)) | ||
| 271 | fbuffer->pc--; | ||
| 265 | fbuffer->trace_file = trace_file; | 272 | fbuffer->trace_file = trace_file; |
| 266 | 273 | ||
| 267 | fbuffer->event = | 274 | fbuffer->event = |
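The added comment and the IS_ENABLED(CONFIG_PREEMPT) check undo the preempt_count increment that the tracepoint wrapper itself adds on preemptible kernels, so the recorded count describes the context that actually hit the tracepoint. IS_ENABLED() resolves to a constant at build time; below is a simplified, userspace-runnable re-creation of the trick (the real macro in include/linux/kconfig.h also handles options built as modules):

  #include <stdio.h>

  /* Simplified IS_ENABLED(): 1 when the option macro is defined to 1,
   * 0 when it is not defined at all. */
  #define __ARG_PLACEHOLDER_1 0,
  #define __take_second_arg(__ignored, val, ...) val
  #define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
  #define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
  #define IS_ENABLED(option) ___is_defined(option)

  #define CONFIG_PREEMPT 1    /* pretend this build is preemptible */

  int main(void)
  {
      int pc = 2;             /* preempt_count sampled inside the tracepoint */

      if (IS_ENABLED(CONFIG_PREEMPT))  /* expands to the constant 1 here */
          pc--;               /* report the count as it was before the hook */
      printf("reported preempt_count: %d\n", pc);
      return 0;
  }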
| @@ -499,60 +506,6 @@ static void ftrace_clear_events(struct trace_array *tr) | |||
| 499 | mutex_unlock(&event_mutex); | 506 | mutex_unlock(&event_mutex); |
| 500 | } | 507 | } |
| 501 | 508 | ||
| 502 | /* Shouldn't this be in a header? */ | ||
| 503 | extern int pid_max; | ||
| 504 | |||
| 505 | /* Returns true if found in filter */ | ||
| 506 | static bool | ||
| 507 | find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) | ||
| 508 | { | ||
| 509 | /* | ||
| 510 | * If pid_max changed after filtered_pids was created, we | ||
| 511 | * by default ignore all pids greater than the previous pid_max. | ||
| 512 | */ | ||
| 513 | if (search_pid >= filtered_pids->pid_max) | ||
| 514 | return false; | ||
| 515 | |||
| 516 | return test_bit(search_pid, filtered_pids->pids); | ||
| 517 | } | ||
| 518 | |||
| 519 | static bool | ||
| 520 | ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) | ||
| 521 | { | ||
| 522 | /* | ||
| 523 | * Return false, because if filtered_pids does not exist, | ||
| 524 | * all pids are good to trace. | ||
| 525 | */ | ||
| 526 | if (!filtered_pids) | ||
| 527 | return false; | ||
| 528 | |||
| 529 | return !find_filtered_pid(filtered_pids, task->pid); | ||
| 530 | } | ||
| 531 | |||
| 532 | static void filter_add_remove_task(struct trace_pid_list *pid_list, | ||
| 533 | struct task_struct *self, | ||
| 534 | struct task_struct *task) | ||
| 535 | { | ||
| 536 | if (!pid_list) | ||
| 537 | return; | ||
| 538 | |||
| 539 | /* For forks, we only add if the forking task is listed */ | ||
| 540 | if (self) { | ||
| 541 | if (!find_filtered_pid(pid_list, self->pid)) | ||
| 542 | return; | ||
| 543 | } | ||
| 544 | |||
| 545 | /* Sorry, but we don't support pid_max changing after setting */ | ||
| 546 | if (task->pid >= pid_list->pid_max) | ||
| 547 | return; | ||
| 548 | |||
| 549 | /* "self" is set for forks, and NULL for exits */ | ||
| 550 | if (self) | ||
| 551 | set_bit(task->pid, pid_list->pids); | ||
| 552 | else | ||
| 553 | clear_bit(task->pid, pid_list->pids); | ||
| 554 | } | ||
| 555 | |||
| 556 | static void | 509 | static void |
| 557 | event_filter_pid_sched_process_exit(void *data, struct task_struct *task) | 510 | event_filter_pid_sched_process_exit(void *data, struct task_struct *task) |
| 558 | { | 511 | { |
| @@ -560,7 +513,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task) | |||
| 560 | struct trace_array *tr = data; | 513 | struct trace_array *tr = data; |
| 561 | 514 | ||
| 562 | pid_list = rcu_dereference_sched(tr->filtered_pids); | 515 | pid_list = rcu_dereference_sched(tr->filtered_pids); |
| 563 | filter_add_remove_task(pid_list, NULL, task); | 516 | trace_filter_add_remove_task(pid_list, NULL, task); |
| 564 | } | 517 | } |
| 565 | 518 | ||
| 566 | static void | 519 | static void |
| @@ -572,7 +525,7 @@ event_filter_pid_sched_process_fork(void *data, | |||
| 572 | struct trace_array *tr = data; | 525 | struct trace_array *tr = data; |
| 573 | 526 | ||
| 574 | pid_list = rcu_dereference_sched(tr->filtered_pids); | 527 | pid_list = rcu_dereference_sched(tr->filtered_pids); |
| 575 | filter_add_remove_task(pid_list, self, task); | 528 | trace_filter_add_remove_task(pid_list, self, task); |
| 576 | } | 529 | } |
| 577 | 530 | ||
| 578 | void trace_event_follow_fork(struct trace_array *tr, bool enable) | 531 | void trace_event_follow_fork(struct trace_array *tr, bool enable) |
| @@ -600,8 +553,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, | |||
| 600 | pid_list = rcu_dereference_sched(tr->filtered_pids); | 553 | pid_list = rcu_dereference_sched(tr->filtered_pids); |
| 601 | 554 | ||
| 602 | this_cpu_write(tr->trace_buffer.data->ignore_pid, | 555 | this_cpu_write(tr->trace_buffer.data->ignore_pid, |
| 603 | ignore_this_task(pid_list, prev) && | 556 | trace_ignore_this_task(pid_list, prev) && |
| 604 | ignore_this_task(pid_list, next)); | 557 | trace_ignore_this_task(pid_list, next)); |
| 605 | } | 558 | } |
| 606 | 559 | ||
| 607 | static void | 560 | static void |
| @@ -614,7 +567,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt, | |||
| 614 | pid_list = rcu_dereference_sched(tr->filtered_pids); | 567 | pid_list = rcu_dereference_sched(tr->filtered_pids); |
| 615 | 568 | ||
| 616 | this_cpu_write(tr->trace_buffer.data->ignore_pid, | 569 | this_cpu_write(tr->trace_buffer.data->ignore_pid, |
| 617 | ignore_this_task(pid_list, next)); | 570 | trace_ignore_this_task(pid_list, next)); |
| 618 | } | 571 | } |
| 619 | 572 | ||
| 620 | static void | 573 | static void |
| @@ -630,7 +583,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) | |||
| 630 | pid_list = rcu_dereference_sched(tr->filtered_pids); | 583 | pid_list = rcu_dereference_sched(tr->filtered_pids); |
| 631 | 584 | ||
| 632 | this_cpu_write(tr->trace_buffer.data->ignore_pid, | 585 | this_cpu_write(tr->trace_buffer.data->ignore_pid, |
| 633 | ignore_this_task(pid_list, task)); | 586 | trace_ignore_this_task(pid_list, task)); |
| 634 | } | 587 | } |
| 635 | 588 | ||
| 636 | static void | 589 | static void |
| @@ -647,7 +600,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) | |||
| 647 | 600 | ||
| 648 | /* Set tracing if current is enabled */ | 601 | /* Set tracing if current is enabled */ |
| 649 | this_cpu_write(tr->trace_buffer.data->ignore_pid, | 602 | this_cpu_write(tr->trace_buffer.data->ignore_pid, |
| 650 | ignore_this_task(pid_list, current)); | 603 | trace_ignore_this_task(pid_list, current)); |
| 651 | } | 604 | } |
| 652 | 605 | ||
| 653 | static void __ftrace_clear_event_pids(struct trace_array *tr) | 606 | static void __ftrace_clear_event_pids(struct trace_array *tr) |
| @@ -685,8 +638,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) | |||
| 685 | /* Wait till all users are no longer using pid filtering */ | 638 | /* Wait till all users are no longer using pid filtering */ |
| 686 | synchronize_sched(); | 639 | synchronize_sched(); |
| 687 | 640 | ||
| 688 | vfree(pid_list->pids); | 641 | trace_free_pid_list(pid_list); |
| 689 | kfree(pid_list); | ||
| 690 | } | 642 | } |
| 691 | 643 | ||
| 692 | static void ftrace_clear_event_pids(struct trace_array *tr) | 644 | static void ftrace_clear_event_pids(struct trace_array *tr) |
| @@ -1034,18 +986,8 @@ p_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 1034 | { | 986 | { |
| 1035 | struct trace_array *tr = m->private; | 987 | struct trace_array *tr = m->private; |
| 1036 | struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); | 988 | struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); |
| 1037 | unsigned long pid = (unsigned long)v; | ||
| 1038 | |||
| 1039 | (*pos)++; | ||
| 1040 | |||
| 1041 | /* pid already is +1 of the actual previous bit */ | ||

| 1042 | pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); | ||
| 1043 | 989 | ||
| 1044 | /* Return pid + 1 to allow zero to be represented */ | 990 | return trace_pid_next(pid_list, v, pos); |
| 1045 | if (pid < pid_list->pid_max) | ||
| 1046 | return (void *)(pid + 1); | ||
| 1047 | |||
| 1048 | return NULL; | ||
| 1049 | } | 991 | } |
| 1050 | 992 | ||
| 1051 | static void *p_start(struct seq_file *m, loff_t *pos) | 993 | static void *p_start(struct seq_file *m, loff_t *pos) |
| @@ -1053,8 +995,6 @@ static void *p_start(struct seq_file *m, loff_t *pos) | |||
| 1053 | { | 995 | { |
| 1054 | struct trace_pid_list *pid_list; | 996 | struct trace_pid_list *pid_list; |
| 1055 | struct trace_array *tr = m->private; | 997 | struct trace_array *tr = m->private; |
| 1056 | unsigned long pid; | ||
| 1057 | loff_t l = 0; | ||
| 1058 | 998 | ||
| 1059 | /* | 999 | /* |
| 1060 | * Grab the mutex, to keep calls to p_next() having the same | 1000 | * Grab the mutex, to keep calls to p_next() having the same |
| @@ -1070,15 +1010,7 @@ static void *p_start(struct seq_file *m, loff_t *pos) | |||
| 1070 | if (!pid_list) | 1010 | if (!pid_list) |
| 1071 | return NULL; | 1011 | return NULL; |
| 1072 | 1012 | ||
| 1073 | pid = find_first_bit(pid_list->pids, pid_list->pid_max); | 1013 | return trace_pid_start(pid_list, pos); |
| 1074 | if (pid >= pid_list->pid_max) | ||
| 1075 | return NULL; | ||
| 1076 | |||
| 1077 | /* Return pid + 1 so that zero can be the exit value */ | ||
| 1078 | for (pid++; pid && l < *pos; | ||
| 1079 | pid = (unsigned long)p_next(m, (void *)pid, &l)) | ||
| 1080 | ; | ||
| 1081 | return (void *)pid; | ||
| 1082 | } | 1014 | } |
| 1083 | 1015 | ||
| 1084 | static void p_stop(struct seq_file *m, void *p) | 1016 | static void p_stop(struct seq_file *m, void *p) |
| @@ -1088,14 +1020,6 @@ static void p_stop(struct seq_file *m, void *p) | |||
| 1088 | mutex_unlock(&event_mutex); | 1020 | mutex_unlock(&event_mutex); |
| 1089 | } | 1021 | } |
| 1090 | 1022 | ||
| 1091 | static int p_show(struct seq_file *m, void *v) | ||
| 1092 | { | ||
| 1093 | unsigned long pid = (unsigned long)v - 1; | ||
| 1094 | |||
| 1095 | seq_printf(m, "%lu\n", pid); | ||
| 1096 | return 0; | ||
| 1097 | } | ||
| 1098 | |||
| 1099 | static ssize_t | 1023 | static ssize_t |
| 1100 | event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | 1024 | event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, |
| 1101 | loff_t *ppos) | 1025 | loff_t *ppos) |
| @@ -1654,7 +1578,7 @@ static void ignore_task_cpu(void *data) | |||
| 1654 | mutex_is_locked(&event_mutex)); | 1578 | mutex_is_locked(&event_mutex)); |
| 1655 | 1579 | ||
| 1656 | this_cpu_write(tr->trace_buffer.data->ignore_pid, | 1580 | this_cpu_write(tr->trace_buffer.data->ignore_pid, |
| 1657 | ignore_this_task(pid_list, current)); | 1581 | trace_ignore_this_task(pid_list, current)); |
| 1658 | } | 1582 | } |
| 1659 | 1583 | ||
| 1660 | static ssize_t | 1584 | static ssize_t |
| @@ -1666,13 +1590,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, | |||
| 1666 | struct trace_pid_list *filtered_pids = NULL; | 1590 | struct trace_pid_list *filtered_pids = NULL; |
| 1667 | struct trace_pid_list *pid_list; | 1591 | struct trace_pid_list *pid_list; |
| 1668 | struct trace_event_file *file; | 1592 | struct trace_event_file *file; |
| 1669 | struct trace_parser parser; | 1593 | ssize_t ret; |
| 1670 | unsigned long val; | ||
| 1671 | loff_t this_pos; | ||
| 1672 | ssize_t read = 0; | ||
| 1673 | ssize_t ret = 0; | ||
| 1674 | pid_t pid; | ||
| 1675 | int nr_pids = 0; | ||
| 1676 | 1594 | ||
| 1677 | if (!cnt) | 1595 | if (!cnt) |
| 1678 | return 0; | 1596 | return 0; |
| @@ -1681,93 +1599,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, | |||
| 1681 | if (ret < 0) | 1599 | if (ret < 0) |
| 1682 | return ret; | 1600 | return ret; |
| 1683 | 1601 | ||
| 1684 | if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) | ||
| 1685 | return -ENOMEM; | ||
| 1686 | |||
| 1687 | mutex_lock(&event_mutex); | 1602 | mutex_lock(&event_mutex); |
| 1603 | |||
| 1688 | filtered_pids = rcu_dereference_protected(tr->filtered_pids, | 1604 | filtered_pids = rcu_dereference_protected(tr->filtered_pids, |
| 1689 | lockdep_is_held(&event_mutex)); | 1605 | lockdep_is_held(&event_mutex)); |
| 1690 | 1606 | ||
| 1691 | /* | 1607 | ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); |
| 1692 | * Always recreate a new array. The write is an all or nothing | 1608 | if (ret < 0) |
| 1693 | * operation. Always create a new array when adding new pids by | ||
| 1694 | * the user. If the operation fails, then the current list is | ||
| 1695 | * not modified. | ||
| 1696 | */ | ||
| 1697 | pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); | ||
| 1698 | if (!pid_list) { | ||
| 1699 | read = -ENOMEM; | ||
| 1700 | goto out; | ||
| 1701 | } | ||
| 1702 | pid_list->pid_max = READ_ONCE(pid_max); | ||
| 1703 | /* Only truncating will shrink pid_max */ | ||
| 1704 | if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) | ||
| 1705 | pid_list->pid_max = filtered_pids->pid_max; | ||
| 1706 | pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); | ||
| 1707 | if (!pid_list->pids) { | ||
| 1708 | kfree(pid_list); | ||
| 1709 | read = -ENOMEM; | ||
| 1710 | goto out; | ||
| 1711 | } | ||
| 1712 | if (filtered_pids) { | ||
| 1713 | /* copy the current bits to the new max */ | ||
| 1714 | pid = find_first_bit(filtered_pids->pids, | ||
| 1715 | filtered_pids->pid_max); | ||
| 1716 | while (pid < filtered_pids->pid_max) { | ||
| 1717 | set_bit(pid, pid_list->pids); | ||
| 1718 | pid = find_next_bit(filtered_pids->pids, | ||
| 1719 | filtered_pids->pid_max, | ||
| 1720 | pid + 1); | ||
| 1721 | nr_pids++; | ||
| 1722 | } | ||
| 1723 | } | ||
| 1724 | |||
| 1725 | while (cnt > 0) { | ||
| 1726 | |||
| 1727 | this_pos = 0; | ||
| 1728 | |||
| 1729 | ret = trace_get_user(&parser, ubuf, cnt, &this_pos); | ||
| 1730 | if (ret < 0 || !trace_parser_loaded(&parser)) | ||
| 1731 | break; | ||
| 1732 | |||
| 1733 | read += ret; | ||
| 1734 | ubuf += ret; | ||
| 1735 | cnt -= ret; | ||
| 1736 | |||
| 1737 | parser.buffer[parser.idx] = 0; | ||
| 1738 | |||
| 1739 | ret = -EINVAL; | ||
| 1740 | if (kstrtoul(parser.buffer, 0, &val)) | ||
| 1741 | break; | ||
| 1742 | if (val >= pid_list->pid_max) | ||
| 1743 | break; | ||
| 1744 | |||
| 1745 | pid = (pid_t)val; | ||
| 1746 | |||
| 1747 | set_bit(pid, pid_list->pids); | ||
| 1748 | nr_pids++; | ||
| 1749 | |||
| 1750 | trace_parser_clear(&parser); | ||
| 1751 | ret = 0; | ||
| 1752 | } | ||
| 1753 | trace_parser_put(&parser); | ||
| 1754 | |||
| 1755 | if (ret < 0) { | ||
| 1756 | vfree(pid_list->pids); | ||
| 1757 | kfree(pid_list); | ||
| 1758 | read = ret; | ||
| 1759 | goto out; | 1609 | goto out; |
| 1760 | } | ||
| 1761 | 1610 | ||
| 1762 | if (!nr_pids) { | ||
| 1763 | /* Cleared the list of pids */ | ||
| 1764 | vfree(pid_list->pids); | ||
| 1765 | kfree(pid_list); | ||
| 1766 | read = ret; | ||
| 1767 | if (!filtered_pids) | ||
| 1768 | goto out; | ||
| 1769 | pid_list = NULL; | ||
| 1770 | } | ||
| 1771 | rcu_assign_pointer(tr->filtered_pids, pid_list); | 1611 | rcu_assign_pointer(tr->filtered_pids, pid_list); |
| 1772 | 1612 | ||
| 1773 | list_for_each_entry(file, &tr->events, list) { | 1613 | list_for_each_entry(file, &tr->events, list) { |
| @@ -1776,10 +1616,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, | |||
| 1776 | 1616 | ||
| 1777 | if (filtered_pids) { | 1617 | if (filtered_pids) { |
| 1778 | synchronize_sched(); | 1618 | synchronize_sched(); |
| 1779 | 1619 | trace_free_pid_list(filtered_pids); | |
| 1780 | vfree(filtered_pids->pids); | 1620 | } else if (pid_list) { |
| 1781 | kfree(filtered_pids); | ||
| 1782 | } else { | ||
| 1783 | /* | 1621 | /* |
| 1784 | * Register a probe that is called before all other probes | 1622 | * Register a probe that is called before all other probes |
| 1785 | * to set ignore_pid if next or prev do not match. | 1623 | * to set ignore_pid if next or prev do not match. |
| @@ -1817,9 +1655,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, | |||
| 1817 | out: | 1655 | out: |
| 1818 | mutex_unlock(&event_mutex); | 1656 | mutex_unlock(&event_mutex); |
| 1819 | 1657 | ||
| 1820 | ret = read; | 1658 | if (ret > 0) |
| 1821 | if (read > 0) | 1659 | *ppos += ret; |
| 1822 | *ppos += read; | ||
| 1823 | 1660 | ||
| 1824 | return ret; | 1661 | return ret; |
| 1825 | } | 1662 | } |
| @@ -1846,7 +1683,7 @@ static const struct seq_operations show_set_event_seq_ops = { | |||
| 1846 | static const struct seq_operations show_set_pid_seq_ops = { | 1683 | static const struct seq_operations show_set_pid_seq_ops = { |
| 1847 | .start = p_start, | 1684 | .start = p_start, |
| 1848 | .next = p_next, | 1685 | .next = p_next, |
| 1849 | .show = p_show, | 1686 | .show = trace_pid_show, |
| 1850 | .stop = p_stop, | 1687 | .stop = p_stop, |
| 1851 | }; | 1688 | }; |
| 1852 | 1689 | ||
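With p_show() gone, show_set_pid_seq_ops plugs the shared trace_pid_show() straight into the seq_file machinery. The iterator convention these helpers keep is to hand seq_file the value "pid + 1", so a legitimate pid of 0 never collides with the NULL end-of-iteration marker, and the show callback subtracts one before printing. A small userspace-runnable sketch of that convention (a flat array scan stands in for find_next_bit()):

  #include <stdio.h>

  #define PID_MAX 16
  static const int filtered[PID_MAX] = { [0] = 1, [5] = 1, [9] = 1 };

  /* Returns (pid + 1) as an opaque cookie, or NULL when the scan is done,
   * mirroring the trace_pid_next() convention. */
  static void *pid_next(void *v)
  {
      unsigned long pid = (unsigned long)v;   /* already previous pid + 1 */

      for (; pid < PID_MAX; pid++)
          if (filtered[pid])
              return (void *)(pid + 1);
      return NULL;
  }

  int main(void)
  {
      /* Prints 0, 5, 9 - the same values a read of set_event_pid shows. */
      for (void *v = pid_next(NULL); v; v = pid_next(v))
          printf("%lu\n", (unsigned long)v - 1);
      return 0;
  }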
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5a095c2e4b69..0efa00d80623 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
| @@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr) | |||
| 43 | 43 | ||
| 44 | /* Currently only the non stack version is supported */ | 44 | /* Currently only the non stack version is supported */ |
| 45 | ops->func = function_trace_call; | 45 | ops->func = function_trace_call; |
| 46 | ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; | 46 | ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID; |
| 47 | 47 | ||
| 48 | tr->ops = ops; | 48 | tr->ops = ops; |
| 49 | ops->private = tr; | 49 | ops->private = tr; |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3a0244ff7ea8..7363ccf79512 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -319,7 +319,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
| 319 | int cpu; | 319 | int cpu; |
| 320 | int pc; | 320 | int pc; |
| 321 | 321 | ||
| 322 | if (!ftrace_trace_task(current)) | 322 | if (!ftrace_trace_task(tr)) |
| 323 | return 0; | 323 | return 0; |
| 324 | 324 | ||
| 325 | /* trace it when it is-nested-in or is a function enabled. */ | 325 | /* trace it when it is-nested-in or is a function enabled. */ |
| @@ -338,6 +338,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
| 338 | if (ftrace_graph_notrace_addr(trace->func)) | 338 | if (ftrace_graph_notrace_addr(trace->func)) |
| 339 | return 1; | 339 | return 1; |
| 340 | 340 | ||
| 341 | /* | ||
| 342 | * Stop here if tracing_threshold is set. We only write function return | ||
| 343 | * events to the ring buffer. | ||
| 344 | */ | ||
| 345 | if (tracing_thresh) | ||
| 346 | return 1; | ||
| 347 | |||
| 341 | local_irq_save(flags); | 348 | local_irq_save(flags); |
| 342 | cpu = raw_smp_processor_id(); | 349 | cpu = raw_smp_processor_id(); |
| 343 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); | 350 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); |
| @@ -355,14 +362,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
| 355 | return ret; | 362 | return ret; |
| 356 | } | 363 | } |
| 357 | 364 | ||
| 358 | static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) | ||
| 359 | { | ||
| 360 | if (tracing_thresh) | ||
| 361 | return 1; | ||
| 362 | else | ||
| 363 | return trace_graph_entry(trace); | ||
| 364 | } | ||
| 365 | |||
| 366 | static void | 365 | static void |
| 367 | __trace_graph_function(struct trace_array *tr, | 366 | __trace_graph_function(struct trace_array *tr, |
| 368 | unsigned long ip, unsigned long flags, int pc) | 367 | unsigned long ip, unsigned long flags, int pc) |
| @@ -457,7 +456,7 @@ static int graph_trace_init(struct trace_array *tr) | |||
| 457 | set_graph_array(tr); | 456 | set_graph_array(tr); |
| 458 | if (tracing_thresh) | 457 | if (tracing_thresh) |
| 459 | ret = register_ftrace_graph(&trace_graph_thresh_return, | 458 | ret = register_ftrace_graph(&trace_graph_thresh_return, |
| 460 | &trace_graph_thresh_entry); | 459 | &trace_graph_entry); |
| 461 | else | 460 | else |
| 462 | ret = register_ftrace_graph(&trace_graph_return, | 461 | ret = register_ftrace_graph(&trace_graph_return, |
| 463 | &trace_graph_entry); | 462 | &trace_graph_entry); |
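With trace_graph_thresh_entry() folded into trace_graph_entry(), the threshold mode simply skips recording entry events, and only function returns slower than tracing_thresh reach the ring buffer. A rough userspace sketch of driving that mode (the /sys/kernel/tracing mount point is an assumption; root privileges are required, and the threshold must be written before the tracer because graph_trace_init() checks tracing_thresh at registration time):

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
      int thresh = open("/sys/kernel/tracing/tracing_thresh", O_WRONLY);
      int tracer = open("/sys/kernel/tracing/current_tracer", O_WRONLY);

      if (thresh < 0 || tracer < 0) {
          perror("tracefs");
          return 1;
      }
      /* 100 us: only returns slower than this are written to the buffer. */
      dprintf(thresh, "100\n");
      /* Set after the threshold so the thresh-only handlers get registered. */
      dprintf(tracer, "function_graph\n");
      close(thresh);
      close(tracer);
      return 0;
  }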
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5546eec0505f..9aedb0b06683 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -587,6 +587,7 @@ static int create_trace_kprobe(int argc, char **argv) | |||
| 587 | * $retval : fetch return value | 587 | * $retval : fetch return value |
| 588 | * $stack : fetch stack address | 588 | * $stack : fetch stack address |
| 589 | * $stackN : fetch Nth of stack (N:0-) | 589 | * $stackN : fetch Nth of stack (N:0-) |
| 590 | * $comm : fetch current task comm | ||
| 590 | * @ADDR : fetch memory at ADDR (ADDR should be in kernel) | 591 | * @ADDR : fetch memory at ADDR (ADDR should be in kernel) |
| 591 | * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) | 592 | * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) |
| 592 | * %REG : fetch register REG | 593 | * %REG : fetch register REG |
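The new $comm fetch argument records the current task's comm at the probe site and, per the trace_probe.c hunk below, defaults to the string type. A rough userspace sketch of defining and enabling such a probe (the probed symbol, event name, and tracefs path are illustrative assumptions; CONFIG_KPROBE_EVENTS and root privileges are required):

  #include <stdio.h>

  static int echo(const char *path, const char *line)
  {
      FILE *f = fopen(path, "a");     /* append: don't clear existing probes */

      if (!f || fputs(line, f) == EOF) {
          perror(path);
          if (f)
              fclose(f);
          return -1;
      }
      return fclose(f);
  }

  int main(void)
  {
      /* Probe do_sys_open and record the opening task's comm. */
      if (echo("/sys/kernel/tracing/kprobe_events",
               "p:myopen do_sys_open task=$comm\n"))
          return 1;
      if (echo("/sys/kernel/tracing/events/kprobes/myopen/enable", "1\n"))
          return 1;
      puts("probe enabled; hits appear in /sys/kernel/tracing/trace");
      return 0;
  }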
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 68f376ca6d3f..cd7480d0a201 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c | |||
| @@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) | |||
| 68 | trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", | 68 | trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", |
| 69 | dev->bus->number, dev->devfn, | 69 | dev->bus->number, dev->devfn, |
| 70 | dev->vendor, dev->device, dev->irq); | 70 | dev->vendor, dev->device, dev->irq); |
| 71 | /* | ||
| 72 | * XXX: is pci_resource_to_user() appropriate, since we are | ||
| 73 | * supposed to interpret the __ioremap() phys_addr argument based on | ||
| 74 | * these printed values? | ||
| 75 | */ | ||
| 76 | for (i = 0; i < 7; i++) { | 71 | for (i = 0; i < 7; i++) { |
| 77 | pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); | 72 | start = dev->resource[i].start; |
| 78 | trace_seq_printf(s, " %llx", | 73 | trace_seq_printf(s, " %llx", |
| 79 | (unsigned long long)(start | | 74 | (unsigned long long)(start | |
| 80 | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); | 75 | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); |
| 81 | } | 76 | } |
| 82 | for (i = 0; i < 7; i++) { | 77 | for (i = 0; i < 7; i++) { |
| 83 | pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); | 78 | start = dev->resource[i].start; |
| 79 | end = dev->resource[i].end; | ||
| 84 | trace_seq_printf(s, " %llx", | 80 | trace_seq_printf(s, " %llx", |
| 85 | dev->resource[i].start < dev->resource[i].end ? | 81 | dev->resource[i].start < dev->resource[i].end ? |
| 86 | (unsigned long long)(end - start) + 1 : 0); | 82 | (unsigned long long)(end - start) + 1 : 0); |
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1d372fa6fefb..74e80a582c28 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
| @@ -218,6 +218,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) | |||
| 218 | kfree(data); | 218 | kfree(data); |
| 219 | } | 219 | } |
| 220 | 220 | ||
| 221 | void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs, | ||
| 222 | void *data, void *dest) | ||
| 223 | { | ||
| 224 | int maxlen = get_rloc_len(*(u32 *)dest); | ||
| 225 | u8 *dst = get_rloc_data(dest); | ||
| 226 | long ret; | ||
| 227 | |||
| 228 | if (!maxlen) | ||
| 229 | return; | ||
| 230 | |||
| 231 | ret = strlcpy(dst, current->comm, maxlen); | ||
| 232 | *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); | ||
| 233 | } | ||
| 234 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string)); | ||
| 235 | |||
| 236 | void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs, | ||
| 237 | void *data, void *dest) | ||
| 238 | { | ||
| 239 | *(u32 *)dest = strlen(current->comm) + 1; | ||
| 240 | } | ||
| 241 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size)); | ||
| 242 | |||
| 221 | static const struct fetch_type *find_fetch_type(const char *type, | 243 | static const struct fetch_type *find_fetch_type(const char *type, |
| 222 | const struct fetch_type *ftbl) | 244 | const struct fetch_type *ftbl) |
| 223 | { | 245 | { |
| @@ -348,6 +370,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
| 348 | } | 370 | } |
| 349 | } else | 371 | } else |
| 350 | ret = -EINVAL; | 372 | ret = -EINVAL; |
| 373 | } else if (strcmp(arg, "comm") == 0) { | ||
| 374 | if (strcmp(t->name, "string") != 0 && | ||
| 375 | strcmp(t->name, "string_size") != 0) | ||
| 376 | return -EINVAL; | ||
| 377 | f->fn = t->fetch[FETCH_MTD_comm]; | ||
| 351 | } else | 378 | } else |
| 352 | ret = -EINVAL; | 379 | ret = -EINVAL; |
| 353 | 380 | ||
| @@ -522,6 +549,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | |||
| 522 | arg[t - parg->comm] = '\0'; | 549 | arg[t - parg->comm] = '\0'; |
| 523 | t++; | 550 | t++; |
| 524 | } | 551 | } |
| 552 | /* | ||
| 553 | * The default type of $comm should be "string", and it can't be | ||
| 554 | * dereferenced. | ||
| 555 | */ | ||
| 556 | if (!t && strcmp(arg, "$comm") == 0) | ||
| 557 | t = "string"; | ||
| 525 | parg->type = find_fetch_type(t, ftbl); | 558 | parg->type = find_fetch_type(t, ftbl); |
| 526 | if (!parg->type) { | 559 | if (!parg->type) { |
| 527 | pr_info("Unsupported type: %s\n", t); | 560 | pr_info("Unsupported type: %s\n", t); |
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index f6398db09114..45400ca5ded1 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
| @@ -102,6 +102,7 @@ enum { | |||
| 102 | FETCH_MTD_reg = 0, | 102 | FETCH_MTD_reg = 0, |
| 103 | FETCH_MTD_stack, | 103 | FETCH_MTD_stack, |
| 104 | FETCH_MTD_retval, | 104 | FETCH_MTD_retval, |
| 105 | FETCH_MTD_comm, | ||
| 105 | FETCH_MTD_memory, | 106 | FETCH_MTD_memory, |
| 106 | FETCH_MTD_symbol, | 107 | FETCH_MTD_symbol, |
| 107 | FETCH_MTD_deref, | 108 | FETCH_MTD_deref, |
| @@ -183,6 +184,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield); | |||
| 183 | #define fetch_bitfield_string NULL | 184 | #define fetch_bitfield_string NULL |
| 184 | #define fetch_bitfield_string_size NULL | 185 | #define fetch_bitfield_string_size NULL |
| 185 | 186 | ||
| 187 | /* comm only makes sense as a string */ | ||
| 188 | #define fetch_comm_u8 NULL | ||
| 189 | #define fetch_comm_u16 NULL | ||
| 190 | #define fetch_comm_u32 NULL | ||
| 191 | #define fetch_comm_u64 NULL | ||
| 192 | DECLARE_FETCH_FUNC(comm, string); | ||
| 193 | DECLARE_FETCH_FUNC(comm, string_size); | ||
| 194 | |||
| 186 | /* | 195 | /* |
| 187 | * Define macro for basic types - we don't need to define s* types, because | 196 | * Define macro for basic types - we don't need to define s* types, because |
| 188 | * we have to care only about bitwidth at recording time. | 197 | * we have to care only about bitwidth at recording time. |
| @@ -213,6 +222,7 @@ DEFINE_FETCH_##method(u64) | |||
| 213 | ASSIGN_FETCH_FUNC(reg, ftype), \ | 222 | ASSIGN_FETCH_FUNC(reg, ftype), \ |
| 214 | ASSIGN_FETCH_FUNC(stack, ftype), \ | 223 | ASSIGN_FETCH_FUNC(stack, ftype), \ |
| 215 | ASSIGN_FETCH_FUNC(retval, ftype), \ | 224 | ASSIGN_FETCH_FUNC(retval, ftype), \ |
| 225 | ASSIGN_FETCH_FUNC(comm, ftype), \ | ||
| 216 | ASSIGN_FETCH_FUNC(memory, ftype), \ | 226 | ASSIGN_FETCH_FUNC(memory, ftype), \ |
| 217 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | 227 | ASSIGN_FETCH_FUNC(symbol, ftype), \ |
| 218 | ASSIGN_FETCH_FUNC(deref, ftype), \ | 228 | ASSIGN_FETCH_FUNC(deref, ftype), \ |
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9bafc211930c..68f594212759 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
| @@ -938,6 +938,20 @@ bool userns_may_setgroups(const struct user_namespace *ns) | |||
| 938 | return allowed; | 938 | return allowed; |
| 939 | } | 939 | } |
| 940 | 940 | ||
| 941 | /* | ||
| 942 | * Returns true if @ns is the same namespace as or a descendant of | ||
| 943 | * @target_ns. | ||
| 944 | */ | ||
| 945 | bool current_in_userns(const struct user_namespace *target_ns) | ||
| 946 | { | ||
| 947 | struct user_namespace *ns; | ||
| 948 | for (ns = current_user_ns(); ns; ns = ns->parent) { | ||
| 949 | if (ns == target_ns) | ||
| 950 | return true; | ||
| 951 | } | ||
| 952 | return false; | ||
| 953 | } | ||
| 954 | |||
| 941 | static inline struct user_namespace *to_user_ns(struct ns_common *ns) | 955 | static inline struct user_namespace *to_user_ns(struct ns_common *ns) |
| 942 | { | 956 | { |
| 943 | return container_of(ns, struct user_namespace, ns); | 957 | return container_of(ns, struct user_namespace, ns); |
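current_in_userns() walks from the caller's user namespace up the parent chain and reports whether it reaches @target_ns, i.e. whether the caller is in that namespace or one nested below it. A userspace-runnable analogue of the walk (the struct here is a toy stand-in, not struct user_namespace):

  #include <stdbool.h>
  #include <stdio.h>

  /* Toy namespace: only the parent link matters for this check. */
  struct ns { struct ns *parent; };

  /* Analogue of current_in_userns(): true when cur is target itself or
   * is nested anywhere below it (target is an ancestor of cur). */
  static bool in_userns(const struct ns *cur, const struct ns *target)
  {
      for (const struct ns *ns = cur; ns; ns = ns->parent)
          if (ns == target)
              return true;
      return false;
  }

  int main(void)
  {
      struct ns init_ns    = { .parent = NULL };
      struct ns child      = { .parent = &init_ns };
      struct ns grandchild = { .parent = &child };

      printf("%d %d %d\n",
             in_userns(&grandchild, &init_ns),  /* 1: descendant of init */
             in_userns(&child, &grandchild),    /* 0: not below grandchild */
             in_userns(&init_ns, &init_ns));    /* 1: same namespace */
      return 0;
  }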
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 97e7b793df35..ef071ca73fc3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -4369,8 +4369,8 @@ static void show_pwq(struct pool_workqueue *pwq) | |||
| 4369 | /** | 4369 | /** |
| 4370 | * show_workqueue_state - dump workqueue state | 4370 | * show_workqueue_state - dump workqueue state |
| 4371 | * | 4371 | * |
| 4372 | * Called from a sysrq handler and prints out all busy workqueues and | 4372 | * Called from a sysrq handler or try_to_freeze_tasks() and prints out |
| 4373 | * pools. | 4373 | * all busy workqueues and pools. |
| 4374 | */ | 4374 | */ |
| 4375 | void show_workqueue_state(void) | 4375 | void show_workqueue_state(void) |
| 4376 | { | 4376 | { |
| @@ -4607,84 +4607,65 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) | |||
| 4607 | WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); | 4607 | WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); |
| 4608 | } | 4608 | } |
| 4609 | 4609 | ||
| 4610 | /* | 4610 | int workqueue_prepare_cpu(unsigned int cpu) |
| 4611 | * Workqueues should be brought up before normal priority CPU notifiers. | 4611 | { |
| 4612 | * This will be registered high priority CPU notifier. | 4612 | struct worker_pool *pool; |
| 4613 | */ | 4613 | |
| 4614 | static int workqueue_cpu_up_callback(struct notifier_block *nfb, | 4614 | for_each_cpu_worker_pool(pool, cpu) { |
| 4615 | unsigned long action, | 4615 | if (pool->nr_workers) |
| 4616 | void *hcpu) | 4616 | continue; |
| 4617 | if (!create_worker(pool)) | ||
| 4618 | return -ENOMEM; | ||
| 4619 | } | ||
| 4620 | return 0; | ||
| 4621 | } | ||
| 4622 | |||
| 4623 | int workqueue_online_cpu(unsigned int cpu) | ||
| 4617 | { | 4624 | { |
| 4618 | int cpu = (unsigned long)hcpu; | ||
| 4619 | struct worker_pool *pool; | 4625 | struct worker_pool *pool; |
| 4620 | struct workqueue_struct *wq; | 4626 | struct workqueue_struct *wq; |
| 4621 | int pi; | 4627 | int pi; |
| 4622 | 4628 | ||
| 4623 | switch (action & ~CPU_TASKS_FROZEN) { | 4629 | mutex_lock(&wq_pool_mutex); |
| 4624 | case CPU_UP_PREPARE: | ||
| 4625 | for_each_cpu_worker_pool(pool, cpu) { | ||
| 4626 | if (pool->nr_workers) | ||
| 4627 | continue; | ||
| 4628 | if (!create_worker(pool)) | ||
| 4629 | return NOTIFY_BAD; | ||
| 4630 | } | ||
| 4631 | break; | ||
| 4632 | |||
| 4633 | case CPU_DOWN_FAILED: | ||
| 4634 | case CPU_ONLINE: | ||
| 4635 | mutex_lock(&wq_pool_mutex); | ||
| 4636 | 4630 | ||
| 4637 | for_each_pool(pool, pi) { | 4631 | for_each_pool(pool, pi) { |
| 4638 | mutex_lock(&pool->attach_mutex); | 4632 | mutex_lock(&pool->attach_mutex); |
| 4639 | 4633 | ||
| 4640 | if (pool->cpu == cpu) | 4634 | if (pool->cpu == cpu) |
| 4641 | rebind_workers(pool); | 4635 | rebind_workers(pool); |
| 4642 | else if (pool->cpu < 0) | 4636 | else if (pool->cpu < 0) |
| 4643 | restore_unbound_workers_cpumask(pool, cpu); | 4637 | restore_unbound_workers_cpumask(pool, cpu); |
| 4644 | 4638 | ||
| 4645 | mutex_unlock(&pool->attach_mutex); | 4639 | mutex_unlock(&pool->attach_mutex); |
| 4646 | } | 4640 | } |
| 4647 | 4641 | ||
| 4648 | /* update NUMA affinity of unbound workqueues */ | 4642 | /* update NUMA affinity of unbound workqueues */ |
| 4649 | list_for_each_entry(wq, &workqueues, list) | 4643 | list_for_each_entry(wq, &workqueues, list) |
| 4650 | wq_update_unbound_numa(wq, cpu, true); | 4644 | wq_update_unbound_numa(wq, cpu, true); |
| 4651 | 4645 | ||
| 4652 | mutex_unlock(&wq_pool_mutex); | 4646 | mutex_unlock(&wq_pool_mutex); |
| 4653 | break; | 4647 | return 0; |
| 4654 | } | ||
| 4655 | return NOTIFY_OK; | ||
| 4656 | } | 4648 | } |
| 4657 | 4649 | ||
| 4658 | /* | 4650 | int workqueue_offline_cpu(unsigned int cpu) |
| 4659 | * Workqueues should be brought down after normal priority CPU notifiers. | ||
| 4660 | * This will be registered as low priority CPU notifier. | ||
| 4661 | */ | ||
| 4662 | static int workqueue_cpu_down_callback(struct notifier_block *nfb, | ||
| 4663 | unsigned long action, | ||
| 4664 | void *hcpu) | ||
| 4665 | { | 4651 | { |
| 4666 | int cpu = (unsigned long)hcpu; | ||
| 4667 | struct work_struct unbind_work; | 4652 | struct work_struct unbind_work; |
| 4668 | struct workqueue_struct *wq; | 4653 | struct workqueue_struct *wq; |
| 4669 | 4654 | ||
| 4670 | switch (action & ~CPU_TASKS_FROZEN) { | 4655 | /* unbinding per-cpu workers should happen on the local CPU */ |
| 4671 | case CPU_DOWN_PREPARE: | 4656 | INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); |
| 4672 | /* unbinding per-cpu workers should happen on the local CPU */ | 4657 | queue_work_on(cpu, system_highpri_wq, &unbind_work); |
| 4673 | INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); | 4658 | |
| 4674 | queue_work_on(cpu, system_highpri_wq, &unbind_work); | 4659 | /* update NUMA affinity of unbound workqueues */ |
| 4675 | 4660 | mutex_lock(&wq_pool_mutex); | |
| 4676 | /* update NUMA affinity of unbound workqueues */ | 4661 | list_for_each_entry(wq, &workqueues, list) |
| 4677 | mutex_lock(&wq_pool_mutex); | 4662 | wq_update_unbound_numa(wq, cpu, false); |
| 4678 | list_for_each_entry(wq, &workqueues, list) | 4663 | mutex_unlock(&wq_pool_mutex); |
| 4679 | wq_update_unbound_numa(wq, cpu, false); | 4664 | |
| 4680 | mutex_unlock(&wq_pool_mutex); | 4665 | /* wait for per-cpu unbinding to finish */ |
| 4681 | 4666 | flush_work(&unbind_work); | |
| 4682 | /* wait for per-cpu unbinding to finish */ | 4667 | destroy_work_on_stack(&unbind_work); |
| 4683 | flush_work(&unbind_work); | 4668 | return 0; |
| 4684 | destroy_work_on_stack(&unbind_work); | ||
| 4685 | break; | ||
| 4686 | } | ||
| 4687 | return NOTIFY_OK; | ||
| 4688 | } | 4669 | } |
| 4689 | 4670 | ||
| 4690 | #ifdef CONFIG_SMP | 4671 | #ifdef CONFIG_SMP |
| @@ -5486,9 +5467,6 @@ static int __init init_workqueues(void) | |||
| 5486 | 5467 | ||
| 5487 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); | 5468 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); |
| 5488 | 5469 | ||
| 5489 | cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); | ||
| 5490 | hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); | ||
| 5491 | |||
| 5492 | wq_numa_init(); | 5470 | wq_numa_init(); |
| 5493 | 5471 | ||
| 5494 | /* initialize CPU pools */ | 5472 | /* initialize CPU pools */ |
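With the conversion above, init_workqueues() no longer registers any CPU notifiers: workqueue_prepare_cpu(), workqueue_online_cpu() and workqueue_offline_cpu() are invoked directly by the CPU-hotplug state machine (this series wires them into the hotplug core's built-in state table). For comparison, a hedged sketch of how an ordinary subsystem would attach equivalent callbacks through the same framework; the dynamic state, the name string, and the header providing the callback declarations are illustrative assumptions, not what workqueue itself does:

  #include <linux/cpuhotplug.h>
  #include <linux/init.h>
  #include <linux/workqueue.h>

  /* Sketch only: register an online/offline pair the way a driver would.
   * The prepare-stage callback (workqueue_prepare_cpu) would instead hang
   * off a state in the PREPARE section of the hotplug state table. */
  static int __init example_hotplug_init(void)
  {
      int ret;

      ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
                              workqueue_online_cpu,    /* CPU came up */
                              workqueue_offline_cpu);  /* CPU going down */
      return ret < 0 ? ret : 0;
  }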
